diff --git a/api/models.py b/api/models.py index 0518b227..0307562e 100644 --- a/api/models.py +++ b/api/models.py @@ -19,6 +19,7 @@ from api.config import ( get_effective_default_model, _get_session_agent_lock, ) from api.workspace import get_last_workspace +from api.usage import prompt_cache_hit_percent from api.agent_sessions import ( _is_continuation_session, read_importable_agent_session_rows, @@ -634,6 +635,7 @@ class Session: 'estimated_cost': self.estimated_cost, 'cache_read_tokens': self.cache_read_tokens, 'cache_write_tokens': self.cache_write_tokens, + 'cache_hit_percent': prompt_cache_hit_percent(self.cache_read_tokens, self.input_tokens), 'personality': self.personality, 'compression_anchor_visible_idx': self.compression_anchor_visible_idx, 'compression_anchor_message_key': self.compression_anchor_message_key, diff --git a/api/streaming.py b/api/streaming.py index 5998bdbc..4b6c1dc5 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -39,6 +39,7 @@ from api.compression_anchor import visible_messages_for_anchor from api.metering import meter from api.run_journal import RunJournalWriter from api.turn_journal import append_turn_journal_event_for_stream +from api.usage import prompt_cache_hit_percent # Global lock for os.environ writes. 
Per-session locks (_agent_lock) prevent # concurrent runs of the SAME session, but two DIFFERENT sessions can still @@ -2988,6 +2989,7 @@ def _run_agent_streaming( 'estimated_cost': 0, 'cache_read_tokens': 0, 'cache_write_tokens': 0, + 'cache_hit_percent': None, 'context_length': 0, 'threshold_tokens': 0, 'last_prompt_tokens': 0, @@ -3025,6 +3027,10 @@ def _run_agent_streaming( pass _real_prompt_tokens = int(_usage.get('last_prompt_tokens') or 0) + _usage['cache_hit_percent'] = prompt_cache_hit_percent( + _usage.get('cache_read_tokens') or 0, + _usage.get('input_tokens') or 0, + ) if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]: _live_prompt_exact_tokens[0] = _real_prompt_tokens _live_prompt_estimate_tokens[0] = _real_prompt_tokens @@ -4474,6 +4480,15 @@ def _run_agent_streaming( estimated_cost = getattr(agent, 'session_estimated_cost_usd', None) cache_read_tokens = getattr(agent, 'session_cache_read_tokens', 0) or 0 cache_write_tokens = getattr(agent, 'session_cache_write_tokens', 0) or 0 + prev_input_tokens = getattr(s, 'input_tokens', 0) or 0 + prev_cache_read_tokens = getattr(s, 'cache_read_tokens', 0) or 0 + turn_input_tokens = max(0, input_tokens - prev_input_tokens) + turn_cache_read_tokens = max(0, cache_read_tokens - prev_cache_read_tokens) + # Per-turn percent is computed server-side from persisted session + # counters so the message label uses the same denominator as the + # final usage payload even if the browser missed an intermediate event. 
"""Usage metric helpers for WebUI display payloads.

Prompt-cache hit percentage is cached prompt reads over the full prompt total
(input + cache reads + cache writes). Keep this calculation in the backend so
browser display code cannot drift across context indicator and per-turn labels.
"""

from typing import Optional


def _to_int(value) -> int:
    """Coerce *value* to ``int``, falling back to 0 when it is unusable.

    Falsy values (``None``, ``0``, ``""``) become 0. Anything ``int()``
    rejects also becomes 0 so that a malformed counter can never break
    construction of a display payload:

    * ``TypeError``     - unsupported types such as ``object()`` or a list
    * ``ValueError``    - non-numeric strings and ``float('nan')``
    * ``OverflowError`` - ``float('inf')`` / ``float('-inf')``
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0


def prompt_cache_hit_percent(cache_read_tokens, prompt_tokens) -> Optional[int]:
    """Return cached reads as a percent of full prompt-token total.

    ``prompt_tokens`` must include ordinary input, cache reads, and cache writes
    (matching Agent's ``session_prompt_tokens`` value).

    Args:
        cache_read_tokens: Number of prompt tokens served from the cache.
            Any value ``_to_int`` cannot coerce is treated as 0.
        prompt_tokens: Full prompt-token total used as the denominator.
            Any value ``_to_int`` cannot coerce is treated as 0.

    Returns:
        An integer in ``0..100``, or ``None`` when either counter is missing,
        zero, or negative (i.e. there is nothing meaningful to display).
    """
    cache_read = _to_int(cache_read_tokens)
    prompt = _to_int(prompt_tokens)
    if cache_read <= 0 or prompt <= 0:
        return None
    # Clamp to 100: counters that disagree (reads reported larger than the
    # prompt total) must not surface as an impossible ">100% cached" label.
    # NOTE: round() rounds exact halves to the nearest even integer.
    return min(100, round((cache_read / prompt) * 100))
'{0}% en caché', theme_usage: 'Uso: /theme ', theme_set: 'Tema: ', no_active_session: 'No hay ninguna sesión activa', @@ -6128,6 +6138,8 @@ const LOCALES = { token_usage_on: 'Token-Verbrauch an', usage_personality_none: 'none', // TODO: translate token_usage_off: 'Token-Verbrauch aus', + usage_cache_hit_detail: 'Cache: {0}% Treffer ({1} gelesen / {2} geschrieben)', + usage_cached_percent: '{0}% im Cache', theme_usage: 'Nutzung: /theme ', theme_set: 'Theme: ', no_active_session: 'Keine aktive Sitzung', @@ -7303,6 +7315,8 @@ const LOCALES = { token_usage_on: 'Token 用量显示已开启', usage_personality_none: '无', token_usage_off: 'Token 用量显示已关闭', + usage_cache_hit_detail: '缓存:{0}% 命中(读取 {1} / 写入 {2})', + usage_cached_percent: '{0}% 已缓存', theme_usage: '用法:/theme ', theme_set: '主题:', no_active_session: '当前没有活动会话', @@ -8414,6 +8428,8 @@ const LOCALES = { focus_label: '\u4e3b\u984c', token_usage_on: 'Token \u7528\u91cf\u986f\u793a\u5df2\u958b\u555f', token_usage_off: 'Token \u7528\u91cf\u986f\u793a\u5df2\u95dc\u9589', + usage_cache_hit_detail: '快取:{0}% 命中(讀取 {1} / 寫入 {2})', + usage_cached_percent: '{0}% 已快取', theme_usage: '\u7528\u6cd5\uff1a/theme ', theme_set: '\u4e3b\u984c\uff1a', no_active_session: '\u7576\u524d\u6c92\u6709\u6d3b\u52d5\u6703\u8a71', @@ -9617,6 +9633,8 @@ const LOCALES = { focus_label: 'Foco', token_usage_on: 'Uso de tokens ligado', token_usage_off: 'Uso de tokens desligado', + usage_cache_hit_detail: 'Cache: {0}% de acerto ({1} leitura / {2} escrita)', + usage_cached_percent: '{0}% em cache', theme_usage: 'Uso: /theme ', theme_set: 'Tema: ', no_active_session: 'Nenhuma sessão ativa', @@ -10716,6 +10734,8 @@ const LOCALES = { focus_label: 'Focus', token_usage_on: 'Token usage on', token_usage_off: 'Token usage off', + usage_cache_hit_detail: '캐시: {0}% 적중({1} 읽기 / {2} 쓰기)', + usage_cached_percent: '{0}% 캐시됨', theme_usage: 'Usage: /theme ', theme_set: 'Theme: ', no_active_session: '활성 세션 없음', @@ -11919,6 +11939,8 @@ const LOCALES = { focus_label: 'Se concentrer', 
token_usage_on: 'Utilisation du jeton sur', token_usage_off: 'Utilisation des jetons désactivée', + usage_cache_hit_detail: 'Cache : {0}% de réussite ({1} lecture / {2} écriture)', + usage_cached_percent: '{0}% en cache', theme_usage: 'Utilisation : /theme ', theme_set: 'Thème:', no_active_session: 'Aucune session active', diff --git a/static/messages.js b/static/messages.js index 8969fe44..e24f5d67 100644 --- a/static/messages.js +++ b/static/messages.js @@ -1681,6 +1681,7 @@ function attachLiveStream(activeSid, streamId, uploaded=[], options={}){ estimated_cost:Math.max(0,curCost-prevCost), cache_read_tokens:Math.max(0,curCacheRead-_prevCacheRead), cache_write_tokens:Math.max(0,curCacheWrite-_prevCacheWrite), + cache_hit_percent:d.usage.turn_cache_hit_percent, }; } if(typeof d.usage.duration_seconds==='number'){ diff --git a/static/sessions.js b/static/sessions.js index 613a0c77..6be342bb 100644 --- a/static/sessions.js +++ b/static/sessions.js @@ -500,6 +500,9 @@ async function newSession(flash, options={}){ input_tokens:data.session.input_tokens||0, output_tokens:data.session.output_tokens||0, estimated_cost:data.session.estimated_cost||0, + cache_read_tokens:data.session.cache_read_tokens||0, + cache_write_tokens:data.session.cache_write_tokens||0, + cache_hit_percent:data.session.cache_hit_percent, context_length:data.session.context_length||0, last_prompt_tokens:data.session.last_prompt_tokens||0, threshold_tokens:data.session.threshold_tokens||0, @@ -768,6 +771,9 @@ async function loadSession(sid){ input_tokens: _pick(u.input_tokens, _s.input_tokens), output_tokens: _pick(u.output_tokens, _s.output_tokens), estimated_cost: _pick(u.estimated_cost, _s.estimated_cost), + cache_read_tokens: _pick(u.cache_read_tokens, _s.cache_read_tokens), + cache_write_tokens:_pick(u.cache_write_tokens,_s.cache_write_tokens), + cache_hit_percent: _pick(u.cache_hit_percent, _s.cache_hit_percent, null), context_length: _pick(_s.context_length, u.context_length), 
last_prompt_tokens:_pick(u.last_prompt_tokens,_s.last_prompt_tokens), threshold_tokens: _pick(_s.threshold_tokens, u.threshold_tokens), @@ -1176,6 +1182,9 @@ function _resolveSessionModelForDisplaySoon(sid){ input_tokens:_pick(u.input_tokens,S.session.input_tokens), output_tokens:_pick(u.output_tokens,S.session.output_tokens), estimated_cost:_pick(u.estimated_cost,S.session.estimated_cost), + cache_read_tokens:_pick(u.cache_read_tokens,S.session.cache_read_tokens), + cache_write_tokens:_pick(u.cache_write_tokens,S.session.cache_write_tokens), + cache_hit_percent:_pick(u.cache_hit_percent,S.session.cache_hit_percent,null), context_length:data.session.context_length||0, last_prompt_tokens:_pick(u.last_prompt_tokens,S.session.last_prompt_tokens), threshold_tokens:data.session.threshold_tokens||0, diff --git a/static/ui.js b/static/ui.js index 154ed641..dbe53196 100644 --- a/static/ui.js +++ b/static/ui.js @@ -2262,9 +2262,8 @@ function _syncCtxIndicator(usage){ const compressText=pct>=75?t('ctx_compress_action'):(pct>=50?t('ctx_compress_hint'):''); if(compressWrap) compressWrap.style.display=compressText?'':'none'; _setCtxCompressButton(compressBtn,compressText); - const cacheTotalTok=cacheReadTok+cacheWriteTok; - const cacheHitPct=cacheTotalTok?Math.round((cacheReadTok/cacheTotalTok)*100):null; - const cacheText=cacheTotalTok?`cache: ${cacheHitPct}% hit (${_fmtTokens(cacheReadTok)} read / ${_fmtTokens(cacheWriteTok)} write)`:''; + const cacheHitPct=usage.cache_hit_percent; + const cacheText=cacheHitPct!=null?t('usage_cache_hit_detail',cacheHitPct,_fmtTokens(cacheReadTok),_fmtTokens(cacheWriteTok)):''; let label=hasPromptTok?`Context window ${pct}% used`:`${_fmtTokens(totalTok)} tokens used`; if(!hasExplicitCtx&&hasPromptTok) label+=' (est. 
128K)'; if(cost) label+=` \u00b7 $${cost<0.01?cost.toFixed(4):cost.toFixed(2)}`; @@ -6210,12 +6209,10 @@ function renderMessages(options){ const inTok=msg._turnUsage.input_tokens||0; const outTok=msg._turnUsage.output_tokens||0; const cost=msg._turnUsage.estimated_cost; - const cacheRead=msg._turnUsage.cache_read_tokens||0; - const cacheWrite=msg._turnUsage.cache_write_tokens||0; let text=`${_fmtTokens(inTok)} in · ${_fmtTokens(outTok)} out`; if(cost) text+=` · ~$${cost<0.01?cost.toFixed(4):cost.toFixed(2)}`; - const cacheTotal=cacheRead+cacheWrite; - if(cacheTotal) text+=` · cache ${Math.round((cacheRead/cacheTotal)*100)}% hit`; + const cacheHitPct=msg._turnUsage.cache_hit_percent; + if(cacheHitPct!=null) text+=` · ${t('usage_cached_percent',cacheHitPct)}`; usage.textContent=text; fragments.push(usage); } diff --git a/tests/test_issue2419_cache_usage_display.py b/tests/test_issue2419_cache_usage_display.py index 7d5804a8..e6cf94e2 100644 --- a/tests/test_issue2419_cache_usage_display.py +++ b/tests/test_issue2419_cache_usage_display.py @@ -3,23 +3,34 @@ from pathlib import Path ROOT = Path(__file__).resolve().parents[1] +def test_webui_backend_prompt_cache_hit_percent_uses_prompt_total_denominator(): + from api.usage import prompt_cache_hit_percent + + assert prompt_cache_hit_percent(100_000, 125_000) == 80 + assert prompt_cache_hit_percent(0, 125_000) is None + assert prompt_cache_hit_percent(100, 0) is None + assert prompt_cache_hit_percent(None, None) is None + assert prompt_cache_hit_percent(200, 100) == 100 + + def test_session_compact_exposes_prompt_cache_counters(): from api.models import Session session = Session( session_id="issue2419_cache_usage", workspace="/tmp", - input_tokens=120_000, + input_tokens=125_000, output_tokens=5_000, estimated_cost=0.44, cache_read_tokens=100_000, - cache_write_tokens=20_000, + cache_write_tokens=5_000, ) compact = session.compact() assert compact["cache_read_tokens"] == 100_000 - assert compact["cache_write_tokens"] == 
20_000 + assert compact["cache_write_tokens"] == 5_000 + assert compact["cache_hit_percent"] == 80 def test_streaming_usage_payload_includes_prompt_cache_counters(): @@ -27,8 +38,9 @@ def test_streaming_usage_payload_includes_prompt_cache_counters(): assert "session_cache_read_tokens" in src assert "session_cache_write_tokens" in src - assert "'cache_read_tokens': cache_read_tokens" in src - assert "'cache_write_tokens': cache_write_tokens" in src + assert "prompt_cache_hit_percent(" in src + assert "'cache_hit_percent':" in src + assert "'turn_cache_hit_percent':" in src def test_context_indicator_surfaces_cache_hit_rate(): @@ -36,9 +48,25 @@ def test_context_indicator_surfaces_cache_hit_rate(): assert "cacheReadTok=usage.cache_read_tokens||0" in src assert "cacheWriteTok=usage.cache_write_tokens||0" in src - assert "cache: ${cacheHitPct}% hit" in src + assert "cacheHitPct=usage.cache_hit_percent" in src + assert "t('usage_cache_hit_detail',cacheHitPct" in src assert "Estimated cost: $${cost<0.01?cost.toFixed(4):cost.toFixed(2)}" in src - assert "cache ${Math.round((cacheRead/cacheTotal)*100)}% hit" in src + assert "cacheHitPct=msg._turnUsage.cache_hit_percent" in src + assert "t('usage_cached_percent',cacheHitPct)" in src + assert "cacheHitPct!=null" in src + assert "cacheReadTok/cacheTotalTok" not in src + assert "cacheRead/cacheTotal" not in src + assert "cacheReadTok/promptTok" not in src + assert "cacheRead/cacheDenom" not in src + + +def test_cache_usage_labels_are_localized(): + src = (ROOT / "static" / "i18n.js").read_text() + + assert src.count("usage_cache_hit_detail:") == 11 + assert src.count("usage_cached_percent:") == 11 + assert "usage_cache_hit_detail: 'Cache: {0}% hit ({1} read / {2} write)'" in src + assert "usage_cached_percent: '{0}% cached'" in src def test_done_handler_preserves_per_turn_cache_deltas(): @@ -48,3 +76,4 @@ def test_done_handler_preserves_per_turn_cache_deltas(): assert "curCacheRead=d.usage.cache_read_tokens||0" in src assert 
"cache_read_tokens:Math.max(0,curCacheRead-_prevCacheRead)" in src assert "cache_write_tokens:Math.max(0,curCacheWrite-_prevCacheWrite)" in src + assert "cache_hit_percent:d.usage.turn_cache_hit_percent" in src