mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-25 11:10:18 +00:00
Stage 386: PR #2579
This commit is contained in:
@@ -19,6 +19,7 @@ from api.config import (
|
||||
get_effective_default_model, _get_session_agent_lock,
|
||||
)
|
||||
from api.workspace import get_last_workspace
|
||||
from api.usage import prompt_cache_hit_percent
|
||||
from api.agent_sessions import (
|
||||
_is_continuation_session,
|
||||
read_importable_agent_session_rows,
|
||||
@@ -634,6 +635,7 @@ class Session:
|
||||
'estimated_cost': self.estimated_cost,
|
||||
'cache_read_tokens': self.cache_read_tokens,
|
||||
'cache_write_tokens': self.cache_write_tokens,
|
||||
'cache_hit_percent': prompt_cache_hit_percent(self.cache_read_tokens, self.input_tokens),
|
||||
'personality': self.personality,
|
||||
'compression_anchor_visible_idx': self.compression_anchor_visible_idx,
|
||||
'compression_anchor_message_key': self.compression_anchor_message_key,
|
||||
|
||||
@@ -39,6 +39,7 @@ from api.compression_anchor import visible_messages_for_anchor
|
||||
from api.metering import meter
|
||||
from api.run_journal import RunJournalWriter
|
||||
from api.turn_journal import append_turn_journal_event_for_stream
|
||||
from api.usage import prompt_cache_hit_percent
|
||||
|
||||
# Global lock for os.environ writes. Per-session locks (_agent_lock) prevent
|
||||
# concurrent runs of the SAME session, but two DIFFERENT sessions can still
|
||||
@@ -2988,6 +2989,7 @@ def _run_agent_streaming(
|
||||
'estimated_cost': 0,
|
||||
'cache_read_tokens': 0,
|
||||
'cache_write_tokens': 0,
|
||||
'cache_hit_percent': None,
|
||||
'context_length': 0,
|
||||
'threshold_tokens': 0,
|
||||
'last_prompt_tokens': 0,
|
||||
@@ -3025,6 +3027,10 @@ def _run_agent_streaming(
|
||||
pass
|
||||
|
||||
_real_prompt_tokens = int(_usage.get('last_prompt_tokens') or 0)
|
||||
_usage['cache_hit_percent'] = prompt_cache_hit_percent(
|
||||
_usage.get('cache_read_tokens') or 0,
|
||||
_usage.get('input_tokens') or 0,
|
||||
)
|
||||
if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]:
|
||||
_live_prompt_exact_tokens[0] = _real_prompt_tokens
|
||||
_live_prompt_estimate_tokens[0] = _real_prompt_tokens
|
||||
@@ -4474,6 +4480,15 @@ def _run_agent_streaming(
|
||||
estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
|
||||
cache_read_tokens = getattr(agent, 'session_cache_read_tokens', 0) or 0
|
||||
cache_write_tokens = getattr(agent, 'session_cache_write_tokens', 0) or 0
|
||||
prev_input_tokens = getattr(s, 'input_tokens', 0) or 0
|
||||
prev_cache_read_tokens = getattr(s, 'cache_read_tokens', 0) or 0
|
||||
turn_input_tokens = max(0, input_tokens - prev_input_tokens)
|
||||
turn_cache_read_tokens = max(0, cache_read_tokens - prev_cache_read_tokens)
|
||||
# Per-turn percent is computed server-side from persisted session
|
||||
# counters so the message label uses the same denominator as the
|
||||
# final usage payload even if the browser missed an intermediate event.
|
||||
cache_hit_percent = prompt_cache_hit_percent(cache_read_tokens, input_tokens)
|
||||
turn_cache_hit_percent = prompt_cache_hit_percent(turn_cache_read_tokens, turn_input_tokens)
|
||||
if input_tokens > 0:
|
||||
s.input_tokens = input_tokens
|
||||
if output_tokens > 0:
|
||||
@@ -4730,6 +4745,8 @@ def _run_agent_streaming(
|
||||
'estimated_cost': estimated_cost,
|
||||
'cache_read_tokens': cache_read_tokens,
|
||||
'cache_write_tokens': cache_write_tokens,
|
||||
'cache_hit_percent': cache_hit_percent,
|
||||
'turn_cache_hit_percent': turn_cache_hit_percent,
|
||||
'duration_seconds': round(_turn_duration_seconds, 3),
|
||||
}
|
||||
if _turn_tps is not None:
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
"""Usage metric helpers for WebUI display payloads.
|
||||
|
||||
Prompt-cache hit percentage is cached prompt reads over the full prompt total
|
||||
(input + cache reads + cache writes). Keep this calculation in the backend so
|
||||
browser display code cannot drift across context indicator and per-turn labels.
|
||||
"""
|
||||
|
||||
|
||||
def _to_int(value) -> int:
    """Coerce ``value`` to an ``int``, falling back to 0.

    Falsy inputs (``None``, ``0``, ``""``) become 0. Any value that ``int()``
    rejects also becomes 0, so callers building display payloads never have
    to guard against malformed counters.
    """
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types; ValueError: NaN and non-numeric
        # strings; OverflowError: int(float("inf")) / int(float("-inf")).
        return 0
|
||||
|
||||
|
||||
def prompt_cache_hit_percent(cache_read_tokens, prompt_tokens):
    """Express cache reads as a whole-number percent of the prompt total.

    ``prompt_tokens`` is expected to be the full prompt-token total, i.e.
    ordinary input plus cache reads plus cache writes (the same quantity as
    Agent's ``session_prompt_tokens``).

    Returns ``None`` when either counter coerces to a non-positive integer;
    otherwise the rounded percentage, capped at 100.
    """
    reads = _to_int(cache_read_tokens)
    total = _to_int(prompt_tokens)
    if reads > 0 and total > 0:
        percent = round((reads / total) * 100)
        # Cap at 100 in case reads exceed the reported prompt total.
        return percent if percent < 100 else 100
    return None
|
||||
@@ -215,6 +215,8 @@ const LOCALES = {
|
||||
focus_label: 'Focus',
|
||||
token_usage_on: 'Token usage on',
|
||||
token_usage_off: 'Token usage off',
|
||||
usage_cache_hit_detail: 'Cache: {0}% hit ({1} read / {2} write)',
|
||||
usage_cached_percent: '{0}% cached',
|
||||
theme_usage: 'Usage: /theme ',
|
||||
theme_set: 'Theme: ',
|
||||
no_active_session: 'No active session',
|
||||
@@ -1434,6 +1436,8 @@ const LOCALES = {
|
||||
focus_label: 'Focus',
|
||||
token_usage_on: 'Uso token attivo',
|
||||
token_usage_off: 'Uso token disattivo',
|
||||
usage_cache_hit_detail: 'Cache: {0}% in cache ({1} letti / {2} scritti)',
|
||||
usage_cached_percent: '{0}% in cache',
|
||||
theme_usage: 'Uso: /theme ',
|
||||
theme_set: 'Tema: ',
|
||||
no_active_session: 'Nessuna sessione attiva',
|
||||
@@ -2645,6 +2649,8 @@ const LOCALES = {
|
||||
focus_label: 'フォーカス',
|
||||
token_usage_on: 'トークン使用量: ON',
|
||||
token_usage_off: 'トークン使用量: OFF',
|
||||
usage_cache_hit_detail: 'キャッシュ: {0}% ヒット(読み取り {1} / 書き込み {2})',
|
||||
usage_cached_percent: '{0}% キャッシュ済み',
|
||||
theme_usage: '使い方: /theme ',
|
||||
theme_set: 'テーマ: ',
|
||||
no_active_session: 'アクティブなセッションがありません',
|
||||
@@ -3817,6 +3823,8 @@ const LOCALES = {
|
||||
token_usage_on: 'Отображение токенов включено',
|
||||
usage_personality_none: 'none', // TODO: translate
|
||||
token_usage_off: 'Отображение токенов выключено',
|
||||
usage_cache_hit_detail: 'Кэш: {0}% попаданий ({1} чтение / {2} запись)',
|
||||
usage_cached_percent: '{0}% из кэша',
|
||||
theme_usage: 'Использование: /theme ',
|
||||
theme_set: 'Тема: ',
|
||||
no_active_session: 'Нет активной сессии',
|
||||
@@ -5004,6 +5012,8 @@ const LOCALES = {
|
||||
token_usage_on: 'Uso de tokens activado',
|
||||
usage_personality_none: 'none', // TODO: translate
|
||||
token_usage_off: 'Uso de tokens desactivado',
|
||||
usage_cache_hit_detail: 'Caché: {0}% de acierto ({1} lectura / {2} escritura)',
|
||||
usage_cached_percent: '{0}% en caché',
|
||||
theme_usage: 'Uso: /theme ',
|
||||
theme_set: 'Tema: ',
|
||||
no_active_session: 'No hay ninguna sesión activa',
|
||||
@@ -6128,6 +6138,8 @@ const LOCALES = {
|
||||
token_usage_on: 'Token-Verbrauch an',
|
||||
usage_personality_none: 'none', // TODO: translate
|
||||
token_usage_off: 'Token-Verbrauch aus',
|
||||
usage_cache_hit_detail: 'Cache: {0}% Treffer ({1} gelesen / {2} geschrieben)',
|
||||
usage_cached_percent: '{0}% im Cache',
|
||||
theme_usage: 'Nutzung: /theme ',
|
||||
theme_set: 'Theme: ',
|
||||
no_active_session: 'Keine aktive Sitzung',
|
||||
@@ -7303,6 +7315,8 @@ const LOCALES = {
|
||||
token_usage_on: 'Token 用量显示已开启',
|
||||
usage_personality_none: '无',
|
||||
token_usage_off: 'Token 用量显示已关闭',
|
||||
usage_cache_hit_detail: '缓存:{0}% 命中(读取 {1} / 写入 {2})',
|
||||
usage_cached_percent: '{0}% 已缓存',
|
||||
theme_usage: '用法:/theme ',
|
||||
theme_set: '主题:',
|
||||
no_active_session: '当前没有活动会话',
|
||||
@@ -8414,6 +8428,8 @@ const LOCALES = {
|
||||
focus_label: '\u4e3b\u984c',
|
||||
token_usage_on: 'Token \u7528\u91cf\u986f\u793a\u5df2\u958b\u555f',
|
||||
token_usage_off: 'Token \u7528\u91cf\u986f\u793a\u5df2\u95dc\u9589',
|
||||
usage_cache_hit_detail: '快取:{0}% 命中(讀取 {1} / 寫入 {2})',
|
||||
usage_cached_percent: '{0}% 已快取',
|
||||
theme_usage: '\u7528\u6cd5\uff1a/theme ',
|
||||
theme_set: '\u4e3b\u984c\uff1a',
|
||||
no_active_session: '\u7576\u524d\u6c92\u6709\u6d3b\u52d5\u6703\u8a71',
|
||||
@@ -9617,6 +9633,8 @@ const LOCALES = {
|
||||
focus_label: 'Foco',
|
||||
token_usage_on: 'Uso de tokens ligado',
|
||||
token_usage_off: 'Uso de tokens desligado',
|
||||
usage_cache_hit_detail: 'Cache: {0}% de acerto ({1} leitura / {2} escrita)',
|
||||
usage_cached_percent: '{0}% em cache',
|
||||
theme_usage: 'Uso: /theme ',
|
||||
theme_set: 'Tema: ',
|
||||
no_active_session: 'Nenhuma sessão ativa',
|
||||
@@ -10716,6 +10734,8 @@ const LOCALES = {
|
||||
focus_label: 'Focus',
|
||||
token_usage_on: 'Token usage on',
|
||||
token_usage_off: 'Token usage off',
|
||||
usage_cache_hit_detail: '캐시: {0}% 적중({1} 읽기 / {2} 쓰기)',
|
||||
usage_cached_percent: '{0}% 캐시됨',
|
||||
theme_usage: 'Usage: /theme ',
|
||||
theme_set: 'Theme: ',
|
||||
no_active_session: '활성 세션 없음',
|
||||
@@ -11919,6 +11939,8 @@ const LOCALES = {
|
||||
focus_label: 'Se concentrer',
|
||||
token_usage_on: 'Utilisation du jeton sur',
|
||||
token_usage_off: 'Utilisation des jetons désactivée',
|
||||
usage_cache_hit_detail: 'Cache : {0}% de réussite ({1} lecture / {2} écriture)',
|
||||
usage_cached_percent: '{0}% en cache',
|
||||
theme_usage: 'Utilisation : /theme ',
|
||||
theme_set: 'Thème:',
|
||||
no_active_session: 'Aucune session active',
|
||||
|
||||
@@ -1681,6 +1681,7 @@ function attachLiveStream(activeSid, streamId, uploaded=[], options={}){
|
||||
estimated_cost:Math.max(0,curCost-prevCost),
|
||||
cache_read_tokens:Math.max(0,curCacheRead-_prevCacheRead),
|
||||
cache_write_tokens:Math.max(0,curCacheWrite-_prevCacheWrite),
|
||||
cache_hit_percent:d.usage.turn_cache_hit_percent,
|
||||
};
|
||||
}
|
||||
if(typeof d.usage.duration_seconds==='number'){
|
||||
|
||||
@@ -500,6 +500,9 @@ async function newSession(flash, options={}){
|
||||
input_tokens:data.session.input_tokens||0,
|
||||
output_tokens:data.session.output_tokens||0,
|
||||
estimated_cost:data.session.estimated_cost||0,
|
||||
cache_read_tokens:data.session.cache_read_tokens||0,
|
||||
cache_write_tokens:data.session.cache_write_tokens||0,
|
||||
cache_hit_percent:data.session.cache_hit_percent,
|
||||
context_length:data.session.context_length||0,
|
||||
last_prompt_tokens:data.session.last_prompt_tokens||0,
|
||||
threshold_tokens:data.session.threshold_tokens||0,
|
||||
@@ -768,6 +771,9 @@ async function loadSession(sid){
|
||||
input_tokens: _pick(u.input_tokens, _s.input_tokens),
|
||||
output_tokens: _pick(u.output_tokens, _s.output_tokens),
|
||||
estimated_cost: _pick(u.estimated_cost, _s.estimated_cost),
|
||||
cache_read_tokens: _pick(u.cache_read_tokens, _s.cache_read_tokens),
|
||||
cache_write_tokens:_pick(u.cache_write_tokens,_s.cache_write_tokens),
|
||||
cache_hit_percent: _pick(u.cache_hit_percent, _s.cache_hit_percent, null),
|
||||
context_length: _pick(_s.context_length, u.context_length),
|
||||
last_prompt_tokens:_pick(u.last_prompt_tokens,_s.last_prompt_tokens),
|
||||
threshold_tokens: _pick(_s.threshold_tokens, u.threshold_tokens),
|
||||
@@ -1176,6 +1182,9 @@ function _resolveSessionModelForDisplaySoon(sid){
|
||||
input_tokens:_pick(u.input_tokens,S.session.input_tokens),
|
||||
output_tokens:_pick(u.output_tokens,S.session.output_tokens),
|
||||
estimated_cost:_pick(u.estimated_cost,S.session.estimated_cost),
|
||||
cache_read_tokens:_pick(u.cache_read_tokens,S.session.cache_read_tokens),
|
||||
cache_write_tokens:_pick(u.cache_write_tokens,S.session.cache_write_tokens),
|
||||
cache_hit_percent:_pick(u.cache_hit_percent,S.session.cache_hit_percent,null),
|
||||
context_length:data.session.context_length||0,
|
||||
last_prompt_tokens:_pick(u.last_prompt_tokens,S.session.last_prompt_tokens),
|
||||
threshold_tokens:data.session.threshold_tokens||0,
|
||||
|
||||
+4
-7
@@ -2262,9 +2262,8 @@ function _syncCtxIndicator(usage){
|
||||
const compressText=pct>=75?t('ctx_compress_action'):(pct>=50?t('ctx_compress_hint'):'');
|
||||
if(compressWrap) compressWrap.style.display=compressText?'':'none';
|
||||
_setCtxCompressButton(compressBtn,compressText);
|
||||
const cacheTotalTok=cacheReadTok+cacheWriteTok;
|
||||
const cacheHitPct=cacheTotalTok?Math.round((cacheReadTok/cacheTotalTok)*100):null;
|
||||
const cacheText=cacheTotalTok?`cache: ${cacheHitPct}% hit (${_fmtTokens(cacheReadTok)} read / ${_fmtTokens(cacheWriteTok)} write)`:'';
|
||||
const cacheHitPct=usage.cache_hit_percent;
|
||||
const cacheText=cacheHitPct!=null?t('usage_cache_hit_detail',cacheHitPct,_fmtTokens(cacheReadTok),_fmtTokens(cacheWriteTok)):'';
|
||||
let label=hasPromptTok?`Context window ${pct}% used`:`${_fmtTokens(totalTok)} tokens used`;
|
||||
if(!hasExplicitCtx&&hasPromptTok) label+=' (est. 128K)';
|
||||
if(cost) label+=` \u00b7 $${cost<0.01?cost.toFixed(4):cost.toFixed(2)}`;
|
||||
@@ -6210,12 +6209,10 @@ function renderMessages(options){
|
||||
const inTok=msg._turnUsage.input_tokens||0;
|
||||
const outTok=msg._turnUsage.output_tokens||0;
|
||||
const cost=msg._turnUsage.estimated_cost;
|
||||
const cacheRead=msg._turnUsage.cache_read_tokens||0;
|
||||
const cacheWrite=msg._turnUsage.cache_write_tokens||0;
|
||||
let text=`${_fmtTokens(inTok)} in · ${_fmtTokens(outTok)} out`;
|
||||
if(cost) text+=` · ~$${cost<0.01?cost.toFixed(4):cost.toFixed(2)}`;
|
||||
const cacheTotal=cacheRead+cacheWrite;
|
||||
if(cacheTotal) text+=` · cache ${Math.round((cacheRead/cacheTotal)*100)}% hit`;
|
||||
const cacheHitPct=msg._turnUsage.cache_hit_percent;
|
||||
if(cacheHitPct!=null) text+=` · ${t('usage_cached_percent',cacheHitPct)}`;
|
||||
usage.textContent=text;
|
||||
fragments.push(usage);
|
||||
}
|
||||
|
||||
@@ -3,23 +3,34 @@ from pathlib import Path
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def test_webui_backend_prompt_cache_hit_percent_uses_prompt_total_denominator():
    """Check the helper's percent, its ``None`` cases, and the 100 cap."""
    from api.usage import prompt_cache_hit_percent

    # (cache_read_tokens, prompt_tokens, expected result)
    cases = (
        (100_000, 125_000, 80),
        (0, 125_000, None),
        (100, 0, None),
        (None, None, None),
        (200, 100, 100),
    )
    for cache_read, prompt_total, expected in cases:
        result = prompt_cache_hit_percent(cache_read, prompt_total)
        if expected is None:
            assert result is None
        else:
            assert result == expected
|
||||
|
||||
|
||||
def test_session_compact_exposes_prompt_cache_counters():
|
||||
from api.models import Session
|
||||
|
||||
session = Session(
|
||||
session_id="issue2419_cache_usage",
|
||||
workspace="/tmp",
|
||||
input_tokens=120_000,
|
||||
input_tokens=125_000,
|
||||
output_tokens=5_000,
|
||||
estimated_cost=0.44,
|
||||
cache_read_tokens=100_000,
|
||||
cache_write_tokens=20_000,
|
||||
cache_write_tokens=5_000,
|
||||
)
|
||||
|
||||
compact = session.compact()
|
||||
|
||||
assert compact["cache_read_tokens"] == 100_000
|
||||
assert compact["cache_write_tokens"] == 20_000
|
||||
assert compact["cache_write_tokens"] == 5_000
|
||||
assert compact["cache_hit_percent"] == 80
|
||||
|
||||
|
||||
def test_streaming_usage_payload_includes_prompt_cache_counters():
|
||||
@@ -27,8 +38,9 @@ def test_streaming_usage_payload_includes_prompt_cache_counters():
|
||||
|
||||
assert "session_cache_read_tokens" in src
|
||||
assert "session_cache_write_tokens" in src
|
||||
assert "'cache_read_tokens': cache_read_tokens" in src
|
||||
assert "'cache_write_tokens': cache_write_tokens" in src
|
||||
assert "prompt_cache_hit_percent(" in src
|
||||
assert "'cache_hit_percent':" in src
|
||||
assert "'turn_cache_hit_percent':" in src
|
||||
|
||||
|
||||
def test_context_indicator_surfaces_cache_hit_rate():
|
||||
@@ -36,9 +48,25 @@ def test_context_indicator_surfaces_cache_hit_rate():
|
||||
|
||||
assert "cacheReadTok=usage.cache_read_tokens||0" in src
|
||||
assert "cacheWriteTok=usage.cache_write_tokens||0" in src
|
||||
assert "cache: ${cacheHitPct}% hit" in src
|
||||
assert "cacheHitPct=usage.cache_hit_percent" in src
|
||||
assert "t('usage_cache_hit_detail',cacheHitPct" in src
|
||||
assert "Estimated cost: $${cost<0.01?cost.toFixed(4):cost.toFixed(2)}" in src
|
||||
assert "cache ${Math.round((cacheRead/cacheTotal)*100)}% hit" in src
|
||||
assert "cacheHitPct=msg._turnUsage.cache_hit_percent" in src
|
||||
assert "t('usage_cached_percent',cacheHitPct)" in src
|
||||
assert "cacheHitPct!=null" in src
|
||||
assert "cacheReadTok/cacheTotalTok" not in src
|
||||
assert "cacheRead/cacheTotal" not in src
|
||||
assert "cacheReadTok/promptTok" not in src
|
||||
assert "cacheRead/cacheDenom" not in src
|
||||
|
||||
|
||||
def test_cache_usage_labels_are_localized():
    """Check that i18n.js defines both cache usage labels 11 times each.

    Also pins the exact English strings for the two labels.
    """
    # i18n.js contains non-ASCII literals (CJK, Cyrillic); decode explicitly
    # so the test does not depend on the platform's default locale encoding.
    src = (ROOT / "static" / "i18n.js").read_text(encoding="utf-8")

    assert src.count("usage_cache_hit_detail:") == 11
    assert src.count("usage_cached_percent:") == 11
    assert "usage_cache_hit_detail: 'Cache: {0}% hit ({1} read / {2} write)'" in src
    assert "usage_cached_percent: '{0}% cached'" in src
|
||||
|
||||
|
||||
def test_done_handler_preserves_per_turn_cache_deltas():
|
||||
@@ -48,3 +76,4 @@ def test_done_handler_preserves_per_turn_cache_deltas():
|
||||
assert "curCacheRead=d.usage.cache_read_tokens||0" in src
|
||||
assert "cache_read_tokens:Math.max(0,curCacheRead-_prevCacheRead)" in src
|
||||
assert "cache_write_tokens:Math.max(0,curCacheWrite-_prevCacheWrite)" in src
|
||||
assert "cache_hit_percent:d.usage.turn_cache_hit_percent" in src
|
||||
|
||||
Reference in New Issue
Block a user