mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-26 11:40:26 +00:00
@@ -6,6 +6,7 @@
|
||||
### Fixed
|
||||
|
||||
- When the session index is missing, WebUI now starts a background rebuild while preserving the first sidebar full-scan result, so the index is primed for later requests without temporarily hiding existing sessions.
|
||||
- Live token-usage hints now cap the cumulative in-flight tool-result prompt estimate per assistant turn, preventing many large tool callbacks from temporarily inflating the context ring before exact provider accounting arrives.
|
||||
|
||||
## [v0.51.135] — 2026-05-25 — Release DG (stage-batch17 — 9-PR small-fix batch)
|
||||
|
||||
|
||||
+28
-7
@@ -2851,6 +2851,7 @@ _TOOL_RESULT_SNIPPET_MAX = 4000
|
||||
|
||||
|
||||
_LIVE_TOOL_PROMPT_DELTA_MAX = 12_000
|
||||
_LIVE_TOOL_PROMPT_TURN_MAX = 24_000
|
||||
|
||||
|
||||
def _bounded_live_tool_prompt_delta(messages, *, cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX) -> int:
|
||||
@@ -2878,19 +2879,32 @@ def live_usage_prompt_estimate_after_tool_delta(
|
||||
exact_prompt_tokens: int = 0,
|
||||
messages=None,
|
||||
cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX,
|
||||
turn_tool_prompt_tokens: int = 0,
|
||||
turn_cap: int = _LIVE_TOOL_PROMPT_TURN_MAX,
|
||||
) -> dict:
|
||||
"""Compute the live `last_prompt_tokens` estimate after a tool update.
|
||||
|
||||
Exact compressor/provider prompt accounting wins. When no newer exact prompt
|
||||
is available, add only a bounded live tool delta to the persisted base.
|
||||
is available, add only bounded live tool deltas to the persisted base.
|
||||
"""
|
||||
base = int(base_prompt_tokens or 0)
|
||||
exact = int(exact_prompt_tokens or 0)
|
||||
if exact and exact != base:
|
||||
return {'last_prompt_tokens': exact, 'estimated': False}
|
||||
return {
|
||||
'last_prompt_tokens': exact,
|
||||
'estimated': False,
|
||||
'turn_tool_prompt_tokens': 0,
|
||||
}
|
||||
prior_turn_delta = max(0, int(turn_tool_prompt_tokens or 0))
|
||||
turn_ceiling = max(0, int(turn_cap or 0))
|
||||
next_turn_delta = min(
|
||||
prior_turn_delta + _bounded_live_tool_prompt_delta(messages, cap=cap),
|
||||
turn_ceiling,
|
||||
)
|
||||
return {
|
||||
'last_prompt_tokens': base + _bounded_live_tool_prompt_delta(messages, cap=cap),
|
||||
'last_prompt_tokens': base + next_turn_delta,
|
||||
'estimated': True,
|
||||
'turn_tool_prompt_tokens': next_turn_delta,
|
||||
}
|
||||
|
||||
|
||||
@@ -3396,6 +3410,7 @@ def _run_agent_streaming(
|
||||
agent = None
|
||||
_live_prompt_estimate_tokens = [0]
|
||||
_live_prompt_exact_tokens = [0]
|
||||
_live_prompt_estimate_tool_delta_tokens = [0]
|
||||
_live_prompt_estimate_seen_ids = set()
|
||||
|
||||
def _seed_live_prompt_estimate() -> int:
|
||||
@@ -3425,10 +3440,15 @@ def _run_agent_streaming(
|
||||
"""Increment a rough next-prompt estimate from live tool activity."""
|
||||
if not messages:
|
||||
return _live_prompt_estimate_tokens[0]
|
||||
_delta = _bounded_live_tool_prompt_delta(messages)
|
||||
if _delta > 0:
|
||||
_seed_live_prompt_estimate()
|
||||
_live_prompt_estimate_tokens[0] += _delta
|
||||
_seed_live_prompt_estimate()
|
||||
_usage = live_usage_prompt_estimate_after_tool_delta(
|
||||
base_prompt_tokens=_live_prompt_exact_tokens[0],
|
||||
exact_prompt_tokens=_live_prompt_exact_tokens[0],
|
||||
messages=messages,
|
||||
turn_tool_prompt_tokens=_live_prompt_estimate_tool_delta_tokens[0],
|
||||
)
|
||||
_live_prompt_estimate_tokens[0] = _usage['last_prompt_tokens']
|
||||
_live_prompt_estimate_tool_delta_tokens[0] = _usage['turn_tool_prompt_tokens']
|
||||
return _live_prompt_estimate_tokens[0]
|
||||
|
||||
def _live_usage_snapshot():
|
||||
@@ -3490,6 +3510,7 @@ def _run_agent_streaming(
|
||||
if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]:
|
||||
_live_prompt_exact_tokens[0] = _real_prompt_tokens
|
||||
_live_prompt_estimate_tokens[0] = _real_prompt_tokens
|
||||
_live_prompt_estimate_tool_delta_tokens[0] = 0
|
||||
elif _live_prompt_estimate_tokens[0] > _real_prompt_tokens:
|
||||
_usage['last_prompt_tokens'] = _live_prompt_estimate_tokens[0]
|
||||
|
||||
|
||||
@@ -20,3 +20,31 @@ def test_live_usage_estimate_preserves_real_prompt_when_exact_prompt_advances():
|
||||
)
|
||||
|
||||
assert usage["last_prompt_tokens"] == 136_000
|
||||
assert usage["turn_tool_prompt_tokens"] == 0
|
||||
|
||||
|
||||
def test_live_usage_estimate_caps_cumulative_tool_delta_per_turn():
    """Repeated tool updates in one turn must stop growing the live estimate.

    The per-update helper ``streaming._bounded_live_tool_prompt_delta`` is
    temporarily swapped for a stub that always reports its ``cap`` argument,
    so every call contributes the largest single-update delta. The estimator
    is then invoked 20 times, feeding each returned
    ``turn_tool_prompt_tokens`` back into the next call, and the final result
    is expected to sit at 24_000 tokens above the base rather than at the
    uncapped 20 * 12_000.
    """
    from api import streaming

    def _always_at_cap(messages, cap=12_000):
        # Stand-in for the real delta helper: ignore the messages and report
        # the full per-update cap every time.
        return int(cap or 0)

    starting_prompt = 86_723
    running_turn_delta = 0
    usage = None

    saved_helper = streaming._bounded_live_tool_prompt_delta
    streaming._bounded_live_tool_prompt_delta = _always_at_cap
    try:
        for _ in range(20):
            oversized_tool_result = [{"role": "tool", "content": "x" * 80_000}]
            usage = live_usage_prompt_estimate_after_tool_delta(
                base_prompt_tokens=starting_prompt,
                exact_prompt_tokens=starting_prompt,
                messages=oversized_tool_result,
                turn_tool_prompt_tokens=running_turn_delta,
            )
            # Carry the accumulated per-turn delta into the next update.
            running_turn_delta = usage["turn_tool_prompt_tokens"]
    finally:
        # Always restore the real helper so other tests see the original.
        streaming._bounded_live_tool_prompt_delta = saved_helper

    assert usage is not None
    assert usage["turn_tool_prompt_tokens"] == 24_000
    assert usage["last_prompt_tokens"] == starting_prompt + 24_000
    assert usage["last_prompt_tokens"] < starting_prompt + (20 * 12_000)
|
||||
|
||||
Reference in New Issue
Block a user