Merge pull request #2918

# Conflicts:
#	CHANGELOG.md
2026-05-26 11:40:26 +00:00 · 2026-05-25 17:17:31 +00:00
parent 6eaddada93 89d8c3a94c
commit 0be9874549
3 changed files with 57 additions and 7 deletions
@@ -6,6 +6,7 @@
 ### Fixed

 - When the session index is missing, WebUI now starts a background rebuild while preserving the first sidebar full-scan result, so the index is primed for later requests without temporarily hiding existing sessions.
+- Live token-usage hints now cap the cumulative in-flight tool-result prompt estimate per assistant turn, preventing many large tool callbacks from temporarily inflating the context ring before exact provider accounting arrives.

 ## [v0.51.135] — 2026-05-25 — Release DG (stage-batch17 — 9-PR small-fix batch)

@@ -2851,6 +2851,7 @@ _TOOL_RESULT_SNIPPET_MAX = 4000


 _LIVE_TOOL_PROMPT_DELTA_MAX = 12_000
+_LIVE_TOOL_PROMPT_TURN_MAX = 24_000


 def _bounded_live_tool_prompt_delta(messages, *, cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX) -> int:
@@ -2878,19 +2879,32 @@ def live_usage_prompt_estimate_after_tool_delta(
    exact_prompt_tokens: int = 0,
    messages=None,
    cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX,
+    turn_tool_prompt_tokens: int = 0,
+    turn_cap: int = _LIVE_TOOL_PROMPT_TURN_MAX,
 ) -> dict:
    """Compute the live `last_prompt_tokens` estimate after a tool update.

    Exact compressor/provider prompt accounting wins. When no newer exact prompt
-    is available, add only a bounded live tool delta to the persisted base.
+    is available, add only bounded live tool deltas to the persisted base.
    """
    base = int(base_prompt_tokens or 0)
    exact = int(exact_prompt_tokens or 0)
    if exact and exact != base:
-        return {'last_prompt_tokens': exact, 'estimated': False}
+        return {
+            'last_prompt_tokens': exact,
+            'estimated': False,
+            'turn_tool_prompt_tokens': 0,
+        }
+    prior_turn_delta = max(0, int(turn_tool_prompt_tokens or 0))
+    turn_ceiling = max(0, int(turn_cap or 0))
+    next_turn_delta = min(
+        prior_turn_delta + _bounded_live_tool_prompt_delta(messages, cap=cap),
+        turn_ceiling,
+    )
    return {
-        'last_prompt_tokens': base + _bounded_live_tool_prompt_delta(messages, cap=cap),
+        'last_prompt_tokens': base + next_turn_delta,
        'estimated': True,
+        'turn_tool_prompt_tokens': next_turn_delta,
    }


@@ -3396,6 +3410,7 @@ def _run_agent_streaming(
    agent = None
    _live_prompt_estimate_tokens = [0]
    _live_prompt_exact_tokens = [0]
+    _live_prompt_estimate_tool_delta_tokens = [0]
    _live_prompt_estimate_seen_ids = set()

    def _seed_live_prompt_estimate() -> int:
@@ -3425,10 +3440,15 @@ def _run_agent_streaming(
        """Increment a rough next-prompt estimate from live tool activity."""
        if not messages:
            return _live_prompt_estimate_tokens[0]
-        _delta = _bounded_live_tool_prompt_delta(messages)
-        if _delta > 0:
-            _seed_live_prompt_estimate()
-            _live_prompt_estimate_tokens[0] += _delta
+        _seed_live_prompt_estimate()
+        _usage = live_usage_prompt_estimate_after_tool_delta(
+            base_prompt_tokens=_live_prompt_exact_tokens[0],
+            exact_prompt_tokens=_live_prompt_exact_tokens[0],
+            messages=messages,
+            turn_tool_prompt_tokens=_live_prompt_estimate_tool_delta_tokens[0],
+        )
+        _live_prompt_estimate_tokens[0] = _usage['last_prompt_tokens']
+        _live_prompt_estimate_tool_delta_tokens[0] = _usage['turn_tool_prompt_tokens']
        return _live_prompt_estimate_tokens[0]

    def _live_usage_snapshot():
@@ -3490,6 +3510,7 @@ def _run_agent_streaming(
        if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]:
            _live_prompt_exact_tokens[0] = _real_prompt_tokens
            _live_prompt_estimate_tokens[0] = _real_prompt_tokens
+            _live_prompt_estimate_tool_delta_tokens[0] = 0
        elif _live_prompt_estimate_tokens[0] > _real_prompt_tokens:
            _usage['last_prompt_tokens'] = _live_prompt_estimate_tokens[0]

@@ -20,3 +20,31 @@ def test_live_usage_estimate_preserves_real_prompt_when_exact_prompt_advances():
    )

    assert usage["last_prompt_tokens"] == 136_000
+    assert usage["turn_tool_prompt_tokens"] == 0
+
+
+def test_live_usage_estimate_caps_cumulative_tool_delta_per_turn():
+    from api import streaming
+
+    base_prompt_tokens = 86_723
+    turn_tool_prompt_tokens = 0
+    usage = None
+    original_delta = streaming._bounded_live_tool_prompt_delta
+    streaming._bounded_live_tool_prompt_delta = lambda messages, cap=12_000: int(cap or 0)
+
+    try:
+        for _ in range(20):
+            usage = live_usage_prompt_estimate_after_tool_delta(
+                base_prompt_tokens=base_prompt_tokens,
+                exact_prompt_tokens=base_prompt_tokens,
+                messages=[{"role": "tool", "content": "x" * 80_000}],
+                turn_tool_prompt_tokens=turn_tool_prompt_tokens,
+            )
+            turn_tool_prompt_tokens = usage["turn_tool_prompt_tokens"]
+    finally:
+        streaming._bounded_live_tool_prompt_delta = original_delta
+
+    assert usage is not None
+    assert usage["turn_tool_prompt_tokens"] == 24_000
+    assert usage["last_prompt_tokens"] == base_prompt_tokens + 24_000
+    assert usage["last_prompt_tokens"] < base_prompt_tokens + (20 * 12_000)