From 89d8c3a94cb48ffec3b590a7cac39f49d822c492 Mon Sep 17 00:00:00 2001
From: Frank Song <franksong2702@gmail.com>
Date: Mon, 25 May 2026 15:54:01 +0800
Subject: [PATCH] fix: cap live tool prompt estimate per turn

---
 CHANGELOG.md                                |  4 +++
 api/streaming.py                            | 35 ++++++++++++++++-----
 tests/test_streaming_live_usage_estimate.py | 28 +++++++++++++++++
 3 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42919130..c5060306 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@
 
 ## [Unreleased]
 
+### Fixed
+
+- Live token-usage hints now cap the cumulative in-flight tool-result prompt estimate at 24,000 estimated tokens per assistant turn (on top of the existing 12,000-token cap per tool callback), preventing many large tool callbacks from temporarily inflating the context ring before exact provider accounting arrives.
+
 ## [v0.51.134] — 2026-05-25 — Release DF (stage-batch16 — single-PR Windows path defaults)
 
 ### Fixed
diff --git a/api/streaming.py b/api/streaming.py
index e18e8792..9925738d 100644
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -2851,6 +2851,7 @@ _TOOL_RESULT_SNIPPET_MAX = 4000
 
 
 _LIVE_TOOL_PROMPT_DELTA_MAX = 12_000
+_LIVE_TOOL_PROMPT_TURN_MAX = 24_000
 
 
 def _bounded_live_tool_prompt_delta(messages, *, cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX) -> int:
@@ -2878,19 +2879,32 @@ def live_usage_prompt_estimate_after_tool_delta(
     exact_prompt_tokens: int = 0,
     messages=None,
     cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX,
+    turn_tool_prompt_tokens: int = 0,
+    turn_cap: int = _LIVE_TOOL_PROMPT_TURN_MAX,
 ) -> dict:
     """Compute the live `last_prompt_tokens` estimate after a tool update.
 
     Exact compressor/provider prompt accounting wins. When no newer exact prompt
-    is available, add only a bounded live tool delta to the persisted base.
+    is available, add tool deltas capped per call and per turn to the base.
     """
     base = int(base_prompt_tokens or 0)
     exact = int(exact_prompt_tokens or 0)
     if exact and exact != base:
-        return {'last_prompt_tokens': exact, 'estimated': False}
+        return {
+            'last_prompt_tokens': exact,
+            'estimated': False,
+            'turn_tool_prompt_tokens': 0,
+        }
+    prior_turn_delta = max(0, int(turn_tool_prompt_tokens or 0))
+    turn_ceiling = max(0, int(turn_cap or 0))
+    next_turn_delta = min(
+        prior_turn_delta + _bounded_live_tool_prompt_delta(messages, cap=cap),
+        turn_ceiling,
+    )
     return {
-        'last_prompt_tokens': base + _bounded_live_tool_prompt_delta(messages, cap=cap),
+        'last_prompt_tokens': base + next_turn_delta,
         'estimated': True,
+        'turn_tool_prompt_tokens': next_turn_delta,
     }
 
 
@@ -3396,6 +3410,7 @@ def _run_agent_streaming(
     agent = None
     _live_prompt_estimate_tokens = [0]
     _live_prompt_exact_tokens = [0]
+    _live_prompt_estimate_tool_delta_tokens = [0]
     _live_prompt_estimate_seen_ids = set()
 
     def _seed_live_prompt_estimate() -> int:
@@ -3425,10 +3440,15 @@ def _run_agent_streaming(
         """Increment a rough next-prompt estimate from live tool activity."""
         if not messages:
             return _live_prompt_estimate_tokens[0]
-        _delta = _bounded_live_tool_prompt_delta(messages)
-        if _delta > 0:
-            _seed_live_prompt_estimate()
-            _live_prompt_estimate_tokens[0] += _delta
+        _seed_live_prompt_estimate()
+        _usage = live_usage_prompt_estimate_after_tool_delta(
+            base_prompt_tokens=_live_prompt_exact_tokens[0],
+            exact_prompt_tokens=_live_prompt_exact_tokens[0],
+            messages=messages,
+            turn_tool_prompt_tokens=_live_prompt_estimate_tool_delta_tokens[0],
+        )
+        _live_prompt_estimate_tokens[0] = _usage['last_prompt_tokens']
+        _live_prompt_estimate_tool_delta_tokens[0] = _usage['turn_tool_prompt_tokens']
         return _live_prompt_estimate_tokens[0]
 
     def _live_usage_snapshot():
@@ -3490,6 +3510,7 @@ def _run_agent_streaming(
         if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]:
             _live_prompt_exact_tokens[0] = _real_prompt_tokens
             _live_prompt_estimate_tokens[0] = _real_prompt_tokens
+            _live_prompt_estimate_tool_delta_tokens[0] = 0
         elif _live_prompt_estimate_tokens[0] > _real_prompt_tokens:
             _usage['last_prompt_tokens'] = _live_prompt_estimate_tokens[0]
 
diff --git a/tests/test_streaming_live_usage_estimate.py b/tests/test_streaming_live_usage_estimate.py
index 57d87c08..9f1edb04 100644
--- a/tests/test_streaming_live_usage_estimate.py
+++ b/tests/test_streaming_live_usage_estimate.py
@@ -20,3 +20,31 @@ def test_live_usage_estimate_preserves_real_prompt_when_exact_prompt_advances():
     )
 
     assert usage["last_prompt_tokens"] == 136_000
+    assert usage["turn_tool_prompt_tokens"] == 0
+
+
+def test_live_usage_estimate_caps_cumulative_tool_delta_per_turn():
+    from api import streaming
+
+    base_prompt_tokens = 86_723
+    turn_tool_prompt_tokens = 0
+    usage = None
+    original_delta = streaming._bounded_live_tool_prompt_delta
+    streaming._bounded_live_tool_prompt_delta = lambda messages, cap=12_000: int(cap or 0)
+
+    try:
+        for _ in range(20):
+            usage = live_usage_prompt_estimate_after_tool_delta(
+                base_prompt_tokens=base_prompt_tokens,
+                exact_prompt_tokens=base_prompt_tokens,
+                messages=[{"role": "tool", "content": "x" * 80_000}],
+                turn_tool_prompt_tokens=turn_tool_prompt_tokens,
+            )
+            turn_tool_prompt_tokens = usage["turn_tool_prompt_tokens"]
+    finally:
+        streaming._bounded_live_tool_prompt_delta = original_delta
+
+    assert usage is not None
+    assert usage["turn_tool_prompt_tokens"] == 24_000
+    assert usage["last_prompt_tokens"] == base_prompt_tokens + 24_000
+    assert usage["last_prompt_tokens"] < base_prompt_tokens + (20 * 12_000)