From 89d8c3a94cb48ffec3b590a7cac39f49d822c492 Mon Sep 17 00:00:00 2001 From: Frank Song Date: Mon, 25 May 2026 15:54:01 +0800 Subject: [PATCH] fix: cap live tool prompt estimate per turn --- CHANGELOG.md | 4 +++ api/streaming.py | 35 ++++++++++++++++----- tests/test_streaming_live_usage_estimate.py | 28 +++++++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42919130..c5060306 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ ## [Unreleased] +### Fixed + +- Live token-usage hints now cap the cumulative in-flight tool-result prompt estimate per assistant turn, preventing many large tool callbacks from temporarily inflating the context ring before exact provider accounting arrives. + ## [v0.51.134] — 2026-05-25 — Release DF (stage-batch16 — single-PR Windows path defaults) ### Fixed diff --git a/api/streaming.py b/api/streaming.py index e18e8792..9925738d 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -2851,6 +2851,7 @@ _TOOL_RESULT_SNIPPET_MAX = 4000 _LIVE_TOOL_PROMPT_DELTA_MAX = 12_000 +_LIVE_TOOL_PROMPT_TURN_MAX = 24_000 def _bounded_live_tool_prompt_delta(messages, *, cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX) -> int: @@ -2878,19 +2879,32 @@ def live_usage_prompt_estimate_after_tool_delta( exact_prompt_tokens: int = 0, messages=None, cap: int = _LIVE_TOOL_PROMPT_DELTA_MAX, + turn_tool_prompt_tokens: int = 0, + turn_cap: int = _LIVE_TOOL_PROMPT_TURN_MAX, ) -> dict: """Compute the live `last_prompt_tokens` estimate after a tool update. Exact compressor/provider prompt accounting wins. When no newer exact prompt - is available, add only a bounded live tool delta to the persisted base. + is available, add only bounded live tool deltas to the persisted base. """ base = int(base_prompt_tokens or 0) exact = int(exact_prompt_tokens or 0) if exact and exact != base: - return {'last_prompt_tokens': exact, 'estimated': False} + return { + 'last_prompt_tokens': exact, + 'estimated': False, + 'turn_tool_prompt_tokens': 0, + } + prior_turn_delta = max(0, int(turn_tool_prompt_tokens or 0)) + turn_ceiling = max(0, int(turn_cap or 0)) + next_turn_delta = min( + prior_turn_delta + _bounded_live_tool_prompt_delta(messages, cap=cap), + turn_ceiling, + ) return { - 'last_prompt_tokens': base + _bounded_live_tool_prompt_delta(messages, cap=cap), + 'last_prompt_tokens': base + next_turn_delta, 'estimated': True, + 'turn_tool_prompt_tokens': next_turn_delta, } @@ -3396,6 +3410,7 @@ def _run_agent_streaming( agent = None _live_prompt_estimate_tokens = [0] _live_prompt_exact_tokens = [0] + _live_prompt_estimate_tool_delta_tokens = [0] _live_prompt_estimate_seen_ids = set() def _seed_live_prompt_estimate() -> int: @@ -3425,10 +3440,15 @@ def _run_agent_streaming( """Increment a rough next-prompt estimate from live tool activity.""" if not messages: return _live_prompt_estimate_tokens[0] - _delta = _bounded_live_tool_prompt_delta(messages) - if _delta > 0: - _seed_live_prompt_estimate() - _live_prompt_estimate_tokens[0] += _delta + _seed_live_prompt_estimate() + _usage = live_usage_prompt_estimate_after_tool_delta( + base_prompt_tokens=_live_prompt_exact_tokens[0], + exact_prompt_tokens=_live_prompt_exact_tokens[0], + messages=messages, + turn_tool_prompt_tokens=_live_prompt_estimate_tool_delta_tokens[0], + ) + _live_prompt_estimate_tokens[0] = _usage['last_prompt_tokens'] + _live_prompt_estimate_tool_delta_tokens[0] = _usage['turn_tool_prompt_tokens'] return _live_prompt_estimate_tokens[0] def _live_usage_snapshot(): @@ -3490,6 +3510,7 @@ def _run_agent_streaming( if _real_prompt_tokens and _real_prompt_tokens != _live_prompt_exact_tokens[0]: _live_prompt_exact_tokens[0] = _real_prompt_tokens _live_prompt_estimate_tokens[0] = _real_prompt_tokens + _live_prompt_estimate_tool_delta_tokens[0] = 0 elif _live_prompt_estimate_tokens[0] > _real_prompt_tokens: _usage['last_prompt_tokens'] = _live_prompt_estimate_tokens[0] diff --git a/tests/test_streaming_live_usage_estimate.py b/tests/test_streaming_live_usage_estimate.py index 57d87c08..9f1edb04 100644 --- a/tests/test_streaming_live_usage_estimate.py +++ b/tests/test_streaming_live_usage_estimate.py @@ -20,3 +20,31 @@ def test_live_usage_estimate_preserves_real_prompt_when_exact_prompt_advances(): ) assert usage["last_prompt_tokens"] == 136_000 + assert usage["turn_tool_prompt_tokens"] == 0 + + +def test_live_usage_estimate_caps_cumulative_tool_delta_per_turn(): + from api import streaming + + base_prompt_tokens = 86_723 + turn_tool_prompt_tokens = 0 + usage = None + original_delta = streaming._bounded_live_tool_prompt_delta + streaming._bounded_live_tool_prompt_delta = lambda messages, cap=12_000: int(cap or 0) + + try: + for _ in range(20): + usage = live_usage_prompt_estimate_after_tool_delta( + base_prompt_tokens=base_prompt_tokens, + exact_prompt_tokens=base_prompt_tokens, + messages=[{"role": "tool", "content": "x" * 80_000}], + turn_tool_prompt_tokens=turn_tool_prompt_tokens, + ) + turn_tool_prompt_tokens = usage["turn_tool_prompt_tokens"] + finally: + streaming._bounded_live_tool_prompt_delta = original_delta + + assert usage is not None + assert usage["turn_tool_prompt_tokens"] == 24_000 + assert usage["last_prompt_tokens"] == base_prompt_tokens + 24_000 + assert usage["last_prompt_tokens"] < base_prompt_tokens + (20 * 12_000)