hermes-agent/agent/conversation_loop.py

"""The agent conversation loop — extracted from ``run_agent.AIAgent``.

This is the biggest single chunk pulled out of ``run_agent.py``: the
roughly 3,900-line :func:`run_conversation` body that drives one user
turn through the agent (model call, tool dispatch, retries, fallbacks,
compression, post-turn hooks, background memory/skill review nudges).

The function takes the parent ``AIAgent`` instance as its first
argument (``agent``) and accesses its state via attribute lookup.
``_ra().AIAgent.run_conversation`` is now a thin forwarder.

Symbols that production code or tests patch on ``run_agent`` directly
(``handle_function_call``, ``_set_interrupt``, ``OpenAI``, ...) are
resolved through :func:`_ra` so those patches keep working.
"""

from __future__ import annotations

import json
import logging
import os
import random
import re
import ssl
import threading
import time
import uuid
from typing import Any, Dict, List, Optional

from agent.anthropic_adapter import _is_oauth_token
from agent.auxiliary_client import set_runtime_main
from agent.codex_responses_adapter import _summarize_user_message_for_log
from agent.display import KawaiiSpinner
from agent.error_classifier import FailoverReason, classify_api_error
from agent.iteration_budget import IterationBudget
from agent.memory_manager import build_memory_context_block
from agent.message_sanitization import (
    _repair_tool_call_arguments,
    _sanitize_messages_non_ascii,
    _sanitize_messages_surrogates,
    _sanitize_structure_non_ascii,
    _sanitize_structure_surrogates,
    _sanitize_surrogates,
    _sanitize_tools_non_ascii,
    _strip_images_from_messages,
    _strip_non_ascii,
)
from agent.model_metadata import (
    estimate_messages_tokens_rough,
    estimate_request_tokens_rough,
    get_next_probe_tier,
    parse_available_output_tokens_from_error,
    parse_context_limit_from_error,
    save_context_length,
)
from agent.nous_rate_guard import (
    clear_nous_rate_limit,
    is_genuine_nous_rate_limit,
    nous_rate_limit_remaining,
    record_nous_rate_limit,
)
from agent.process_bootstrap import _install_safe_stdio
from agent.prompt_caching import apply_anthropic_cache_control
from agent.retry_utils import jittered_backoff
from agent.trajectory import has_incomplete_scratchpad
from agent.usage_pricing import estimate_usage_cost, normalize_usage
from hermes_constants import display_hermes_home as _dhh_fn
from hermes_logging import set_session_context
from tools.schema_sanitizer import strip_pattern_and_format
from tools.skill_provenance import set_current_write_origin
from utils import base_url_host_matches, env_var_enabled

logger = logging.getLogger(__name__)


def _ra():
    """Lazy reference to ``run_agent`` so callers can patch
    ``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
    ``run_agent.OpenAI`` and have those patches reach this code path.
    """
    import run_agent
    return run_agent


def _restore_or_build_system_prompt(agent, system_message, conversation_history):
    """Restore the cached system prompt from the session DB or build it fresh.

    Mutates ``agent._cached_system_prompt`` and persists a freshly-built
    prompt back to the session DB on first build.  Extracted from
    ``run_conversation`` so the prefix-cache restore path can be tested in
    isolation.

    Three-way state distinction for the stored row, surfaced via logs so
    silent prefix-cache misses are visible in ``agent.log``:

      * ``missing`` — no session row yet (legitimate first turn).
      * ``null``   — row exists, ``system_prompt`` column is NULL.
        Legacy session predating system-prompt persistence, or a migration
        leftover.  Warns when ``conversation_history`` is non-empty.
      * ``empty``  — row exists, ``system_prompt`` column is the empty
        string.  Indicates a previous-turn write that ran but stored
        nothing (silent persistence bug).  Always warns.
      * ``present`` — row exists with a usable prompt → reused verbatim.

    Read or write failures against the session DB log at WARNING (not
    DEBUG) so persistent issues (disk full, schema drift, lock contention)
    surface without needing verbose mode.  This used to be a debug-level
    log that silently broke prefix-cache reuse on the gateway path
    (which constructs a fresh ``AIAgent`` per turn and depends on this
    DB roundtrip).
    """
    stored_prompt = None
    stored_state = "missing"
    if conversation_history and agent._session_db:
        try:
            session_row = agent._session_db.get_session(agent.session_id)
            if session_row is not None:
                raw_prompt = session_row.get("system_prompt")
                if raw_prompt is None:
                    stored_state = "null"
                elif raw_prompt == "":
                    stored_state = "empty"
                else:
                    stored_prompt = raw_prompt
                    stored_state = "present"
        except Exception as exc:
            logger.warning(
                "Session DB get_session failed for system-prompt restore "
                "(session=%s): %s. Falling back to fresh build — prefix "
                "cache will miss for this turn.",
                agent.session_id, exc,
            )

    if stored_prompt:
        # Continuing session — reuse the exact system prompt from the
        # previous turn so the Anthropic cache prefix matches.
        agent._cached_system_prompt = stored_prompt
        return

    if conversation_history and stored_state in ("null", "empty"):
        # Continuing session whose stored prompt is unusable.  The
        # previous turn's write either never happened or wrote an empty
        # string — either way every turn now rebuilds and the prefix
        # cache misses every time.
        logger.warning(
            "Stored system prompt for session %s is %s; rebuilding "
            "from scratch this turn. Prefix cache will miss until "
            "the rebuild persists. Investigate the previous turn's "
            "update_system_prompt write path.",
            agent.session_id, stored_state,
        )

    # First turn of a new session (or recovering from a broken stored
    # prompt) — build from scratch.
    agent._cached_system_prompt = agent._build_system_prompt(system_message)

    # Plugin hook: on_session_start — fired once when a brand-new
    # session is created (not on continuation).  Plugins can use this
    # to initialise session-scoped state (e.g. warm a memory cache).
    try:
        from hermes_cli.plugins import invoke_hook as _invoke_hook
        _invoke_hook(
            "on_session_start",
            session_id=agent.session_id,
            model=agent.model,
            platform=getattr(agent, "platform", None) or "",
        )
    except Exception as exc:
        logger.warning("on_session_start hook failed: %s", exc)

    # Persist the system prompt snapshot in SQLite.  Failure here used
    # to log at DEBUG, which silently broke prefix-cache reuse on the
    # gateway path (fresh AIAgent per turn → reads from this row every
    # subsequent turn).
    if agent._session_db:
        try:
            agent._session_db.update_system_prompt(agent.session_id, agent._cached_system_prompt)
        except Exception as exc:
            logger.warning(
                "Session DB update_system_prompt failed for session %s: "
                "%s. Subsequent turns will rebuild the system prompt and "
                "miss the prefix cache.",
                agent.session_id, exc,
            )


def run_conversation(
    agent,
    user_message: str,
    system_message: str = None,
    conversation_history: List[Dict[str, Any]] = None,
    task_id: str = None,
    stream_callback: Optional[callable] = None,
    persist_user_message: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run a complete conversation with tool calling until completion.

    Args:
        user_message (str): The user's message/question
        system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
        conversation_history (List[Dict]): Previous conversation messages (optional)
        task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
        stream_callback: Optional callback invoked with each text delta during streaming.
            Used by the TTS pipeline to start audio generation before the full response.
            When None (default), API calls use the standard non-streaming path.
        persist_user_message: Optional clean user message to store in
            transcripts/history when user_message contains API-only
            synthetic prefixes.
                or queuing follow-up prefetch work.

    Returns:
        Dict: Complete conversation result with final response and message history
    """
    # Guard stdio against OSError from broken pipes (systemd/headless/daemon).
    # Installed once, transparent when streams are healthy, prevents crash on write.
    _install_safe_stdio()

    agent._ensure_db_session()

    # Tell auxiliary_client what the live main provider/model are for
    # this turn. Used by tools whose behaviour depends on the active
    # main model (e.g. vision_analyze's native fast path) so they see
    # the CLI/gateway override instead of the stale config.yaml
    # default. Idempotent — fine to call every turn.
    try:
        from agent.auxiliary_client import set_runtime_main
        set_runtime_main(
            getattr(agent, "provider", "") or "",
            getattr(agent, "model", "") or "",
        )
    except Exception:
        pass

    # Tag all log records on this thread with the session ID so
    # ``hermes logs --session <id>`` can filter a single conversation.
    from hermes_logging import set_session_context
    set_session_context(agent.session_id)

    # Bind the skill write-origin ContextVar for this thread so tool
    # handlers (e.g. skill_manage create) can tell whether they are
    # running inside the background agent-improvement review fork vs.
    # a foreground user-directed turn. Set at the top of each call;
    # the review fork runs on its own thread with a fresh context,
    # so the foreground value here does not leak into it.
    from tools.skill_provenance import set_current_write_origin
    set_current_write_origin(getattr(agent, "_memory_write_origin", "assistant_tool"))

    # If the previous turn activated fallback, restore the primary
    # runtime so this turn gets a fresh attempt with the preferred model.
    # No-op when _fallback_activated is False (gateway, first turn, etc.).
    agent._restore_primary_runtime()

    # Sanitize surrogate characters from user input.  Clipboard paste from
    # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates
    # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK.
    if isinstance(user_message, str):
        user_message = _sanitize_surrogates(user_message)
    if isinstance(persist_user_message, str):
        persist_user_message = _sanitize_surrogates(persist_user_message)

    # Store stream callback for _interruptible_api_call to pick up
    agent._stream_callback = stream_callback
    agent._persist_user_message_idx = None
    agent._persist_user_message_override = persist_user_message
    # Generate unique task_id if not provided to isolate VMs between concurrent tasks
    effective_task_id = task_id or str(uuid.uuid4())
    # Expose the active task_id so tools running mid-turn (e.g. delegate_task
    # in delegate_tool.py) can identify this agent for the cross-agent file
    # state registry.  Set BEFORE any tool dispatch so snapshots taken at
    # child-launch time see the parent's real id, not None.
    agent._current_task_id = effective_task_id

    # Reset retry counters and iteration budget at the start of each turn
    # so subagent usage from a previous turn doesn't eat into the next one.
    agent._invalid_tool_retries = 0
    agent._invalid_json_retries = 0
    agent._empty_content_retries = 0
    agent._incomplete_scratchpad_retries = 0
    agent._codex_incomplete_retries = 0
    agent._thinking_prefill_retries = 0
    agent._post_tool_empty_retried = False
    agent._last_content_with_tools = None
    agent._last_content_tools_all_housekeeping = False
    agent._mute_post_response = False
    agent._unicode_sanitization_passes = 0
    agent._tool_guardrails.reset_for_turn()
    agent._tool_guardrail_halt_decision = None
    # True until the server rejects an image_url content part with an error
    # like "Only 'text' content type is supported."  Set to False on first
    # rejection and kept False for the rest of the session so we never re-send
    # images to a text-only endpoint.  Scoped per `_run()` call, not per instance.
    agent._vision_supported = True

    # Pre-turn connection health check: detect and clean up dead TCP
    # connections left over from provider outages or dropped streams.
    # This prevents the next API call from hanging on a zombie socket.
    if agent.api_mode != "anthropic_messages":
        try:
            if agent._cleanup_dead_connections():
                agent._emit_status(
                    "🔌 Detected stale connections from a previous provider "
                    "issue — cleaned up automatically. Proceeding with fresh "
                    "connection."
                )
        except Exception:
            pass
    # Replay compression warning through status_callback for gateway
    # platforms (the callback was not wired during __init__).
    if agent._compression_warning:
        agent._replay_compression_warning()
        agent._compression_warning = None  # send once

    # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here.
    # They are initialized in __init__ and must persist across run_conversation
    # calls so that nudge logic accumulates correctly in CLI mode.
    agent.iteration_budget = IterationBudget(agent.max_iterations)

    # Log conversation turn start for debugging/observability
    _preview_text = _summarize_user_message_for_log(user_message)
    _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text
    _msg_preview = _msg_preview.replace("\n", " ")
    logger.info(
        "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r",
        agent.session_id or "none", agent.model, agent.provider or "unknown",
        agent.platform or "unknown", len(conversation_history or []),
        _msg_preview,
    )

    # Initialize conversation (copy to avoid mutating the caller's list)
    messages = list(conversation_history) if conversation_history else []

    # Hydrate todo store from conversation history (gateway creates a fresh
    # AIAgent per message, so the in-memory store is empty -- we need to
    # recover the todo state from the most recent todo tool response in history)
    if conversation_history and not agent._todo_store.has_items():
        agent._hydrate_todo_store(conversation_history)

    # Hydrate per-session nudge counters from persisted history.
    # Gateway creates a fresh AIAgent per inbound message (cache miss /
    # 1h idle eviction / config-signature mismatch / process restart), so
    # _turns_since_memory and _user_turn_count start at 0 every turn and
    # the memory.nudge_interval trigger may never be reached. Reconstruct
    # an effective count from prior user turns in conversation_history.
    # Idempotent: a cached agent that already accumulated counters keeps
    # them; only a freshly-built agent with empty in-memory state hydrates.
    # See issue #22357.
    if conversation_history and agent._user_turn_count == 0:
        prior_user_turns = sum(
            1 for m in conversation_history if m.get("role") == "user"
        )
        if prior_user_turns > 0:
            agent._user_turn_count = prior_user_turns
            if agent._memory_nudge_interval > 0 and agent._turns_since_memory == 0:
                # % preserves original 1-in-N cadence rather than firing a
                # review immediately on resume (which would surprise users
                # whose session happened to land just past a multiple of N).
                agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval


    # Prefill messages (few-shot priming) are injected at API-call time only,
    # never stored in the messages list. This keeps them ephemeral: they won't
    # be saved to session DB, session logs, or batch trajectories, but they're
    # automatically re-applied on every API call (including session continuations).

    # Track user turns for memory flush and periodic nudge logic
    agent._user_turn_count += 1

    # Reset the streaming context scrubber at the top of each turn so a
    # hung span from a prior interrupted stream can't taint this turn's
    # output.
    scrubber = getattr(agent, "_stream_context_scrubber", None)
    if scrubber is not None:
        scrubber.reset()
    # Reset the think scrubber for the same reason — an interrupted
    # prior stream may have left us inside an unterminated block.
    think_scrubber = getattr(agent, "_stream_think_scrubber", None)
    if think_scrubber is not None:
        think_scrubber.reset()

    # Preserve the original user message (no nudge injection).
    original_user_message = persist_user_message if persist_user_message is not None else user_message

    # Track memory nudge trigger (turn-based, checked here).
    # Skill trigger is checked AFTER the agent loop completes, based on
    # how many tool iterations THIS turn used.
    _should_review_memory = False
    if (agent._memory_nudge_interval > 0
            and "memory" in agent.valid_tool_names
            and agent._memory_store):
        agent._turns_since_memory += 1
        if agent._turns_since_memory >= agent._memory_nudge_interval:
            _should_review_memory = True
            agent._turns_since_memory = 0

    # Add user message
    user_msg = {"role": "user", "content": user_message}
    messages.append(user_msg)
    current_turn_user_idx = len(messages) - 1
    agent._persist_user_message_idx = current_turn_user_idx

    if not agent.quiet_mode:
        _print_preview = _summarize_user_message_for_log(user_message)
        agent._safe_print(f"💬 Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'")

    # ── System prompt (cached per session for prefix caching) ──
    # Built once on first call, reused for all subsequent calls.
    # Only rebuilt after context compression events (which invalidate
    # the cache and reload memory from disk).
    #
    # For continuing sessions (gateway creates a fresh AIAgent per
    # message), we load the stored system prompt from the session DB
    # instead of rebuilding.  Rebuilding would pick up memory changes
    # from disk that the model already knows about (it wrote them!),
    # producing a different system prompt and breaking the Anthropic
    # prefix cache.
    if agent._cached_system_prompt is None:
        _restore_or_build_system_prompt(agent, system_message, conversation_history)

    active_system_prompt = agent._cached_system_prompt

    # ── Preflight context compression ──
    # Before entering the main loop, check if the loaded conversation
    # history already exceeds the model's context threshold.  This handles
    # cases where a user switches to a model with a smaller context window
    # while having a large existing session — compress proactively rather
    # than waiting for an API error (which might be caught as a non-retryable
    # 4xx and abort the request entirely).
    if (
        agent.compression_enabled
        and len(messages) > agent.context_compressor.protect_first_n
                            + agent.context_compressor.protect_last_n + 1
    ):
        # Include tool schema tokens — with many tools these can add
        # 20-30K+ tokens that the old sys+msg estimate missed entirely.
        _preflight_tokens = estimate_request_tokens_rough(
            messages,
            system_prompt=active_system_prompt or "",
            tools=agent.tools or None,
        )

        if _preflight_tokens >= agent.context_compressor.threshold_tokens:
            logger.info(
                "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
                f"{_preflight_tokens:,}",
                f"{agent.context_compressor.threshold_tokens:,}",
                agent.model,
                f"{agent.context_compressor.context_length:,}",
            )
            agent._emit_status(
                f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
                f">= {agent.context_compressor.threshold_tokens:,} threshold. "
                "This may take a moment."
            )
            # May need multiple passes for very large sessions with small
            # context windows (each pass summarises the middle N turns).
            for _pass in range(3):
                _orig_len = len(messages)
                messages, active_system_prompt = agent._compress_context(
                    messages, system_message, approx_tokens=_preflight_tokens,
                    task_id=effective_task_id,
                )
                if len(messages) >= _orig_len:
                    break  # Cannot compress further
                # Compression created a new session — clear the history
                # reference so _flush_messages_to_session_db writes ALL
                # compressed messages to the new session's SQLite, not
                # skipping them because conversation_history is still the
                # pre-compression length.
                conversation_history = None
                # Fix: reset retry counters after compression so the model
                # gets a fresh budget on the compressed context.  Without
                # this, pre-compression retries carry over and the model
                # hits "(empty)" immediately after compression-induced
                # context loss.
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
                agent._last_content_tools_all_housekeeping = False
                agent._mute_post_response = False
                # Re-estimate after compression
                _preflight_tokens = estimate_request_tokens_rough(
                    messages,
                    system_prompt=active_system_prompt or "",
                    tools=agent.tools or None,
                )
                if _preflight_tokens < agent.context_compressor.threshold_tokens:
                    break  # Under threshold

    # Plugin hook: pre_llm_call
    # Fired once per turn before the tool-calling loop.  Plugins can
    # return a dict with a ``context`` key (or a plain string) whose
    # value is appended to the current turn's user message.
    #
    # Context is ALWAYS injected into the user message, never the
    # system prompt.  This preserves the prompt cache prefix — the
    # system prompt stays identical across turns so cached tokens
    # are reused.  The system prompt is Hermes's territory; plugins
    # contribute context alongside the user's input.
    #
    # All injected context is ephemeral (not persisted to session DB).
    _plugin_user_context = ""
    try:
        from hermes_cli.plugins import invoke_hook as _invoke_hook
        _pre_results = _invoke_hook(
            "pre_llm_call",
            session_id=agent.session_id,
            user_message=original_user_message,
            conversation_history=list(messages),
            is_first_turn=(not bool(conversation_history)),
            model=agent.model,
            platform=getattr(agent, "platform", None) or "",
            sender_id=getattr(agent, "_user_id", None) or "",
        )
        _ctx_parts: list[str] = []
        for r in _pre_results:
            if isinstance(r, dict) and r.get("context"):
                _ctx_parts.append(str(r["context"]))
            elif isinstance(r, str) and r.strip():
                _ctx_parts.append(r)
        if _ctx_parts:
            _plugin_user_context = "\n\n".join(_ctx_parts)
    except Exception as exc:
        logger.warning("pre_llm_call hook failed: %s", exc)

    # Main conversation loop
    api_call_count = 0
    final_response = None
    interrupted = False
    codex_ack_continuations = 0
    length_continue_retries = 0
    truncated_tool_call_retries = 0
    truncated_response_parts: List[str] = []
    compression_attempts = 0
    _turn_exit_reason = "unknown"  # Diagnostic: why the loop ended

    # Per-turn file-mutation verifier state.  Keyed by resolved path;
    # each failed ``write_file`` / ``patch`` call records the error
    # preview.  Later successful writes to the same path remove the
    # entry (the model recovered).  At end-of-turn, any entries still
    # present are surfaced in an advisory footer so the model cannot
    # over-claim success while the file is actually unchanged on disk.
    agent._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {}

    # Record the execution thread so interrupt()/clear_interrupt() can
    # scope the tool-level interrupt signal to THIS agent's thread only.
    # Must be set before any thread-scoped interrupt syncing.
    agent._execution_thread_id = threading.current_thread().ident

    # Always clear stale per-thread state from a previous turn. If an
    # interrupt arrived before startup finished, preserve it and bind it
    # to this execution thread now instead of dropping it on the floor.
    _ra()._set_interrupt(False, agent._execution_thread_id)
    if agent._interrupt_requested:
        _ra()._set_interrupt(True, agent._execution_thread_id)
        agent._interrupt_thread_signal_pending = False
    else:
        agent._interrupt_message = None
        agent._interrupt_thread_signal_pending = False

    # Notify memory providers of the new turn so cadence tracking works.
    # Must happen BEFORE prefetch_all() so providers know which turn it is
    # and can gate context/dialectic refresh via contextCadence/dialecticCadence.
    if agent._memory_manager:
        try:
            _turn_msg = original_user_message if isinstance(original_user_message, str) else ""
            agent._memory_manager.on_turn_start(agent._user_turn_count, _turn_msg)
        except Exception:
            pass

    # External memory provider: prefetch once before the tool loop.
    # Reuse the cached result on every iteration to avoid re-calling
    # prefetch_all() on each tool call (10 tool calls = 10x latency + cost).
    # Use original_user_message (clean input) — user_message may contain
    # injected skill content that bloats / breaks provider queries.
    _ext_prefetch_cache = ""
    if agent._memory_manager:
        try:
            _query = original_user_message if isinstance(original_user_message, str) else ""
            _ext_prefetch_cache = agent._memory_manager.prefetch_all(_query) or ""
        except Exception:
            pass

    # Optional opt-in runtime: if api_mode == codex_app_server, hand the
    # turn to the codex app-server subprocess (terminal/file ops/patching
    # all run inside Codex). Default Hermes path is bypassed entirely.
    # See agent/transports/codex_app_server_session.py for the adapter
    # and references/codex-app-server-runtime.md for the rationale.
    if agent.api_mode == "codex_app_server":
        return agent._run_codex_app_server_turn(
            user_message=user_message,
            original_user_message=original_user_message,
            messages=messages,
            effective_task_id=effective_task_id,
            should_review_memory=_should_review_memory,
        )

    while (api_call_count < agent.max_iterations and agent.iteration_budget.remaining > 0) or agent._budget_grace_call:
        # Reset per-turn checkpoint dedup so each iteration can take one snapshot
        agent._checkpoint_mgr.new_turn()

        # Check for interrupt request (e.g., user sent new message)
        if agent._interrupt_requested:
            interrupted = True
            _turn_exit_reason = "interrupted_by_user"
            if not agent.quiet_mode:
                agent._safe_print("\n⚡ Breaking out of tool loop due to interrupt...")
            break

        api_call_count += 1
        agent._api_call_count = api_call_count
        agent._touch_activity(f"starting API call #{api_call_count}")

        # Grace call: the budget is exhausted but we gave the model one
        # more chance.  Consume the grace flag so the loop exits after
        # this iteration regardless of outcome.
        if agent._budget_grace_call:
            agent._budget_grace_call = False
        elif not agent.iteration_budget.consume():
            _turn_exit_reason = "budget_exhausted"
            if not agent.quiet_mode:
                agent._safe_print(f"\n⚠️  Iteration budget exhausted ({agent.iteration_budget.used}/{agent.iteration_budget.max_total} iterations used)")
            break

        # Fire step_callback for gateway hooks (agent:step event)
        if agent.step_callback is not None:
            try:
                prev_tools = []
                for _idx, _m in enumerate(reversed(messages)):
                    if _m.get("role") == "assistant" and _m.get("tool_calls"):
                        _fwd_start = len(messages) - _idx
                        _results_by_id = {}
                        for _tm in messages[_fwd_start:]:
                            if _tm.get("role") != "tool":
                                break
                            _tcid = _tm.get("tool_call_id")
                            if _tcid:
                                _results_by_id[_tcid] = _tm.get("content", "")
                        prev_tools = [
                            {
                                "name": tc["function"]["name"],
                                "result": _results_by_id.get(tc.get("id")),
                                "arguments": tc["function"].get("arguments"),
                            }
                            for tc in _m["tool_calls"]
                            if isinstance(tc, dict)
                        ]
                        break
                agent.step_callback(api_call_count, prev_tools)
            except Exception as _step_err:
                logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)

        # Track tool-calling iterations for skill nudge.
        # Counter resets whenever skill_manage is actually used.
        if (agent._skill_nudge_interval > 0
                and "skill_manage" in agent.valid_tool_names):
            agent._iters_since_skill += 1

        # ── Pre-API-call /steer drain ──────────────────────────────────
        # If a /steer arrived during the previous API call (while the model
        # was thinking), drain it now — before we build api_messages — so
        # the model sees the steer text on THIS iteration.  Without this,
        # steers sent during an API call only land after the NEXT tool batch,
        # which may never come if the model returns a final response.
        #
        # We scan backwards for the last tool-role message in the messages
        # list.  If found, the steer is appended there.  If not (first
        # iteration, no tools yet), the steer stays pending for the next
        # tool batch — injecting into a user message would break role
        # alternation, and there's no tool output to piggyback on.
        _pre_api_steer = agent._drain_pending_steer()
        if _pre_api_steer:
            _injected = False
            for _si in range(len(messages) - 1, -1, -1):
                _sm = messages[_si]
                if isinstance(_sm, dict) and _sm.get("role") == "tool":
                    marker = f"\n\nUser guidance: {_pre_api_steer}"
                    existing = _sm.get("content", "")
                    if isinstance(existing, str):
                        _sm["content"] = existing + marker
                    else:
                        # Multimodal content blocks — append text block
                        try:
                            blocks = list(existing) if existing else []
                            blocks.append({"type": "text", "text": marker})
                            _sm["content"] = blocks
                        except Exception:
                            pass
                    _injected = True
                    logger.debug(
                        "Pre-API-call steer drain: injected into tool msg at index %d",
                        _si,
                    )
                    break
            if not _injected:
                # No tool message to inject into — put it back so
                # the post-tool-execution drain picks it up later.
                _lock = getattr(agent, "_pending_steer_lock", None)
                if _lock is not None:
                    with _lock:
                        if agent._pending_steer:
                            agent._pending_steer = agent._pending_steer + "\n" + _pre_api_steer
                        else:
                            agent._pending_steer = _pre_api_steer
                else:
                    existing = getattr(agent, "_pending_steer", None)
                    agent._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer

        # Prepare messages for API call
        # If we have an ephemeral system prompt, prepend it to the messages
        # Note: Reasoning is embedded in content via <think> tags for trajectory storage.
        # However, providers like Moonshot AI require a separate 'reasoning_content' field
        # on assistant messages with tool_calls. We handle both cases here.
        request_logger = getattr(agent, "logger", None) or logging.getLogger(__name__)
        repaired_tool_calls = agent._sanitize_tool_call_arguments(
            messages,
            logger=request_logger,
            session_id=agent.session_id,
        )
        if repaired_tool_calls > 0:
            request_logger.info(
                "Sanitized %s corrupted tool_call arguments before request (session=%s)",
                repaired_tool_calls,
                agent.session_id or "-",
            )

        # Defensive: repair malformed role-alternation before API call.
        # Catches cases where the history got wedged into a
        # ``tool → user`` or ``user → user`` tail (e.g. after empty-
        # response scaffolding was stripped and a new user message
        # landed after an orphan tool result). Most providers return
        # empty content on malformed sequences, which would otherwise
        # retrigger the empty-retry loop indefinitely.
        repaired_seq = agent._repair_message_sequence(messages)
        if repaired_seq > 0:
            request_logger.info(
                "Repaired %s message-alternation violations before request (session=%s)",
                repaired_seq,
                agent.session_id or "-",
            )

        api_messages = []
        for idx, msg in enumerate(messages):
            api_msg = msg.copy()

            # Inject ephemeral context into the current turn's user message.
            # Sources: memory manager prefetch + plugin pre_llm_call hooks
            # with target="user_message" (the default).  Both are
            # API-call-time only — the original message in `messages` is
            # never mutated, so nothing leaks into session persistence.
            if idx == current_turn_user_idx and msg.get("role") == "user":
                _injections = []
                if _ext_prefetch_cache:
                    _fenced = build_memory_context_block(_ext_prefetch_cache)
                    if _fenced:
                        _injections.append(_fenced)
                if _plugin_user_context:
                    _injections.append(_plugin_user_context)
                if _injections:
                    _base = api_msg.get("content", "")
                    if isinstance(_base, str):
                        api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections)

            # For ALL assistant messages, pass reasoning back to the API
            # This ensures multi-turn reasoning context is preserved
            agent._copy_reasoning_content_for_api(msg, api_msg)

            # Remove 'reasoning' field - it's for trajectory storage only
            # We've copied it to 'reasoning_content' for the API above
            if "reasoning" in api_msg:
                api_msg.pop("reasoning")
            # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
            if "finish_reason" in api_msg:
                api_msg.pop("finish_reason")
            # Strip internal thinking-prefill marker
            api_msg.pop("_thinking_prefill", None)
            # Strip Codex Responses API fields (call_id, response_item_id) for
            # strict providers like Mistral, Fireworks, etc. that reject unknown fields.
            # Uses new dicts so the internal messages list retains the fields
            # for Codex Responses compatibility.
            if agent._should_sanitize_tool_calls():
                agent._sanitize_tool_calls_for_strict_api(api_msg)
            # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context
            # The signature field helps maintain reasoning continuity
            api_messages.append(api_msg)

        # Build the final system message: cached prompt + ephemeral system prompt.
        # Ephemeral additions are API-call-time only (not persisted to session DB).
        # External recall context is injected into the user message, not the system
        # prompt, so the stable cache prefix remains unchanged.
        #
        # NOTE: Plugin context from pre_llm_call hooks is injected into the
        # user message (see injection block above), NOT the system prompt.
        # This is intentional — system prompt modifications break the prompt
        # cache prefix.  The system prompt is reserved for Hermes internals.
        #
        # Hermes invariant: the system prompt is built ONCE per session
        # (cached on ``_cached_system_prompt``) and replayed verbatim on
        # every turn.  We send it as a single content string so the
        # bytes are byte-stable across turns and upstream prompt caches
        # stay warm.
        effective_system = active_system_prompt or ""
        if agent.ephemeral_system_prompt:
            effective_system = (effective_system + "\n\n" + agent.ephemeral_system_prompt).strip()
        if effective_system:
            api_messages = [{"role": "system", "content": effective_system}] + api_messages

        # Inject ephemeral prefill messages right after the system prompt
        # but before conversation history. Same API-call-time-only pattern.
        if agent.prefill_messages:
            sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0
            for idx, pfm in enumerate(agent.prefill_messages):
                api_messages.insert(sys_offset + idx, pfm.copy())

        # Apply Anthropic prompt caching for Claude models on native
        # Anthropic, OpenRouter, and third-party Anthropic-compatible
        # gateways. Auto-detected: if ``_use_prompt_caching`` is set,
        # inject cache_control breakpoints (system + last 3 messages)
        # to reduce input token costs by ~75% on multi-turn
        # conversations.
        if agent._use_prompt_caching:
            api_messages = apply_anthropic_cache_control(
                api_messages,
                cache_ttl=agent._cache_ttl,
                native_anthropic=agent._use_native_cache_layout,
            )

        # Safety net: strip orphaned tool results / add stubs for missing
        # results before sending to the API.  Runs unconditionally — not
        # gated on context_compressor — so orphans from session loading or
        # manual message manipulation are always caught.
        api_messages = agent._sanitize_api_messages(api_messages)

        # Drop thinking-only assistant turns (reasoning but no visible
        # output and no tool_calls) and merge any adjacent user messages
        # left behind. Prevents Anthropic 400s ("The final block in an
        # assistant message cannot be `thinking`.") and equivalent errors
        # from third-party Anthropic-compatible gateways that can't replay
        # a thinking-only turn. Runs on the per-call copy only — the
        # stored conversation history keeps the reasoning block for the
        # UI transcript and session persistence.
        api_messages = agent._drop_thinking_only_and_merge_users(api_messages)

        # Normalize message whitespace and tool-call JSON for consistent
        # prefix matching.  Ensures bit-perfect prefixes across turns,
        # which enables KV cache reuse on local inference servers
        # (llama.cpp, vLLM, Ollama) and improves cache hit rates for
        # cloud providers.  Operates on api_messages (the API copy) so
        # the original conversation history in `messages` is untouched.
        for am in api_messages:
            if isinstance(am.get("content"), str):
                am["content"] = am["content"].strip()
        for am in api_messages:
            tcs = am.get("tool_calls")
            if not tcs:
                continue
            new_tcs = []
            for tc in tcs:
                if isinstance(tc, dict) and "function" in tc:
                    try:
                        args_obj = json.loads(tc["function"]["arguments"])
                        tc = {**tc, "function": {
                            **tc["function"],
                            "arguments": json.dumps(
                                args_obj, separators=(",", ":"),
                                sort_keys=True,
                            ),
                        }}
                    except Exception:
                        tc["function"]["arguments"] = _repair_tool_call_arguments(
                            tc["function"]["arguments"],
                            tc["function"].get("name", "?"),
                        )
                new_tcs.append(tc)
            am["tool_calls"] = new_tcs

        # Proactively strip any surrogate characters before the API call.
        # Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return
        # lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside
        # the OpenAI SDK. Sanitizing here prevents the 3-retry cycle.
        _sanitize_messages_surrogates(api_messages)

        # Calculate approximate request size for logging
        total_chars = sum(len(str(msg)) for msg in api_messages)
        approx_tokens = estimate_messages_tokens_rough(api_messages)

        # Thinking spinner for quiet mode (animated during API call)
        thinking_spinner = None

        if not agent.quiet_mode:
            agent._vprint(f"\n{agent.log_prefix}🔄 Making API call #{api_call_count}/{agent.max_iterations}...")
            agent._vprint(f"{agent.log_prefix}   📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
            agent._vprint(f"{agent.log_prefix}   🔧 Available tools: {len(agent.tools) if agent.tools else 0}")
        else:
            # Animated thinking spinner in quiet mode
            face = random.choice(KawaiiSpinner.get_thinking_faces())
            verb = random.choice(KawaiiSpinner.get_thinking_verbs())
            if agent.thinking_callback:
                # CLI TUI mode: use prompt_toolkit widget instead of raw spinner
                # (works in both streaming and non-streaming modes)
                agent.thinking_callback(f"{face} {verb}...")
            elif not agent._has_stream_consumers() and agent._should_start_quiet_spinner():
                # Raw KawaiiSpinner only when no streaming consumers and the
                # spinner output has a safe sink.
                spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star'])
                thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=agent._print_fn)
                thinking_spinner.start()

        # Log request details if verbose
        if agent.verbose_logging:
            logging.debug(f"API Request - Model: {agent.model}, Messages: {len(messages)}, Tools: {len(agent.tools) if agent.tools else 0}")
            logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}")
            logging.debug(f"Total message size: ~{approx_tokens:,} tokens")

        api_start_time = time.time()
        retry_count = 0
        max_retries = agent._api_max_retries
        primary_recovery_attempted = False
        max_compression_attempts = 3
        codex_auth_retry_attempted=False
        anthropic_auth_retry_attempted=False
        nous_auth_retry_attempted=False
        copilot_auth_retry_attempted=False
        thinking_sig_retry_attempted = False
        image_shrink_retry_attempted = False
        oauth_1m_beta_retry_attempted = False
        llama_cpp_grammar_retry_attempted = False
        has_retried_429 = False
        restart_with_compressed_messages = False
        restart_with_length_continuation = False

        finish_reason = "stop"
        response = None  # Guard against UnboundLocalError if all retries fail
        api_kwargs = None  # Guard against UnboundLocalError in except handler

        while retry_count < max_retries:
            # ── Nous Portal rate limit guard ──────────────────────
            # If another session already recorded that Nous is rate-
            # limited, skip the API call entirely.  Each attempt
            # (including SDK-level retries) counts against RPH and
            # deepens the rate limit hole.
            if agent.provider == "nous":
                try:
                    from agent.nous_rate_guard import (
                        nous_rate_limit_remaining,
                        format_remaining as _fmt_nous_remaining,
                    )
                    _nous_remaining = nous_rate_limit_remaining()
                    if _nous_remaining is not None and _nous_remaining > 0:
                        _nous_msg = (
                            f"Nous Portal rate limit active — "
                            f"resets in {_fmt_nous_remaining(_nous_remaining)}."
                        )
                        agent._vprint(
                            f"{agent.log_prefix}⏳ {_nous_msg} Trying fallback...",
                            force=True,
                        )
                        agent._emit_status(f"⏳ {_nous_msg}")
                        if agent._try_activate_fallback():
                            retry_count = 0
                            compression_attempts = 0
                            primary_recovery_attempted = False
                            continue
                        # No fallback available — return with clear message
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": (
                                f"⏳ {_nous_msg}\n\n"
                                "No fallback provider available. "
                                "Try again after the reset, or add a "
                                "fallback provider in config.yaml."
                            ),
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "failed": True,
                            "error": _nous_msg,
                        }
                except ImportError:
                    pass
                except Exception:
                    pass  # Never let rate guard break the agent loop

            try:
                agent._reset_stream_delivery_tracking()
                api_kwargs = agent._build_api_kwargs(api_messages)
                if agent._force_ascii_payload:
                    _sanitize_structure_non_ascii(api_kwargs)
                if agent.api_mode == "codex_responses":
                    api_kwargs = agent._get_transport().preflight_kwargs(api_kwargs, allow_stream=False)

                try:
                    from hermes_cli.plugins import invoke_hook as _invoke_hook
                    request_messages = api_kwargs.get("messages")
                    if not isinstance(request_messages, list):
                        request_messages = api_kwargs.get("input")
                    if not isinstance(request_messages, list):
                        request_messages = api_messages
                    # Shallow-copy the outer list so plugins that retain the
                    # reference for async snapshotting don't observe later
                    # mutations of api_messages.  The inner dicts are not
                    # mutated by the agent loop, so a shallow copy is
                    # sufficient; a deepcopy would walk every tool result
                    # and base64 image on every API call.
                    _invoke_hook(
                        "pre_api_request",
                        task_id=effective_task_id,
                        session_id=agent.session_id or "",
                        user_message=original_user_message,
                        conversation_history=list(messages),
                        platform=agent.platform or "",
                        model=agent.model,
                        provider=agent.provider,
                        base_url=agent.base_url,
                        api_mode=agent.api_mode,
                        api_call_count=api_call_count,
                        request_messages=list(request_messages) if isinstance(request_messages, list) else [],
                        message_count=len(api_messages),
                        tool_count=len(agent.tools or []),
                        approx_input_tokens=approx_tokens,
                        request_char_count=total_chars,
                        max_tokens=agent.max_tokens,
                    )
                except Exception:
                    pass

                if env_var_enabled("HERMES_DUMP_REQUESTS"):
                    agent._dump_api_request_debug(api_kwargs, reason="preflight")

                # Always prefer the streaming path — even without stream
                # consumers.  Streaming gives us fine-grained health
                # checking (90s stale-stream detection, 60s read timeout)
                # that the non-streaming path lacks.  Without this,
                # subagents and other quiet-mode callers can hang
                # indefinitely when the provider keeps the connection
                # alive with SSE pings but never delivers a response.
                # The streaming path is a no-op for callbacks when no
                # consumers are registered, and falls back to non-
                # streaming automatically if the provider doesn't
                # support it.
                def _stop_spinner():
                    nonlocal thinking_spinner
                    if thinking_spinner:
                        thinking_spinner.stop("")
                        thinking_spinner = None
                    if agent.thinking_callback:
                        agent.thinking_callback("")

                _use_streaming = True
                # Provider signaled "stream not supported" on a previous
                # attempt — switch to non-streaming for the rest of this
                # session instead of re-failing every retry.
                if getattr(agent, "_disable_streaming", False):
                    _use_streaming = False
                # CopilotACPClient communicates via subprocess stdio and
                # returns a plain SimpleNamespace — not an iterable
                # stream.  Mirror the ACP exclusion used for Responses
                # API upgrade (lines ~1083-1085).
                elif (
                    agent.provider == "copilot-acp"
                    or str(agent.base_url or "").lower().startswith("acp://copilot")
                    or str(agent.base_url or "").lower().startswith("acp+tcp://")
                ):
                    _use_streaming = False
                elif not agent._has_stream_consumers():
                    # No display/TTS consumer. Still prefer streaming for
                    # health checking, but skip for Mock clients in tests
                    # (mocks return SimpleNamespace, not stream iterators).
                    from unittest.mock import Mock
                    if isinstance(getattr(agent, "client", None), Mock):
                        _use_streaming = False

                if _use_streaming:
                    response = agent._interruptible_streaming_api_call(
                        api_kwargs, on_first_delta=_stop_spinner
                    )
                else:
                    response = agent._interruptible_api_call(api_kwargs)

                api_duration = time.time() - api_start_time

                # Stop thinking spinner silently -- the response box or tool
                # execution messages that follow are more informative.
                if thinking_spinner:
                    thinking_spinner.stop("")
                    thinking_spinner = None
                if agent.thinking_callback:
                    agent.thinking_callback("")

                if not agent.quiet_mode:
                    agent._vprint(f"{agent.log_prefix}⏱️  API call completed in {api_duration:.2f}s")

                if agent.verbose_logging:
                    # Log response with provider info if available
                    resp_model = getattr(response, 'model', 'N/A') if response else 'N/A'
                    logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")

                # Validate response shape before proceeding
                response_invalid = False
                error_details = []
                if agent.api_mode == "codex_responses":
                    _ct_v = agent._get_transport()
                    if not _ct_v.validate_response(response):
                        if response is None:
                            response_invalid = True
                            error_details.append("response is None")
                        else:
                            # Provider returned a terminal failure (e.g. quota exhaustion).
                            # Treat as invalid so the fallback chain is triggered instead of
                            # letting the error bubble up outside the retry/fallback loop.
                            _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower()
                            if _codex_resp_status in {"failed", "cancelled"}:
                                _codex_error_obj = getattr(response, "error", None)
                                _codex_error_msg = (
                                    _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict)
                                    else str(_codex_error_obj) if _codex_error_obj
                                    else f"Responses API returned status '{_codex_resp_status}'"
                                )
                                logging.warning(
                                    "Codex response status='%s' (error=%s). Routing to fallback. %s",
                                    _codex_resp_status, _codex_error_msg,
                                    agent._client_log_context(),
                                )
                                response_invalid = True
                                error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}")
                            else:
                                # output_text fallback: stream backfill may have failed
                                # but normalize can still recover from output_text
                                _out_text = getattr(response, "output_text", None)
                                _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else ""
                                if _out_text_stripped:
                                    logger.debug(
                                        "Codex response.output is empty but output_text is present "
                                        "(%d chars); deferring to normalization.",
                                        len(_out_text_stripped),
                                    )
                                else:
                                    _resp_status = getattr(response, "status", None)
                                    _resp_incomplete = getattr(response, "incomplete_details", None)
                                    logger.warning(
                                        "Codex response.output is empty after stream backfill "
                                        "(status=%s, incomplete_details=%s, model=%s). %s",
                                        _resp_status, _resp_incomplete,
                                        getattr(response, "model", None),
                                        f"api_mode={agent.api_mode} provider={agent.provider}",
                                    )
                                    response_invalid = True
                                    error_details.append("response.output is empty")
                elif agent.api_mode == "anthropic_messages":
                    _tv = agent._get_transport()
                    if not _tv.validate_response(response):
                        response_invalid = True
                        if response is None:
                            error_details.append("response is None")
                        else:
                            error_details.append("response.content invalid (not a non-empty list)")
                elif agent.api_mode == "bedrock_converse":
                    _btv = agent._get_transport()
                    if not _btv.validate_response(response):
                        response_invalid = True
                        if response is None:
                            error_details.append("response is None")
                        else:
                            error_details.append("Bedrock response invalid (no output or choices)")
                else:
                    _ctv = agent._get_transport()
                    if not _ctv.validate_response(response):
                        response_invalid = True
                        if response is None:
                            error_details.append("response is None")
                        elif not hasattr(response, 'choices'):
                            error_details.append("response has no 'choices' attribute")
                        elif response.choices is None:
                            error_details.append("response.choices is None")
                        else:
                            error_details.append("response.choices is empty")

                if response_invalid:
                    # Stop spinner before printing error messages
                    if thinking_spinner:
                        thinking_spinner.stop("(´;ω;`) oops, retrying...")
                        thinking_spinner = None
                    if agent.thinking_callback:
                        agent.thinking_callback("")

                    # Invalid response — could be rate limiting, provider timeout,
                    # upstream server error, or malformed response.
                    retry_count += 1

                    # Eager fallback: empty/malformed responses are a common
                    # rate-limit symptom.  Switch to fallback immediately
                    # rather than retrying with extended backoff.
                    if agent._fallback_index < len(agent._fallback_chain):
                        agent._emit_status("⚠️ Empty/malformed response — switching to fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
                        primary_recovery_attempted = False
                        continue

                    # Check for error field in response (some providers include this)
                    error_msg = "Unknown"
                    provider_name = "Unknown"
                    if response and hasattr(response, 'error') and response.error:
                        error_msg = str(response.error)
                        # Try to extract provider from error metadata
                        if hasattr(response.error, 'metadata') and response.error.metadata:
                            provider_name = response.error.metadata.get('provider_name', 'Unknown')
                    elif response and hasattr(response, 'message') and response.message:
                        error_msg = str(response.message)

                    # Try to get provider from model field (OpenRouter often returns actual model used)
                    if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model:
                        provider_name = f"model={response.model}"

                    # Check for x-openrouter-provider or similar metadata
                    if provider_name == "Unknown" and response:
                        # Log all response attributes for debugging
                        resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')}
                        if agent.verbose_logging:
                            logging.debug(f"Response attributes for invalid response: {resp_attrs}")

                    # Extract error code from response for contextual diagnostics
                    _resp_error_code = None
                    if response and hasattr(response, 'error') and response.error:
                        _code_raw = getattr(response.error, 'code', None)
                        if _code_raw is None and isinstance(response.error, dict):
                            _code_raw = response.error.get('code')
                        if _code_raw is not None:
                            try:
                                _resp_error_code = int(_code_raw)
                            except (TypeError, ValueError):
                                pass

                    # Build a human-readable failure hint from the error code
                    # and response time, instead of always assuming rate limiting.
                    if _resp_error_code == 524:
                        _failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)"
                    elif _resp_error_code == 504:
                        _failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)"
                    elif _resp_error_code == 429:
                        _failure_hint = f"rate limited by upstream provider (429)"
                    elif _resp_error_code in {500, 502}:
                        _failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)"
                    elif _resp_error_code in {503, 529}:
                        _failure_hint = f"upstream provider overloaded ({_resp_error_code})"
                    elif _resp_error_code is not None:
                        _failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)"
                    elif api_duration < 10:
                        _failure_hint = f"fast response ({api_duration:.1f}s) — likely rate limited"
                    elif api_duration > 60:
                        _failure_hint = f"slow response ({api_duration:.0f}s) — likely upstream timeout"
                    else:
                        _failure_hint = f"response time {api_duration:.1f}s"

                    agent._vprint(f"{agent.log_prefix}⚠️  Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True)
                    agent._vprint(f"{agent.log_prefix}   🏢 Provider: {provider_name}", force=True)
                    cleaned_provider_error = agent._clean_error_message(error_msg)
                    agent._vprint(f"{agent.log_prefix}   📝 Provider message: {cleaned_provider_error}", force=True)
                    agent._vprint(f"{agent.log_prefix}   ⏱️  {_failure_hint}", force=True)

                    if retry_count >= max_retries:
                        # Try fallback before giving up
                        agent._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
                        if agent._try_activate_fallback():
                            retry_count = 0
                            compression_attempts = 0
                            primary_recovery_attempted = False
                            continue
                        agent._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
                        logging.error(f"{agent.log_prefix}Invalid API response after {max_retries} retries.")
                        agent._persist_session(messages, conversation_history)
                        return {
                            "messages": messages,
                            "completed": False,
                            "api_calls": api_call_count,
                            "error": f"Invalid API response after {max_retries} retries: {_failure_hint}",
                            "failed": True  # Mark as failure for filtering
                        }

                    # Backoff before retry — jittered exponential: 5s base, 120s cap
                    wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0)
                    agent._vprint(f"{agent.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True)
                    logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")

                    # Sleep in small increments to stay responsive to interrupts
                    sleep_end = time.time() + wait_time
                    _backoff_touch_counter = 0
                    while time.time() < sleep_end:
                        if agent._interrupt_requested:
                            agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
                            agent._persist_session(messages, conversation_history)
                            agent.clear_interrupt()
                            return {
                                "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).",
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "interrupted": True,
                            }
                        time.sleep(0.2)
                        # Touch activity every ~30s so the gateway's inactivity
                        # monitor knows we're alive during backoff waits.
                        _backoff_touch_counter += 1
                        if _backoff_touch_counter % 150 == 0:  # 150 × 0.2s = 30s
                            agent._touch_activity(
                                f"retry backoff ({retry_count}/{max_retries}), "
                                f"{int(sleep_end - time.time())}s remaining"
                            )
                    continue  # Retry the API call

                # Check finish_reason before proceeding
                if agent.api_mode == "codex_responses":
                    status = getattr(response, "status", None)
                    incomplete_details = getattr(response, "incomplete_details", None)
                    incomplete_reason = None
                    if isinstance(incomplete_details, dict):
                        incomplete_reason = incomplete_details.get("reason")
                    else:
                        incomplete_reason = getattr(incomplete_details, "reason", None)
                    if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}:
                        finish_reason = "length"
                    else:
                        finish_reason = "stop"
                elif agent.api_mode == "anthropic_messages":
                    _tfr = agent._get_transport()
                    finish_reason = _tfr.map_finish_reason(response.stop_reason)
                elif agent.api_mode == "bedrock_converse":
                    # Bedrock response already normalized at dispatch — use transport
                    _bt_fr = agent._get_transport()
                    _bedrock_result = _bt_fr.normalize_response(response)
                    finish_reason = _bedrock_result.finish_reason
                else:
                    _cc_fr = agent._get_transport()
                    _finish_result = _cc_fr.normalize_response(response)
                    finish_reason = _finish_result.finish_reason
                    assistant_message = _finish_result
                    if agent._should_treat_stop_as_truncated(
                        finish_reason,
                        assistant_message,
                        messages,
                    ):
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  Treating suspicious Ollama/GLM stop response as truncated",
                            force=True,
                        )
                        finish_reason = "length"

                if finish_reason == "length":
                    agent._vprint(f"{agent.log_prefix}⚠️  Response truncated (finish_reason='length') - model hit max output tokens", force=True)

                    # Normalize the truncated response to a single OpenAI-style
                    # message shape so text-continuation and tool-call retry
                    # work uniformly across chat_completions, bedrock_converse,
                    # and anthropic_messages.  For Anthropic we use the same
                    # adapter the agent loop already relies on so the rebuilt
                    # interim assistant message is byte-identical to what
                    # would have been appended in the non-truncated path.
                    _trunc_msg = None
                    _trunc_transport = agent._get_transport()
                    if agent.api_mode == "anthropic_messages":
                        _trunc_result = _trunc_transport.normalize_response(
                            response, strip_tool_prefix=agent._is_anthropic_oauth
                        )
                    else:
                        _trunc_result = _trunc_transport.normalize_response(response)
                    _trunc_msg = _trunc_result

                    _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
                    _trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False

                    # ── Detect thinking-budget exhaustion ──────────────
                    # When the model spends ALL output tokens on reasoning
                    # and has none left for the response, continuation
                    # retries are pointless.  Detect this early and give a
                    # targeted error instead of wasting 3 API calls.
                    # A response is "thinking exhausted" only when the model
                    # actually produced reasoning blocks but no visible text after
                    # them.  Models that do not use <think> tags (e.g. GLM-4.7 on
                    # NVIDIA Build, minimax) may return content=None or an empty
                    # string for unrelated reasons — treat those as normal
                    # truncations that deserve continuation retries, not as
                    # thinking-budget exhaustion.
                    _has_think_tags = bool(
                        _trunc_content and re.search(
                            r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>',
                            _trunc_content,
                            re.IGNORECASE,
                        )
                    )
                    _thinking_exhausted = (
                        not _trunc_has_tool_calls
                        and _has_think_tags
                        and (
                            (_trunc_content is not None and not agent._has_content_after_think_block(_trunc_content))
                            or _trunc_content is None
                        )
                    )

                    if _thinking_exhausted:
                        _exhaust_error = (
                            "Model used all output tokens on reasoning with none left "
                            "for the response. Try lowering reasoning effort or "
                            "increasing max_tokens."
                        )
                        agent._vprint(
                            f"{agent.log_prefix}💭 Reasoning exhausted the output token budget — "
                            f"no visible response was produced.",
                            force=True,
                        )
                        # Return a user-friendly message as the response so
                        # CLI (response box) and gateway (chat message) both
                        # display it naturally instead of a suppressed error.
                        _exhaust_response = (
                            "⚠️ **Thinking Budget Exhausted**\n\n"
                            "The model used all its output tokens on reasoning "
                            "and had none left for the actual response.\n\n"
                            "To fix this:\n"
                            "→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n"
                            "→ Or switch to a larger/non-reasoning model with `/model`"
                        )
                        agent._cleanup_task_resources(effective_task_id)
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": _exhaust_response,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "partial": True,
                            "error": _exhaust_error,
                        }

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
                        if assistant_message is not None and not _trunc_has_tool_calls:
                            length_continue_retries += 1
                            interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
                            messages.append(interim_msg)
                            if assistant_message.content:
                                truncated_response_parts.append(assistant_message.content)

                            if length_continue_retries < 3:
                                agent._vprint(
                                    f"{agent.log_prefix}↻ Requesting continuation "
                                    f"({length_continue_retries}/3)..."
                                )
                                continue_msg = {
                                    "role": "user",
                                    "content": (
                                        "[System: Your previous response was truncated by the output "
                                        "length limit. Continue exactly where you left off. Do not "
                                        "restart or repeat prior text. Finish the answer directly.]"
                                    ),
                                }
                                messages.append(continue_msg)
                                agent._session_messages = messages
                                agent._save_session_log(messages)
                                restart_with_length_continuation = True
                                break

                            partial_response = agent._strip_think_blocks("".join(truncated_response_parts)).strip()
                            agent._cleanup_task_resources(effective_task_id)
                            agent._persist_session(messages, conversation_history)
                            return {
                                "final_response": partial_response or None,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
                                "error": "Response remained truncated after 3 continuation attempts",
                            }

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
                        if assistant_message is not None and _trunc_has_tool_calls:
                            if truncated_tool_call_retries < 1:
                                truncated_tool_call_retries += 1
                                agent._vprint(
                                    f"{agent.log_prefix}⚠️  Truncated tool call detected — retrying API call...",
                                    force=True,
                                )
                                # Don't append the broken response to messages;
                                # just re-run the same API call from the current
                                # message state, giving the model another chance.
                                continue
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
                                force=True,
                            )
                            agent._cleanup_task_resources(effective_task_id)
                            agent._persist_session(messages, conversation_history)
                            return {
                                "final_response": None,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
                                "error": "Response truncated due to output length limit",
                            }

                    # If we have prior messages, roll back to last complete state
                    if len(messages) > 1:
                        agent._vprint(f"{agent.log_prefix}   ⏪ Rolling back to last complete assistant turn")
                        rolled_back_messages = agent._get_messages_up_to_last_assistant(messages)

                        agent._cleanup_task_resources(effective_task_id)
                        agent._persist_session(messages, conversation_history)

                        return {
                            "final_response": None,
                            "messages": rolled_back_messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "partial": True,
                            "error": "Response truncated due to output length limit"
                        }
                    else:
                        # First message was truncated - mark as failed
                        agent._vprint(f"{agent.log_prefix}❌ First response truncated - cannot recover", force=True)
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": None,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "failed": True,
                            "error": "First response truncated due to output length limit"
                        }

                # Track actual token usage from response for context management
                if hasattr(response, 'usage') and response.usage:
                    canonical_usage = normalize_usage(
                        response.usage,
                        provider=agent.provider,
                        api_mode=agent.api_mode,
                    )
                    prompt_tokens = canonical_usage.prompt_tokens
                    completion_tokens = canonical_usage.output_tokens
                    total_tokens = canonical_usage.total_tokens
                    usage_dict = {
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens,
                        "total_tokens": total_tokens,
                    }
                    agent.context_compressor.update_from_response(usage_dict)

                    # Cache discovered context length after successful call.
                    # Only persist limits confirmed by the provider (parsed
                    # from the error message), not guessed probe tiers.
                    if getattr(agent.context_compressor, "_context_probed", False):
                        ctx = agent.context_compressor.context_length
                        if getattr(agent.context_compressor, "_context_probe_persistable", False):
                            save_context_length(agent.model, agent.base_url, ctx)
                            agent._safe_print(f"{agent.log_prefix}💾 Cached context length: {ctx:,} tokens for {agent.model}")
                        agent.context_compressor._context_probed = False
                        agent.context_compressor._context_probe_persistable = False

                    agent.session_prompt_tokens += prompt_tokens
                    agent.session_completion_tokens += completion_tokens
                    agent.session_total_tokens += total_tokens
                    agent.session_api_calls += 1
                    agent.session_input_tokens += canonical_usage.input_tokens
                    agent.session_output_tokens += canonical_usage.output_tokens
                    agent.session_cache_read_tokens += canonical_usage.cache_read_tokens
                    agent.session_cache_write_tokens += canonical_usage.cache_write_tokens
                    agent.session_reasoning_tokens += canonical_usage.reasoning_tokens

                    # Log API call details for debugging/observability
                    _cache_pct = ""
                    if canonical_usage.cache_read_tokens and prompt_tokens:
                        _cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)"
                    logger.info(
                        "API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s",
                        agent.session_api_calls, agent.model, agent.provider or "unknown",
                        prompt_tokens, completion_tokens, total_tokens,
                        api_duration, _cache_pct,
                    )

                    cost_result = estimate_usage_cost(
                        agent.model,
                        canonical_usage,
                        provider=agent.provider,
                        base_url=agent.base_url,
                        api_key=getattr(agent, "api_key", ""),
                    )
                    if cost_result.amount_usd is not None:
                        agent.session_estimated_cost_usd += float(cost_result.amount_usd)
                    agent.session_cost_status = cost_result.status
                    agent.session_cost_source = cost_result.source

                    # Persist token counts to session DB for /insights.
                    # Do this for every platform with a session_id so non-CLI
                    # sessions (gateway, cron, delegated runs) cannot lose
                    # token/accounting data if a higher-level persistence path
                    # is skipped or fails. Gateway/session-store writes use
                    # absolute totals, so they safely overwrite these per-call
                    # deltas instead of double-counting them.
                    if agent._session_db and agent.session_id:
                        try:
                            # Ensure the session row exists before attempting UPDATE.
                            # Under concurrent load (cron/kanban), the initial
                            # _ensure_db_session() may have failed due to SQLite
                            # locking.  Retry here so per-call token deltas are
                            # not silently lost (UPDATE on a non-existent row
                            # affects 0 rows without error).
                            if not agent._session_db_created:
                                agent._ensure_db_session()
                            agent._session_db.update_token_counts(
                                agent.session_id,
                                input_tokens=canonical_usage.input_tokens,
                                output_tokens=canonical_usage.output_tokens,
                                cache_read_tokens=canonical_usage.cache_read_tokens,
                                cache_write_tokens=canonical_usage.cache_write_tokens,
                                reasoning_tokens=canonical_usage.reasoning_tokens,
                                estimated_cost_usd=float(cost_result.amount_usd)
                                if cost_result.amount_usd is not None else None,
                                cost_status=cost_result.status,
                                cost_source=cost_result.source,
                                billing_provider=agent.provider,
                                billing_base_url=agent.base_url,
                                billing_mode="subscription_included"
                                if cost_result.status == "included" else None,
                                model=agent.model,
                                api_call_count=1,
                            )
                        except Exception as e:
                            # Log token persistence failures so they're
                            # visible in agent.log — silent loss here is
                            # the root cause of undercounted analytics.
                            logger.debug(
                                "Token persistence failed (session=%s, tokens=%d): %s",
                                agent.session_id, total_tokens, e,
                            )

                    if agent.verbose_logging:
                        logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")

                    # Surface cache hit stats for any provider that reports
                    # them — not just those where we inject cache_control
                    # markers.  OpenAI/Kimi/DeepSeek/Qwen all do automatic
                    # server-side prefix caching and return
                    # ``prompt_tokens_details.cached_tokens``; users
                    # previously could not see their cache % because this
                    # line was gated on ``_use_prompt_caching``, which is
                    # only True for Anthropic-style marker injection.
                    # ``canonical_usage`` is already normalised from all
                    # three API shapes (Anthropic / Codex / OpenAI-chat)
                    # so we can rely on its values directly.
                    cached = canonical_usage.cache_read_tokens
                    written = canonical_usage.cache_write_tokens
                    prompt = usage_dict["prompt_tokens"]
                    if (cached or written) and not agent.quiet_mode:
                        hit_pct = (cached / prompt * 100) if prompt > 0 else 0
                        agent._vprint(
                            f"{agent.log_prefix}   💾 Cache: "
                            f"{cached:,}/{prompt:,} tokens "
                            f"({hit_pct:.0f}% hit, {written:,} written)"
                        )

                has_retried_429 = False  # Reset on success
                # Clear Nous rate limit state on successful request —
                # proves the limit has reset and other sessions can
                # resume hitting Nous.
                if agent.provider == "nous":
                    try:
                        from agent.nous_rate_guard import clear_nous_rate_limit
                        clear_nous_rate_limit()
                    except Exception:
                        pass
                agent._touch_activity(f"API call #{api_call_count} completed")
                break  # Success, exit retry loop

            except InterruptedError:
                if thinking_spinner:
                    thinking_spinner.stop("")
                    thinking_spinner = None
                if agent.thinking_callback:
                    agent.thinking_callback("")
                api_elapsed = time.time() - api_start_time
                agent._vprint(f"{agent.log_prefix}⚡ Interrupted during API call.", force=True)
                agent._persist_session(messages, conversation_history)
                interrupted = True
                final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)."
                break

            except Exception as api_error:
                # Stop spinner before printing error messages
                if thinking_spinner:
                    thinking_spinner.stop("(╥_╥) error, retrying...")
                    thinking_spinner = None
                if agent.thinking_callback:
                    agent.thinking_callback("")

                # -----------------------------------------------------------
                # UnicodeEncodeError recovery.  Two common causes:
                #   1. Lone surrogates (U+D800..U+DFFF) from clipboard paste
                #      (Google Docs, rich-text editors) — sanitize and retry.
                #   2. ASCII codec on systems with LANG=C or non-UTF-8 locale
                #      (e.g. Chromebooks) — any non-ASCII character fails.
                #      Detect via the error message mentioning 'ascii' codec.
                # We sanitize messages in-place and may retry twice:
                # first to strip surrogates, then once more for pure
                # ASCII-only locale sanitization if needed.
                # -----------------------------------------------------------
                if isinstance(api_error, UnicodeEncodeError) and getattr(agent, '_unicode_sanitization_passes', 0) < 2:
                    _err_str = str(api_error).lower()
                    _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
                    # Detect surrogate errors — utf-8 codec refusing to
                    # encode U+D800..U+DFFF.  The error text is:
                    #   "'utf-8' codec can't encode characters in position
                    #    N-M: surrogates not allowed"
                    _is_surrogate_error = (
                        "surrogate" in _err_str
                        or ("'utf-8'" in _err_str and not _is_ascii_codec)
                    )
                    # Sanitize surrogates from both the canonical `messages`
                    # list AND `api_messages` (the API-copy, which may carry
                    # `reasoning_content`/`reasoning_details` transformed
                    # from `reasoning` — fields the canonical list doesn't
                    # have directly).  Also clean `api_kwargs` if built and
                    # `prefill_messages` if present.  Mirrors the ASCII
                    # codec recovery below.
                    _surrogates_found = _sanitize_messages_surrogates(messages)
                    if isinstance(api_messages, list):
                        if _sanitize_messages_surrogates(api_messages):
                            _surrogates_found = True
                    if isinstance(api_kwargs, dict):
                        if _sanitize_structure_surrogates(api_kwargs):
                            _surrogates_found = True
                    if isinstance(getattr(agent, "prefill_messages", None), list):
                        if _sanitize_messages_surrogates(agent.prefill_messages):
                            _surrogates_found = True
                    # Gate the retry on the error type, not on whether we
                    # found anything — _force_ascii_payload / the extended
                    # surrogate walker above cover all known paths, but a
                    # new transformed field could still slip through.  If
                    # the error was a surrogate encode failure, always let
                    # the retry run; the proactive sanitizer at line ~8781
                    # runs again on the next iteration.  Bounded by
                    # _unicode_sanitization_passes < 2 (outer guard).
                    if _surrogates_found or _is_surrogate_error:
                        agent._unicode_sanitization_passes += 1
                        if _surrogates_found:
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  Stripped invalid surrogate characters from messages. Retrying...",
                                force=True,
                            )
                        else:
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  Surrogate encoding error — retrying after full-payload sanitization...",
                                force=True,
                            )
                        continue
                    if _is_ascii_codec:
                        agent._force_ascii_payload = True
                        # ASCII codec: the system encoding can't handle
                        # non-ASCII characters at all. Sanitize all
                        # non-ASCII content from messages/tool schemas and retry.
                        # Sanitize both the canonical `messages` list and
                        # `api_messages` (the API-copy built before the retry
                        # loop, which may contain extra fields like
                        # reasoning_content that are not in `messages`).
                        _messages_sanitized = _sanitize_messages_non_ascii(messages)
                        if isinstance(api_messages, list):
                            _sanitize_messages_non_ascii(api_messages)
                        # Also sanitize the last api_kwargs if already built,
                        # so a leftover non-ASCII value in a transformed field
                        # (e.g. extra_body, reasoning_content) doesn't survive
                        # into the next attempt via _build_api_kwargs cache paths.
                        if isinstance(api_kwargs, dict):
                            _sanitize_structure_non_ascii(api_kwargs)
                        _prefill_sanitized = False
                        if isinstance(getattr(agent, "prefill_messages", None), list):
                            _prefill_sanitized = _sanitize_messages_non_ascii(agent.prefill_messages)

                        _tools_sanitized = False
                        if isinstance(getattr(agent, "tools", None), list):
                            _tools_sanitized = _sanitize_tools_non_ascii(agent.tools)

                        _system_sanitized = False
                        if isinstance(active_system_prompt, str):
                            _sanitized_system = _strip_non_ascii(active_system_prompt)
                            if _sanitized_system != active_system_prompt:
                                active_system_prompt = _sanitized_system
                                agent._cached_system_prompt = _sanitized_system
                                _system_sanitized = True
                        if isinstance(getattr(agent, "ephemeral_system_prompt", None), str):
                            _sanitized_ephemeral = _strip_non_ascii(agent.ephemeral_system_prompt)
                            if _sanitized_ephemeral != agent.ephemeral_system_prompt:
                                agent.ephemeral_system_prompt = _sanitized_ephemeral
                                _system_sanitized = True

                        _headers_sanitized = False
                        _default_headers = (
                            agent._client_kwargs.get("default_headers")
                            if isinstance(getattr(agent, "_client_kwargs", None), dict)
                            else None
                        )
                        if isinstance(_default_headers, dict):
                            _headers_sanitized = _sanitize_structure_non_ascii(_default_headers)

                        # Sanitize the API key — non-ASCII characters in
                        # credentials (e.g. ʋ instead of v from a bad
                        # copy-paste) cause httpx to fail when encoding
                        # the Authorization header as ASCII.  This is the
                        # most common cause of persistent UnicodeEncodeError
                        # that survives message/tool sanitization (#6843).
                        _credential_sanitized = False
                        _raw_key = getattr(agent, "api_key", None) or ""
                        # Entra ID bearer providers are callables — their
                        # minted JWTs are always ASCII, so no sanitization
                        # is needed (and ``_strip_non_ascii`` would crash
                        # on a callable input).
                        if _raw_key and isinstance(_raw_key, str):
                            _clean_key = _strip_non_ascii(_raw_key)
                            if _clean_key != _raw_key:
                                agent.api_key = _clean_key
                                if isinstance(getattr(agent, "_client_kwargs", None), dict):
                                    agent._client_kwargs["api_key"] = _clean_key
                                # Also update the live client — it holds its
                                # own copy of api_key which auth_headers reads
                                # dynamically on every request.
                                if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
                                    agent.client.api_key = _clean_key
                                _credential_sanitized = True
                                agent._vprint(
                                    f"{agent.log_prefix}⚠️  API key contained non-ASCII characters "
                                    f"(bad copy-paste?) — stripped them. If auth fails, "
                                    f"re-copy the key from your provider's dashboard.",
                                    force=True,
                                )

                        # Always retry on ASCII codec detection —
                        # _force_ascii_payload guarantees the full
                        # api_kwargs payload is sanitized on the
                        # next iteration (line ~8475).  Even when
                        # per-component checks above find nothing
                        # (e.g. non-ASCII only in api_messages'
                        # reasoning_content), the flag catches it.
                        # Bounded by _unicode_sanitization_passes < 2.
                        agent._unicode_sanitization_passes += 1
                        _any_sanitized = (
                            _messages_sanitized
                            or _prefill_sanitized
                            or _tools_sanitized
                            or _system_sanitized
                            or _headers_sanitized
                            or _credential_sanitized
                        )
                        if _any_sanitized:
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  System encoding is ASCII — stripped non-ASCII characters from request payload. Retrying...",
                                force=True,
                            )
                        else:
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  System encoding is ASCII — enabling full-payload sanitization for retry...",
                                force=True,
                            )
                        continue

                # ── Image-rejection recovery ──────────────────────────────
                # Some providers (mlx-lm, text-only endpoints, text-only
                # fallbacks on multimodal models) reject any message that
                # contains image_url content with a 4xx error like
                # "Only 'text' content type is supported."  On first hit,
                # strip all images from the message list, mark the session
                # as vision-unsupported, and retry with text only.
                #
                # Detection is best-effort English phrase matching — a
                # locale-translated or heavily-reworded upstream error
                # will bypass this guard and fall through to the normal
                # error handler.  Expand the phrase list when new
                # provider wordings are observed in the wild.
                _err_body = ""
                try:
                    _err_body = str(getattr(api_error, "body", None) or
                                    getattr(api_error, "message", None) or
                                    str(api_error))
                except Exception:
                    pass
                _err_status = getattr(api_error, "status_code", None)
                _IMAGE_REJECTION_PHRASES = (
                    "only 'text' content type is supported",
                    "only text content type is supported",
                    "image_url is not supported",
                    "image content is not supported",
                    "multimodal is not supported",
                    "multimodal content is not supported",
                    "multimodal input is not supported",
                    "vision is not supported",
                    "vision input is not supported",
                    "does not support images",
                    "does not support image input",
                    "does not support multimodal",
                    "does not support vision",
                    "model does not support image",
                    # ChatGPT-account Codex backend
                    # (https://chatgpt.com/backend-api/codex) rejects
                    # data:image/...base64 URLs in input_image fields
                    # with HTTP 400 "Invalid 'input[N].content[K].image_url'.
                    # Expected a valid URL, but got a value with an
                    # invalid format." The OpenAI Responses API on the
                    # public endpoint accepts data URLs, but the
                    # ChatGPT-account variant does not. Without this
                    # phrase the agent cascaded into compression /
                    # context-too-large recovery instead of just
                    # stripping the images. Match is narrow on
                    # purpose — keyed on the field-path apostrophe so
                    # we don't false-trip on other URL validation
                    # errors. (issue #23570)
                    "image_url'. expected",
                    # DeepSeek's OpenAI-compatible API reports text-only
                    # request-body variants as:
                    # "unknown variant `image_url`, expected `text`".
                    "unknown variant `image_url`, expected `text`",
                    "unknown variant image_url, expected text",
                )
                _err_lower = _err_body.lower()
                _looks_like_image_rejection = any(
                    p in _err_lower for p in _IMAGE_REJECTION_PHRASES
                )
                # 4xx-only gate: never interpret 5xx/timeout as "server
                # said no to images" — those are transient and must
                # route to the normal retry path.
                _status_ok = _err_status is None or (400 <= int(_err_status) < 500)
                if (
                    getattr(agent, "_vision_supported", True)
                    and _looks_like_image_rejection
                    and _status_ok
                ):
                    agent._vision_supported = False
                    _imgs_removed = _strip_images_from_messages(messages)
                    if isinstance(api_messages, list):
                        _strip_images_from_messages(api_messages)
                    agent._vprint(
                        f"{agent.log_prefix}⚠️  Server rejected image content — "
                        f"switching to text-only mode for this session"
                        + (". Stripped images from history and retrying." if _imgs_removed else "."),
                        force=True,
                    )
                    continue

                status_code = getattr(api_error, "status_code", None)
                error_context = agent._extract_api_error_context(api_error)

                # ── Classify the error for structured recovery decisions ──
                _compressor = getattr(agent, "context_compressor", None)
                _ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000
                classified = classify_api_error(
                    api_error,
                    provider=getattr(agent, "provider", "") or "",
                    model=getattr(agent, "model", "") or "",
                    approx_tokens=approx_tokens,
                    context_length=_ctx_len,
                    num_messages=len(api_messages) if api_messages else 0,
                )
                logger.debug(
                    "Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s",
                    classified.reason.value, classified.status_code,
                    classified.retryable, classified.should_compress,
                    classified.should_rotate_credential, classified.should_fallback,
                )

                recovered_with_pool, has_retried_429 = agent._recover_with_credential_pool(
                    status_code=status_code,
                    has_retried_429=has_retried_429,
                    classified_reason=classified.reason,
                    error_context=error_context,
                )
                if recovered_with_pool:
                    continue

                # Image-too-large recovery: shrink oversized native image
                # parts in-place and retry once.  Triggered by Anthropic's
                # per-image 5 MB ceiling (400 with "image exceeds 5 MB
                # maximum") or any other provider that complains about
                # image size.  If shrink fails or a second attempt still
                # fails, fall through to normal error handling.
                if (
                    classified.reason == FailoverReason.image_too_large
                    and not image_shrink_retry_attempted
                ):
                    image_shrink_retry_attempted = True
                    if agent._try_shrink_image_parts_in_messages(api_messages):
                        agent._vprint(
                            f"{agent.log_prefix}📐 Image(s) exceeded provider size limit — "
                            f"shrank and retrying...",
                            force=True,
                        )
                        continue
                    else:
                        logger.info(
                            "image-shrink recovery: no data-URL image parts found "
                            "or shrink didn't reduce size; surfacing original error."
                        )

                # Anthropic OAuth subscription rejected the 1M-context beta
                # header ("long context beta is not yet available for this
                # subscription"). Disable the beta for the rest of this
                # session, rebuild the client, and retry once.  1M-capable
                # subscriptions never hit this branch — they accept the
                # beta and keep full 1M context.  See PR #17680 for the
                # original report (we chose reactive recovery over the
                # proposed unconditional omit so capable subscriptions
                # don't silently lose the capability).
                if (
                    classified.reason == FailoverReason.oauth_long_context_beta_forbidden
                    and agent.api_mode == "anthropic_messages"
                    and agent._is_anthropic_oauth
                    and not oauth_1m_beta_retry_attempted
                ):
                    oauth_1m_beta_retry_attempted = True
                    if not getattr(agent, "_oauth_1m_beta_disabled", False):
                        agent._oauth_1m_beta_disabled = True
                        try:
                            agent._anthropic_client.close()
                        except Exception:
                            pass
                        agent._rebuild_anthropic_client()
                        agent._vprint(
                            f"{agent.log_prefix}🔕 OAuth subscription doesn't support "
                            f"the 1M-context beta — disabled for this session and retrying...",
                            force=True,
                        )
                        continue

                if (
                    agent.api_mode == "codex_responses"
                    and agent.provider in {"openai-codex", "xai-oauth"}
                    and status_code == 401
                    and not codex_auth_retry_attempted
                ):
                    codex_auth_retry_attempted = True
                    if agent._try_refresh_codex_client_credentials(force=True):
                        _label = "xAI OAuth" if agent.provider == "xai-oauth" else "Codex"
                        agent._vprint(f"{agent.log_prefix}🔐 {_label} auth refreshed after 401. Retrying request...")
                        continue
                if (
                    agent.api_mode == "chat_completions"
                    and agent.provider == "nous"
                    and status_code == 401
                    and not nous_auth_retry_attempted
                ):
                    nous_auth_retry_attempted = True
                    if agent._try_refresh_nous_client_credentials(force=True):
                        print(f"{agent.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...")
                        continue
                    # Credential refresh didn't help — show diagnostic info.
                    # Most common causes: Portal OAuth expired/revoked,
                    # account out of credits, or agent key blocked.
                    from hermes_constants import display_hermes_home as _dhh_fn
                    _dhh = _dhh_fn()
                    _body_text = ""
                    try:
                        _body = getattr(api_error, "body", None) or getattr(api_error, "response", None)
                        if _body is not None:
                            _body_text = str(_body)[:200]
                    except Exception:
                        pass
                    print(f"{agent.log_prefix}🔐 Nous 401 — Portal authentication failed.")
                    if _body_text:
                        print(f"{agent.log_prefix}   Response: {_body_text}")
                    print(f"{agent.log_prefix}   Most likely: Portal OAuth expired, account out of credits, or agent key revoked.")
                    print(f"{agent.log_prefix}   Troubleshooting:")
                    print(f"{agent.log_prefix}     • Re-authenticate: hermes login --provider nous")
                    print(f"{agent.log_prefix}     • Check credits / billing: https://portal.nousresearch.com")
                    print(f"{agent.log_prefix}     • Verify stored credentials: {_dhh}/auth.json")
                    print(f"{agent.log_prefix}     • Switch providers temporarily: /model <model> --provider openrouter")
                if (
                    agent.provider == "copilot"
                    and status_code == 401
                    and not copilot_auth_retry_attempted
                ):
                    copilot_auth_retry_attempted = True
                    if agent._try_refresh_copilot_client_credentials():
                        agent._vprint(f"{agent.log_prefix}🔐 Copilot credentials refreshed after 401. Retrying request...")
                        continue
                if (
                    agent.api_mode == "anthropic_messages"
                    and status_code == 401
                    and hasattr(agent, '_anthropic_api_key')
                    and not anthropic_auth_retry_attempted
                ):
                    anthropic_auth_retry_attempted = True
                    from agent.anthropic_adapter import _is_oauth_token
                    from agent.azure_identity_adapter import is_token_provider
                    if agent._try_refresh_anthropic_client_credentials():
                        print(f"{agent.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...")
                        continue
                    # Credential refresh didn't help — show diagnostic info
                    key = agent._anthropic_api_key
                    print(f"{agent.log_prefix}🔐 Anthropic 401 — authentication failed.")
                    if is_token_provider(key):
                        # Azure Foundry Entra ID — the bearer token is
                        # minted per-request by an httpx event hook on a
                        # custom http_client passed to the SDK. The 401
                        # means Azure rejected the JWT (RBAC role missing,
                        # az login expired, IMDS unreachable, etc.).
                        print(f"{agent.log_prefix}   Auth method: Microsoft Entra ID (httpx event hook)")
                        print(f"{agent.log_prefix}   Run `hermes doctor` for credential-chain diagnostics, or")
                        print(f"{agent.log_prefix}   `az login` if your developer session expired.")
                    else:
                        auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)"
                        print(f"{agent.log_prefix}   Auth method: {auth_method}")
                        print(f"{agent.log_prefix}   Token prefix: {key[:12]}..." if isinstance(key, str) and len(key) > 12 else f"{agent.log_prefix}   Token: (empty or short)")
                    print(f"{agent.log_prefix}   Troubleshooting:")
                    from hermes_constants import display_hermes_home as _dhh_fn
                    _dhh = _dhh_fn()
                    print(f"{agent.log_prefix}     • Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens")
                    print(f"{agent.log_prefix}     • Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values")
                    print(f"{agent.log_prefix}     • For API keys: verify at https://platform.claude.com/settings/keys")
                    print(f"{agent.log_prefix}     • For Claude Code: run 'claude /login' to refresh, then retry")
                    print(f"{agent.log_prefix}     • Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"")
                    print(f"{agent.log_prefix}     • Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"")

                # ── Thinking block signature recovery ─────────────────
                # Anthropic signs thinking blocks against the full turn
                # content.  Any upstream mutation (context compression,
                # session truncation, message merging) invalidates the
                # signature → HTTP 400.  Recovery: strip reasoning_details
                # from all messages so the next retry sends no thinking
                # blocks at all.  One-shot — don't retry infinitely.
                if (
                    classified.reason == FailoverReason.thinking_signature
                    and not thinking_sig_retry_attempted
                ):
                    thinking_sig_retry_attempted = True
                    for _m in messages:
                        if isinstance(_m, dict):
                            _m.pop("reasoning_details", None)
                    agent._vprint(
                        f"{agent.log_prefix}⚠️  Thinking block signature invalid — "
                        f"stripped all thinking blocks, retrying...",
                        force=True,
                    )
                    logging.warning(
                        "%sThinking block signature recovery: stripped "
                        "reasoning_details from %d messages",
                        agent.log_prefix, len(messages),
                    )
                    continue

                # ── llama.cpp grammar-parse recovery ──────────────────
                # llama.cpp's ``json-schema-to-grammar`` converter rejects
                # regex escape classes (``\d``, ``\w``, ``\s``) and most
                # ``format`` values in tool schemas.  MCP servers emit
                # these routinely for date/phone/email params.  Recovery:
                # strip ``pattern``/``format`` from ``agent.tools`` and
                # retry once.  We keep the keywords by default so cloud
                # providers get the full prompting hints; this branch
                # fires only for users on llama.cpp's OAI server.
                if (
                    classified.reason == FailoverReason.llama_cpp_grammar_pattern
                    and not llama_cpp_grammar_retry_attempted
                ):
                    llama_cpp_grammar_retry_attempted = True
                    try:
                        from tools.schema_sanitizer import strip_pattern_and_format
                        _, _stripped = strip_pattern_and_format(agent.tools)
                    except Exception as _strip_exc:  # pragma: no cover — defensive
                        logging.warning(
                            "%sllama.cpp grammar recovery: strip helper failed: %s",
                            agent.log_prefix, _strip_exc,
                        )
                        _stripped = 0
                    if _stripped:
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  llama.cpp rejected tool schema grammar — "
                            f"stripped {_stripped} pattern/format keyword(s), retrying...",
                            force=True,
                        )
                        logging.warning(
                            "%sllama.cpp grammar recovery: stripped %d "
                            "pattern/format keyword(s) from tool schemas",
                            agent.log_prefix, _stripped,
                        )
                        continue
                    # No keywords found to strip — fall through to normal
                    # retry path rather than loop forever on the same error.
                    logging.warning(
                        "%sllama.cpp grammar error but no pattern/format "
                        "keywords to strip — falling through to normal retry",
                        agent.log_prefix,
                    )

                retry_count += 1
                elapsed_time = time.time() - api_start_time
                agent._touch_activity(
                    f"API error recovery (attempt {retry_count}/{max_retries})"
                )

                error_type = type(api_error).__name__
                error_msg = str(api_error).lower()
                _error_summary = agent._summarize_api_error(api_error)
                logger.warning(
                    "API call failed (attempt %s/%s) error_type=%s %s summary=%s",
                    retry_count,
                    max_retries,
                    error_type,
                    agent._client_log_context(),
                    _error_summary,
                )

                _provider = getattr(agent, "provider", "unknown")
                _base = getattr(agent, "base_url", "unknown")
                _model = getattr(agent, "model", "unknown")
                _status_code_str = f" [HTTP {status_code}]" if status_code else ""
                agent._vprint(f"{agent.log_prefix}⚠️  API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True)
                agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
                agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
                agent._vprint(f"{agent.log_prefix}   📝 Error: {_error_summary}", force=True)
                if status_code and status_code < 500:
                    _err_body = getattr(api_error, "body", None)
                    _err_body_str = str(_err_body)[:300] if _err_body else None
                    if _err_body_str:
                        agent._vprint(f"{agent.log_prefix}   📋 Details: {_err_body_str}", force=True)
                agent._vprint(f"{agent.log_prefix}   ⏱️  Elapsed: {elapsed_time:.2f}s  Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens")

                # Actionable hint for OpenRouter "no tool endpoints" error.
                # This fires regardless of whether fallback succeeds — the
                # user needs to know WHY their model failed so they can fix
                # their provider routing, not just silently fall back.
                if (
                    agent._is_openrouter_url()
                    and "support tool use" in error_msg
                ):
                    agent._vprint(
                        f"{agent.log_prefix}   💡 No OpenRouter providers for {_model} support tool calling with your current settings.",
                        force=True,
                    )
                    if agent.providers_allowed:
                        agent._vprint(
                            f"{agent.log_prefix}      Your provider_routing.only restriction is filtering out tool-capable providers.",
                            force=True,
                        )
                        agent._vprint(
                            f"{agent.log_prefix}      Try removing the restriction or adding providers that support tools for this model.",
                            force=True,
                        )
                    agent._vprint(
                        f"{agent.log_prefix}      Check which providers support tools: https://openrouter.ai/models/{_model}",
                        force=True,
                    )

                # Check for interrupt before deciding to retry
                if agent._interrupt_requested:
                    agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
                    agent._persist_session(messages, conversation_history)
                    agent.clear_interrupt()
                    return {
                        "final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).",
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
                        "interrupted": True,
                    }

                # Check for 413 payload-too-large BEFORE generic 4xx handler.
                # A 413 is a payload-size error — the correct response is to
                # compress history and retry, not abort immediately.
                status_code = getattr(api_error, "status_code", None)

                # ── Anthropic Sonnet long-context tier gate ───────────
                # Anthropic returns HTTP 429 "Extra usage is required for
                # long context requests" when a Claude Max (or similar)
                # subscription doesn't include the 1M-context tier.  This
                # is NOT a transient rate limit — retrying or switching
                # credentials won't help.  Reduce context to 200k (the
                # standard tier) and compress.
                if classified.reason == FailoverReason.long_context_tier:
                    _reduced_ctx = 200000
                    compressor = agent.context_compressor
                    old_ctx = compressor.context_length
                    if old_ctx > _reduced_ctx:
                        compressor.update_model(
                            model=agent.model,
                            context_length=_reduced_ctx,
                            base_url=agent.base_url,
                            api_key=getattr(agent, "api_key", ""),
                            provider=agent.provider,
                        )
                        # Context probing flags — only set on built-in
                        # compressor (plugin engines manage their own).
                        if hasattr(compressor, "_context_probed"):
                            compressor._context_probed = True
                            # Don't persist — this is a subscription-tier
                            # limitation, not a model capability.  If the
                            # user later enables extra usage the 1M limit
                            # should come back automatically.
                            compressor._context_probe_persistable = False
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  Anthropic long-context tier "
                            f"requires extra usage — reducing context: "
                            f"{old_ctx:,} → {_reduced_ctx:,} tokens",
                            force=True,
                        )

                    compression_attempts += 1
                    if compression_attempts <= max_compression_attempts:
                        original_len = len(messages)
                        messages, active_system_prompt = agent._compress_context(
                            messages, system_message,
                            approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )
                        # Compression created a new session — clear history
                        # so _flush_messages_to_session_db writes compressed
                        # messages to the new session, not skipping them.
                        conversation_history = None
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
                            agent._emit_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
                                f"(was {old_ctx:,}), retrying..."
                            )
                            time.sleep(2)
                            restart_with_compressed_messages = True
                            break
                    # Fall through to normal error handling if compression
                    # is exhausted or didn't help.

                # Eager fallback for rate-limit errors (429 or quota exhaustion).
                # When a fallback model is configured, switch immediately instead
                # of burning through retries with exponential backoff -- the
                # primary provider won't recover within the retry window.
                is_rate_limited = classified.reason in {
                    FailoverReason.rate_limit,
                    FailoverReason.billing,
                }
                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
                    # Don't eagerly fallback if credential pool rotation may
                    # still recover.  See _pool_may_recover_from_rate_limit
                    # for the single-credential-pool and CloudCode-quota
                    # exceptions.  Fixes #11314 and #13636.
                    pool_may_recover = _ra()._pool_may_recover_from_rate_limit(
                        agent._credential_pool,
                        provider=agent.provider,
                        base_url=getattr(agent, "base_url", None),
                    )
                    if not pool_may_recover:
                        agent._emit_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
                            retry_count = 0
                            compression_attempts = 0
                            primary_recovery_attempted = False
                            continue

                # ── Nous Portal: record rate limit & skip retries ─────
                # When Nous returns a 429 that is a genuine account-
                # level rate limit, record the reset time to a shared
                # file so ALL sessions (cron, gateway, auxiliary) know
                # not to pile on, then skip further retries -- each
                # one burns another RPH request and deepens the hole.
                # The retry loop's top-of-iteration guard will catch
                # this on the next pass and try fallback or bail.
                #
                # IMPORTANT: Nous Portal multiplexes multiple upstream
                # providers (DeepSeek, Kimi, MiMo, Hermes).  A 429 can
                # also mean an UPSTREAM provider is out of capacity
                # for one specific model -- transient, clears in
                # seconds, nothing to do with the caller's quota.
                # Tripping the cross-session breaker on that would
                # block every Nous model for minutes.  We use
                # ``is_genuine_nous_rate_limit`` to tell the two
                # apart via the 429's own x-ratelimit-* headers and
                # the last-known-good state captured on the previous
                # successful response.
                if (
                    is_rate_limited
                    and agent.provider == "nous"
                    and classified.reason == FailoverReason.rate_limit
                    and not recovered_with_pool
                ):
                    _genuine_nous_rate_limit = False
                    try:
                        from agent.nous_rate_guard import (
                            is_genuine_nous_rate_limit,
                            record_nous_rate_limit,
                        )
                        _err_resp = getattr(api_error, "response", None)
                        _err_hdrs = (
                            getattr(_err_resp, "headers", None)
                            if _err_resp else None
                        )
                        _genuine_nous_rate_limit = is_genuine_nous_rate_limit(
                            headers=_err_hdrs,
                            last_known_state=agent._rate_limit_state,
                        )
                        if _genuine_nous_rate_limit:
                            record_nous_rate_limit(
                                headers=_err_hdrs,
                                error_context=error_context,
                            )
                        else:
                            logging.info(
                                "Nous 429 looks like upstream capacity "
                                "(no exhausted bucket in headers or "
                                "last-known state) -- not tripping "
                                "cross-session breaker."
                            )
                    except Exception:
                        pass
                    if _genuine_nous_rate_limit:
                        # Skip straight to max_retries -- the
                        # top-of-loop guard will handle fallback or
                        # bail cleanly.
                        retry_count = max_retries
                        continue
                    # Upstream capacity 429: fall through to normal
                    # retry logic.  A different model (or the same
                    # model a moment later) will typically succeed.

                is_payload_too_large = (
                    classified.reason == FailoverReason.payload_too_large
                )

                # Actionable hint for GitHub Models (Azure) 413 errors.
                # The free tier enforces a hard 8K token cap per request,
                # which Hermes' system prompt + tool schemas alone exceed.
                # Compression can't help — the floor is the system prompt
                # itself, not the conversation — so surface a clear "not
                # compatible" message instead of looping into three futile
                # compression attempts.
                if (
                    status_code == 413
                    and isinstance(agent.base_url, str)
                    and "models.inference.ai.azure.com" in agent.base_url
                ):
                    agent._vprint(
                        f"{agent.log_prefix}   💡 GitHub Models free tier (models.inference.ai.azure.com) caps every",
                        force=True,
                    )
                    agent._vprint(
                        f"{agent.log_prefix}      request at ~8K tokens. Hermes' system prompt + tool schemas baseline",
                        force=True,
                    )
                    agent._vprint(
                        f"{agent.log_prefix}      exceeds that floor, so this endpoint cannot run an agentic loop.",
                        force=True,
                    )
                    agent._vprint(
                        f"{agent.log_prefix}      Use the `copilot` provider with a Copilot subscription token (`hermes",
                        force=True,
                    )
                    agent._vprint(
                        f"{agent.log_prefix}      setup` → GitHub Copilot), or pick any other provider.",
                        force=True,
                    )

                if is_payload_too_large:
                    compression_attempts += 1
                    if compression_attempts > max_compression_attempts:
                        agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logging.error(f"{agent.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
                        agent._persist_session(messages, conversation_history)
                        return {
                            "messages": messages,
                            "completed": False,
                            "api_calls": api_call_count,
                            "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
                            "partial": True,
                            "failed": True,
                            "compression_exhausted": True,
                        }
                    agent._emit_status(f"⚠️  Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")

                    original_len = len(messages)
                    messages, active_system_prompt = agent._compress_context(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
                    # Compression created a new session — clear history
                    # so _flush_messages_to_session_db writes compressed
                    # messages to the new session, not skipping them.
                    conversation_history = None

                    if len(messages) < original_len:
                        agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
                        time.sleep(2)  # Brief pause between compression retries
                        restart_with_compressed_messages = True
                        break
                    else:
                        agent._vprint(f"{agent.log_prefix}❌ Payload too large and cannot compress further.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logging.error(f"{agent.log_prefix}413 payload too large. Cannot compress further.")
                        agent._persist_session(messages, conversation_history)
                        return {
                            "messages": messages,
                            "completed": False,
                            "api_calls": api_call_count,
                            "error": "Request payload too large (413). Cannot compress further.",
                            "partial": True,
                            "failed": True,
                            "compression_exhausted": True,
                        }

                # Check for context-length errors BEFORE generic 4xx handler.
                # The classifier detects context overflow from: explicit error
                # messages, generic 400 + large session heuristic (#1630), and
                # server disconnect + large session pattern (#2153).
                is_context_length_error = (
                    classified.reason == FailoverReason.context_overflow
                )

                if is_context_length_error:
                    compressor = agent.context_compressor
                    old_ctx = compressor.context_length

                    # ── Distinguish two very different errors ───────────
                    # 1. "Prompt too long": the INPUT exceeds the context window.
                    #    Fix: reduce context_length + compress history.
                    # 2. "max_tokens too large": input is fine, but
                    #    input_tokens + requested max_tokens > context_window.
                    #    Fix: reduce max_tokens (the OUTPUT cap) for this call.
                    #    Do NOT shrink context_length — the window is unchanged.
                    #
                    # Note: max_tokens = output token cap (one response).
                    #       context_length = total window (input + output combined).
                    available_out = parse_available_output_tokens_from_error(error_msg)
                    if available_out is not None:
                        # Error is purely about the output cap being too large.
                        # Cap output to the available space and retry without
                        # touching context_length or triggering compression.
                        safe_out = max(1, available_out - 64)  # small safety margin
                        agent._ephemeral_max_output_tokens = safe_out
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  Output cap too large for current prompt — "
                            f"retrying with max_tokens={safe_out:,} "
                            f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})",
                            force=True,
                        )
                        # Still count against compression_attempts so we don't
                        # loop forever if the error keeps recurring.
                        compression_attempts += 1
                        if compression_attempts > max_compression_attempts:
                            agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
                            agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                            logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
                            agent._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
                                "partial": True,
                                "failed": True,
                                "compression_exhausted": True,
                            }
                        restart_with_compressed_messages = True
                        break

                    # Error is about the INPUT being too large — reduce context_length.
                    # Try to parse the actual limit from the error message
                    parsed_limit = parse_context_limit_from_error(error_msg)
                    _provider_lower = (getattr(agent, "provider", "") or "").lower()
                    _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
                    is_minimax_provider = (
                        _provider_lower in {"minimax", "minimax-cn"}
                        or _base_lower.startswith((
                            "https://api.minimax.io/anthropic",
                            "https://api.minimaxi.com/anthropic",
                        ))
                    )
                    minimax_delta_only_overflow = (
                        is_minimax_provider
                        and parsed_limit is None
                        and "context window exceeds limit (" in error_msg
                    )
                    if parsed_limit and parsed_limit < old_ctx:
                        new_ctx = parsed_limit
                        agent._vprint(f"{agent.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
                    elif minimax_delta_only_overflow:
                        new_ctx = old_ctx
                        agent._vprint(
                            f"{agent.log_prefix}Provider reported overflow amount only; "
                            f"keeping context_length at {old_ctx:,} tokens and compressing.",
                            force=True,
                        )
                    else:
                        # Step down to the next probe tier
                        new_ctx = get_next_probe_tier(old_ctx)

                    if new_ctx and new_ctx < old_ctx:
                        compressor.update_model(
                            model=agent.model,
                            context_length=new_ctx,
                            base_url=agent.base_url,
                            api_key=getattr(agent, "api_key", ""),
                            provider=agent.provider,
                        )
                        # Context probing flags — only set on built-in
                        # compressor (plugin engines manage their own).
                        if hasattr(compressor, "_context_probed"):
                            compressor._context_probed = True
                            # Only persist limits parsed from the provider's
                            # error message (a real number).  Guessed fallback
                            # tiers from get_next_probe_tier() should stay
                            # in-memory only — persisting them pollutes the
                            # cache with wrong values.
                            compressor._context_probe_persistable = bool(
                                parsed_limit and parsed_limit == new_ctx
                            )
                        agent._vprint(f"{agent.log_prefix}⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True)
                    else:
                        agent._vprint(f"{agent.log_prefix}⚠️  Context length exceeded at minimum tier — attempting compression...", force=True)

                    compression_attempts += 1
                    if compression_attempts > max_compression_attempts:
                        agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
                        agent._persist_session(messages, conversation_history)
                        return {
                            "messages": messages,
                            "completed": False,
                            "api_calls": api_call_count,
                            "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
                            "partial": True,
                            "failed": True,
                            "compression_exhausted": True,
                        }
                    agent._emit_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...")

                    original_len = len(messages)
                    messages, active_system_prompt = agent._compress_context(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
                    # Compression created a new session — clear history
                    # so _flush_messages_to_session_db writes compressed
                    # messages to the new session, not skipping them.
                    conversation_history = None

                    if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
                        if len(messages) < original_len:
                            agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
                        time.sleep(2)  # Brief pause between compression retries
                        restart_with_compressed_messages = True
                        break
                    else:
                        # Can't compress further and already at minimum tier
                        agent._vprint(f"{agent.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True)
                        logging.error(f"{agent.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
                        agent._persist_session(messages, conversation_history)
                        return {
                            "messages": messages,
                            "completed": False,
                            "api_calls": api_call_count,
                            "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
                            "partial": True,
                            "failed": True,
                            "compression_exhausted": True,
                        }

                # Check for non-retryable client errors.  The classifier
                # already accounts for 413, 429, 529 (transient), context
                # overflow, and generic-400 heuristics.  Local validation
                # errors (ValueError, TypeError) are programming bugs.
                # Exclude UnicodeEncodeError — it's a ValueError subclass
                # but is handled separately by the surrogate sanitization
                # path above.  Exclude json.JSONDecodeError — also a
                # ValueError subclass, but it indicates a transient
                # provider/network failure (malformed response body,
                # truncated stream, routing layer corruption), not a
                # local programming bug, and should be retried (#14782).
                is_local_validation_error = (
                    isinstance(api_error, (ValueError, TypeError))
                    and not isinstance(
                        api_error, (UnicodeEncodeError, json.JSONDecodeError)
                    )
                    # ssl.SSLError (and its subclass SSLCertVerificationError)
                    # inherits from OSError *and* ValueError via Python MRO,
                    # so the isinstance(ValueError) check above would
                    # misclassify a TLS transport failure as a local
                    # programming bug and abort without retrying.  Exclude
                    # ssl.SSLError explicitly so the error classifier's
                    # retryable=True mapping takes effect instead.
                    and not isinstance(api_error, ssl.SSLError)
                )
                is_client_error = (
                    is_local_validation_error
                    or (
                        not classified.retryable
                        and not classified.should_compress
                        and classified.reason not in {
                            FailoverReason.rate_limit,
                            FailoverReason.billing,
                            FailoverReason.overloaded,
                            FailoverReason.context_overflow,
                            FailoverReason.payload_too_large,
                            FailoverReason.long_context_tier,
                            FailoverReason.thinking_signature,
                        }
                    )
                ) and not is_context_length_error

                if is_client_error:
                    # Try fallback before aborting — a different provider
                    # may not have the same issue (rate limit, auth, etc.)
                    agent._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
                        primary_recovery_attempted = False
                        continue
                    if api_kwargs is not None:
                        agent._dump_api_request_debug(
                            api_kwargs, reason="non_retryable_client_error", error=api_error,
                        )
                    agent._emit_status(
                        f"❌ Non-retryable error (HTTP {status_code}): "
                        f"{agent._summarize_api_error(api_error)}"
                    )
                    agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
                    agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
                    agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
                    # Actionable guidance for common auth errors
                    if classified.is_auth or classified.reason == FailoverReason.billing:
                        if _provider in {"openai-codex", "xai-oauth"} and status_code == 401:
                            if _provider == "openai-codex":
                                agent._vprint(f"{agent.log_prefix}   💡 Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True)
                                agent._vprint(f"{agent.log_prefix}      refreshed by another client (Codex CLI, VS Code). To fix:", force=True)
                                agent._vprint(f"{agent.log_prefix}      1. Run `codex` in your terminal to generate fresh tokens.", force=True)
                                agent._vprint(f"{agent.log_prefix}      2. Then run `hermes auth` to re-authenticate.", force=True)
                            else:
                                agent._vprint(f"{agent.log_prefix}   💡 xAI OAuth token was rejected (HTTP 401). To fix:", force=True)
                                agent._vprint(f"{agent.log_prefix}      re-authenticate with xAI Grok OAuth (SuperGrok Subscription) from `hermes model`.", force=True)
                        else:
                            agent._vprint(f"{agent.log_prefix}   💡 Your API key was rejected by the provider. Check:", force=True)
                            agent._vprint(f"{agent.log_prefix}      • Is the key valid? Run: hermes setup", force=True)
                            agent._vprint(f"{agent.log_prefix}      • Does your account have access to {_model}?", force=True)
                            if base_url_host_matches(str(_base), "openrouter.ai"):
                                agent._vprint(f"{agent.log_prefix}      • Check credits: https://openrouter.ai/settings/credits", force=True)
                    else:
                        agent._vprint(f"{agent.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
                    logging.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
                    # Skip session persistence when the error is likely
                    # context-overflow related (status 400 + large session).
                    # Persisting the failed user message would make the
                    # session even larger, causing the same failure on the
                    # next attempt. (#1630)
                    if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80):
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  Skipping session persistence "
                            f"for large failed session to prevent growth loop.",
                            force=True,
                        )
                    else:
                        agent._persist_session(messages, conversation_history)
                    return {
                        "final_response": None,
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
                        "failed": True,
                        "error": str(api_error),
                    }

                if retry_count >= max_retries:
                    # Before falling back, try rebuilding the primary
                    # client once for transient transport errors (stale
                    # connection pool, TCP reset).  Only attempted once
                    # per API call block.
                    if not primary_recovery_attempted and agent._try_recover_primary_transport(
                        api_error, retry_count=retry_count, max_retries=max_retries,
                    ):
                        primary_recovery_attempted = True
                        retry_count = 0
                        continue
                    # Try fallback before giving up entirely
                    agent._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
                        primary_recovery_attempted = False
                        continue
                    _final_summary = agent._summarize_api_error(api_error)
                    if is_rate_limited:
                        agent._emit_status(f"❌ Rate limited after {max_retries} retries — {_final_summary}")
                    else:
                        agent._emit_status(f"❌ API failed after {max_retries} retries — {_final_summary}")
                    agent._vprint(f"{agent.log_prefix}   💀 Final error: {_final_summary}", force=True)

                    # Detect SSE stream-drop pattern (e.g. "Network
                    # connection lost") and surface actionable guidance.
                    # This typically happens when the model generates a
                    # very large tool call (write_file with huge content)
                    # and the proxy/CDN drops the stream mid-response.
                    _is_stream_drop = (
                        not getattr(api_error, "status_code", None)
                        and any(p in error_msg for p in (
                            "connection lost", "connection reset",
                            "connection closed", "network connection",
                            "network error", "terminated",
                        ))
                    )
                    if _is_stream_drop:
                        agent._vprint(
                            f"{agent.log_prefix}   💡 The provider's stream "
                            f"connection keeps dropping. This often happens "
                            f"when the model tries to write a very large "
                            f"file in a single tool call.",
                            force=True,
                        )
                        agent._vprint(
                            f"{agent.log_prefix}      Try asking the model "
                            f"to use execute_code with Python's open() for "
                            f"large files, or to write the file in smaller "
                            f"sections.",
                            force=True,
                        )

                    logging.error(
                        "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s",
                        agent.log_prefix, max_retries, _final_summary,
                        _provider, _model, len(api_messages), f"{approx_tokens:,}",
                    )
                    if api_kwargs is not None:
                        agent._dump_api_request_debug(
                            api_kwargs, reason="max_retries_exhausted", error=api_error,
                        )
                    agent._persist_session(messages, conversation_history)
                    _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
                    if _is_stream_drop:
                        _final_response += (
                            "\n\nThe provider's stream connection keeps "
                            "dropping — this often happens when generating "
                            "very large tool call responses (e.g. write_file "
                            "with long content). Try asking me to use "
                            "execute_code with Python's open() for large "
                            "files, or to write in smaller sections."
                        )
                    return {
                        "final_response": _final_response,
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
                        "failed": True,
                        "error": _final_summary,
                    }

                # For rate limits, respect the Retry-After header if present
                _retry_after = None
                if is_rate_limited:
                    _resp_headers = getattr(getattr(api_error, "response", None), "headers", None)
                    if _resp_headers and hasattr(_resp_headers, "get"):
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
                if is_rate_limited:
                    agent._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
                else:
                    agent._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                logger.warning(
                    "Retrying API call in %ss (attempt %s/%s) %s error=%s",
                    wait_time,
                    retry_count,
                    max_retries,
                    agent._client_log_context(),
                    api_error,
                )
                # Sleep in small increments so we can respond to interrupts quickly
                # instead of blocking the entire wait_time in one sleep() call
                sleep_end = time.time() + wait_time
                _backoff_touch_counter = 0
                while time.time() < sleep_end:
                    if agent._interrupt_requested:
                        agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
                        agent._persist_session(messages, conversation_history)
                        agent.clear_interrupt()
                        return {
                            "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "interrupted": True,
                        }
                    time.sleep(0.2)  # Check interrupt every 200ms
                    # Touch activity every ~30s so the gateway's inactivity
                    # monitor knows we're alive during backoff waits.
                    _backoff_touch_counter += 1
                    if _backoff_touch_counter % 150 == 0:  # 150 × 0.2s = 30s
                        agent._touch_activity(
                            f"error retry backoff ({retry_count}/{max_retries}), "
                            f"{int(sleep_end - time.time())}s remaining"
                        )

        # If the API call was interrupted, skip response processing
        if interrupted:
            _turn_exit_reason = "interrupted_during_api_call"
            break

        if restart_with_compressed_messages:
            api_call_count -= 1
            agent.iteration_budget.refund()
            # Count compression restarts toward the retry limit to prevent
            # infinite loops when compression reduces messages but not enough
            # to fit the context window.
            retry_count += 1
            restart_with_compressed_messages = False
            continue

        if restart_with_length_continuation:
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
            # Applies to all providers via _ephemeral_max_output_tokens.
            _boost_base = agent.max_tokens if agent.max_tokens else 4096
            _boost = _boost_base * (length_continue_retries + 1)
            agent._ephemeral_max_output_tokens = min(_boost, 32768)
            continue

        # Guard: if all retries exhausted without a successful response
        # (e.g. repeated context-length errors that exhausted retry_count),
        # the `response` variable is still None. Break out cleanly.
        if response is None:
            _turn_exit_reason = "all_retries_exhausted_no_response"
            print(f"{agent.log_prefix}❌ All API retries exhausted with no successful response.")
            agent._persist_session(messages, conversation_history)
            break

        try:
            _transport = agent._get_transport()
            _normalize_kwargs = {}
            if agent.api_mode == "anthropic_messages":
                _normalize_kwargs["strip_tool_prefix"] = agent._is_anthropic_oauth
            normalized = _transport.normalize_response(response, **_normalize_kwargs)
            assistant_message = normalized
            finish_reason = normalized.finish_reason

            # Normalize content to string — some OpenAI-compatible servers
            # (llama-server, etc.) return content as a dict or list instead
            # of a plain string, which crashes downstream .strip() calls.
            if assistant_message.content is not None and not isinstance(assistant_message.content, str):
                raw = assistant_message.content
                if isinstance(raw, dict):
                    assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw)
                elif isinstance(raw, list):
                    # Multimodal content list — extract text parts
                    parts = []
                    for part in raw:
                        if isinstance(part, str):
                            parts.append(part)
                        elif isinstance(part, dict) and part.get("type") == "text":
                            parts.append(part.get("text", ""))
                        elif isinstance(part, dict) and "text" in part:
                            parts.append(str(part["text"]))
                    assistant_message.content = "\n".join(parts)
                else:
                    assistant_message.content = str(raw)

            try:
                from hermes_cli.plugins import invoke_hook as _invoke_hook
                _assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or []
                _assistant_text = assistant_message.content or ""
                _invoke_hook(
                    "post_api_request",
                    task_id=effective_task_id,
                    session_id=agent.session_id or "",
                    platform=agent.platform or "",
                    model=agent.model,
                    provider=agent.provider,
                    base_url=agent.base_url,
                    api_mode=agent.api_mode,
                    api_call_count=api_call_count,
                    api_duration=api_duration,
                    finish_reason=finish_reason,
                    message_count=len(api_messages),
                    response_model=getattr(response, "model", None),
                    response=response,
                    usage=agent._usage_summary_for_api_request_hook(response),
                    assistant_message=assistant_message,
                    assistant_content_chars=len(_assistant_text),
                    assistant_tool_call_count=len(_assistant_tool_calls),
                )
            except Exception:
                pass

            # Handle assistant response
            if assistant_message.content and not agent.quiet_mode:
                if agent.verbose_logging:
                    agent._vprint(f"{agent.log_prefix}🤖 Assistant: {assistant_message.content}")
                else:
                    agent._vprint(f"{agent.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")

            # Notify progress callback of model's thinking (used by subagent
            # delegation to relay the child's reasoning to the parent display).
            if (assistant_message.content and agent.tool_progress_callback):
                _think_text = assistant_message.content.strip()
                # Strip reasoning XML tags that shouldn't leak to parent display
                _think_text = re.sub(
                    r'</?(?:REASONING_SCRATCHPAD|think|reasoning)>', '', _think_text
                ).strip()
                # For subagents: relay first line to parent display (existing behaviour).
                # For all agents with a structured callback: emit reasoning.available event.
                first_line = _think_text.split('\n')[0][:80] if _think_text else ""
                if first_line and getattr(agent, '_delegate_depth', 0) > 0:
                    try:
                        agent.tool_progress_callback("_thinking", first_line)
                    except Exception:
                        pass
                elif _think_text:
                    try:
                        agent.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None)
                    except Exception:
                        pass

            # Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
            # This means the model ran out of output tokens mid-reasoning — retry up to 2 times
            if has_incomplete_scratchpad(assistant_message.content or ""):
                agent._incomplete_scratchpad_retries += 1

                agent._vprint(f"{agent.log_prefix}⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")

                if agent._incomplete_scratchpad_retries <= 2:
                    agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._incomplete_scratchpad_retries}/2)...")
                    # Don't add the broken message, just retry
                    continue
                else:
                    # Max retries - discard this turn and save as partial
                    agent._vprint(f"{agent.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
                    agent._incomplete_scratchpad_retries = 0

                    rolled_back_messages = agent._get_messages_up_to_last_assistant(messages)
                    agent._cleanup_task_resources(effective_task_id)
                    agent._persist_session(messages, conversation_history)

                    return {
                        "final_response": None,
                        "messages": rolled_back_messages,
                        "api_calls": api_call_count,
                        "completed": False,
                        "partial": True,
                        "error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
                    }

            # Reset incomplete scratchpad counter on clean response
            agent._incomplete_scratchpad_retries = 0

            if agent.api_mode == "codex_responses" and finish_reason == "incomplete":
                agent._codex_incomplete_retries += 1

                interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
                interim_has_content = bool((interim_msg.get("content") or "").strip())
                interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False
                interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items"))
                interim_has_codex_message_items = bool(interim_msg.get("codex_message_items"))

                if (
                    interim_has_content
                    or interim_has_reasoning
                    or interim_has_codex_reasoning
                    or interim_has_codex_message_items
                ):
                    last_msg = messages[-1] if messages else None
                    # Duplicate detection: two consecutive incomplete assistant
                    # messages with identical content AND reasoning are collapsed.
                    # For provider-state-only changes (encrypted reasoning
                    # items or replayable message ids/phases/statuses differ
                    # while visible content/reasoning are unchanged), compare
                    # those opaque payloads too so we don't silently drop the
                    # newer continuation state.
                    last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None
                    interim_codex_items = interim_msg.get("codex_reasoning_items")
                    last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None
                    interim_codex_message_items = interim_msg.get("codex_message_items")
                    duplicate_interim = (
                        isinstance(last_msg, dict)
                        and last_msg.get("role") == "assistant"
                        and last_msg.get("finish_reason") == "incomplete"
                        and (last_msg.get("content") or "") == (interim_msg.get("content") or "")
                        and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "")
                        and last_codex_items == interim_codex_items
                        and last_codex_message_items == interim_codex_message_items
                    )
                    if not duplicate_interim:
                        messages.append(interim_msg)
                        agent._emit_interim_assistant_message(interim_msg)

                if agent._codex_incomplete_retries < 3:
                    if not agent.quiet_mode:
                        agent._vprint(f"{agent.log_prefix}↻ Codex response incomplete; continuing turn ({agent._codex_incomplete_retries}/3)")
                    agent._session_messages = messages
                    agent._save_session_log(messages)
                    continue

                agent._codex_incomplete_retries = 0
                agent._persist_session(messages, conversation_history)
                return {
                    "final_response": None,
                    "messages": messages,
                    "api_calls": api_call_count,
                    "completed": False,
                    "partial": True,
                    "error": "Codex response remained incomplete after 3 continuation attempts",
                }
            elif hasattr(agent, "_codex_incomplete_retries"):
                agent._codex_incomplete_retries = 0

            # Check for tool calls
            if assistant_message.tool_calls:
                if not agent.quiet_mode:
                    agent._vprint(f"{agent.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")

                if agent.verbose_logging:
                    for tc in assistant_message.tool_calls:
                        logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")

                # Validate tool call names - detect model hallucinations
                # Repair mismatched tool names before validating
                for tc in assistant_message.tool_calls:
                    if tc.function.name not in agent.valid_tool_names:
                        repaired = agent._repair_tool_call(tc.function.name)
                        if repaired:
                            print(f"{agent.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'")
                            tc.function.name = repaired
                invalid_tool_calls = [
                    tc.function.name for tc in assistant_message.tool_calls
                    if tc.function.name not in agent.valid_tool_names
                ]
                if invalid_tool_calls:
                    # Track retries for invalid tool calls
                    agent._invalid_tool_retries += 1

                    # Return helpful error to model — model can agent-correct next turn
                    available = ", ".join(sorted(agent.valid_tool_names))
                    invalid_name = invalid_tool_calls[0]
                    invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
                    agent._vprint(f"{agent.log_prefix}⚠️  Unknown tool '{invalid_preview}' — sending error to model for agent-correction ({agent._invalid_tool_retries}/3)")

                    if agent._invalid_tool_retries >= 3:
                        agent._vprint(f"{agent.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
                        agent._invalid_tool_retries = 0
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": None,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "partial": True,
                            "error": f"Model generated invalid tool call: {invalid_preview}"
                        }

                    assistant_msg = agent._build_assistant_message(assistant_message, finish_reason)
                    messages.append(assistant_msg)
                    for tc in assistant_message.tool_calls:
                        if tc.function.name not in agent.valid_tool_names:
                            content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
                        else:
                            content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call."
                        messages.append({
                            "role": "tool",
                            "name": tc.function.name,
                            "tool_call_id": tc.id,
                            "content": content,
                        })
                    continue
                # Reset retry counter on successful tool call validation
                agent._invalid_tool_retries = 0

                # Validate tool call arguments are valid JSON
                # Handle empty strings as empty objects (common model quirk)
                invalid_json_args = []
                for tc in assistant_message.tool_calls:
                    args = tc.function.arguments
                    if isinstance(args, (dict, list)):
                        tc.function.arguments = json.dumps(args)
                        continue
                    if args is not None and not isinstance(args, str):
                        tc.function.arguments = str(args)
                        args = tc.function.arguments
                    # Treat empty/whitespace strings as empty object
                    if not args or not args.strip():
                        tc.function.arguments = "{}"
                        continue
                    try:
                        json.loads(args)
                    except json.JSONDecodeError as e:
                        invalid_json_args.append((tc.function.name, str(e)))

                if invalid_json_args:
                    # Check if the invalid JSON is due to truncation rather
                    # than a model formatting mistake.  Routers sometimes
                    # rewrite finish_reason from "length" to "tool_calls",
                    # hiding the truncation from the length handler above.
                    # Detect truncation: args that don't end with } or ]
                    # (after stripping whitespace) are cut off mid-stream.
                    _truncated = any(
                        not (tc.function.arguments or "").rstrip().endswith(("}", "]"))
                        for tc in assistant_message.tool_calls
                        if tc.function.name in {n for n, _ in invalid_json_args}
                    )
                    if _truncated:
                        agent._vprint(
                            f"{agent.log_prefix}⚠️  Truncated tool call arguments detected "
                            f"(finish_reason={finish_reason!r}) — refusing to execute.",
                            force=True,
                        )
                        agent._invalid_json_retries = 0
                        agent._cleanup_task_resources(effective_task_id)
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": None,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "partial": True,
                            "error": "Response truncated due to output length limit",
                        }

                    # Track retries for invalid JSON arguments
                    agent._invalid_json_retries += 1

                    tool_name, error_msg = invalid_json_args[0]
                    agent._vprint(f"{agent.log_prefix}⚠️  Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")

                    if agent._invalid_json_retries < 3:
                        agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._invalid_json_retries}/3)...")
                        # Don't add anything to messages, just retry the API call
                        continue
                    else:
                        # Instead of returning partial, inject tool error results so the model can recover.
                        # Using tool results (not user messages) preserves role alternation.
                        agent._vprint(f"{agent.log_prefix}⚠️  Injecting recovery tool results for invalid JSON...")
                        agent._invalid_json_retries = 0  # Reset for next attempt

                        # Append the assistant message with its (broken) tool_calls
                        recovery_assistant = agent._build_assistant_message(assistant_message, finish_reason)
                        messages.append(recovery_assistant)

                        # Respond with tool error results for each tool call
                        invalid_names = {name for name, _ in invalid_json_args}
                        for tc in assistant_message.tool_calls:
                            if tc.function.name in invalid_names:
                                err = next(e for n, e in invalid_json_args if n == tc.function.name)
                                tool_result = (
                                    f"Error: Invalid JSON arguments. {err}. "
                                    f"For tools with no required parameters, use an empty object: {{}}. "
                                    f"Please retry with valid JSON."
                                )
                            else:
                                tool_result = "Skipped: other tool call in this response had invalid JSON."
                            messages.append({
                                "role": "tool",
                                "name": tc.function.name,
                                "tool_call_id": tc.id,
                                "content": tool_result,
                            })
                        continue

                # Reset retry counter on successful JSON validation
                agent._invalid_json_retries = 0

                # ── Post-call guardrails ──────────────────────────
                assistant_message.tool_calls = agent._cap_delegate_task_calls(
                    assistant_message.tool_calls
                )
                assistant_message.tool_calls = agent._deduplicate_tool_calls(
                    assistant_message.tool_calls
                )

                assistant_msg = agent._build_assistant_message(assistant_message, finish_reason)

                # If this turn has both content AND tool_calls, capture the content
                # as a fallback final response. Common pattern: model delivers its
                # answer and calls memory/skill tools as a side-effect in the same
                # turn. If the follow-up turn after tools is empty, we use this.
                turn_content = assistant_message.content or ""
                if turn_content and agent._has_content_after_think_block(turn_content):
                    agent._last_content_with_tools = turn_content
                    # Only mute subsequent output when EVERY tool call in
                    # this turn is post-response housekeeping (memory, todo,
                    # skill_manage, etc.).  If any substantive tool is present
                    # (search_files, read_file, write_file, terminal, ...),
                    # keep output visible so the user sees progress.
                    _HOUSEKEEPING_TOOLS = frozenset({
                        "memory", "todo", "skill_manage", "session_search",
                    })
                    _all_housekeeping = all(
                        tc.function.name in _HOUSEKEEPING_TOOLS
                        for tc in assistant_message.tool_calls
                    )
                    agent._last_content_tools_all_housekeeping = _all_housekeeping
                    if _all_housekeeping and agent._has_stream_consumers():
                        agent._mute_post_response = True
                    elif agent._should_emit_quiet_tool_messages():
                        clean = agent._strip_think_blocks(turn_content).strip()
                        if clean:
                            agent._vprint(f"  ┊ 💬 {clean}")

                # Pop thinking-only prefill message(s) before appending
                # (tool-call path — same rationale as the final-response path).
                _had_prefill = False
                while (
                    messages
                    and isinstance(messages[-1], dict)
                    and messages[-1].get("_thinking_prefill")
                ):
                    messages.pop()
                    _had_prefill = True

                # Reset prefill counter when tool calls follow a prefill
                # recovery.  Without this, the counter accumulates across
                # the whole conversation — a model that intermittently
                # empties (empty → prefill → tools → empty → prefill →
                # tools) burns both prefill attempts and the third empty
                # gets zero recovery.  Resetting here treats each tool-
                # call success as a fresh start.
                if _had_prefill:
                    agent._thinking_prefill_retries = 0
                    agent._empty_content_retries = 0
                # Successful tool execution — reset the post-tool nudge
                # flag so it can fire again if the model goes empty on
                # a LATER tool round.
                agent._post_tool_empty_retried = False

                messages.append(assistant_msg)
                agent._emit_interim_assistant_message(assistant_msg)

                # Close any open streaming display (response box, reasoning
                # box) before tool execution begins.  Intermediate turns may
                # have streamed early content that opened the response box;
                # flushing here prevents it from wrapping tool feed lines.
                # Only signal the display callback — TTS (_stream_callback)
                # should NOT receive None (it uses None as end-of-stream).
                if agent.stream_delta_callback:
                    try:
                        agent.stream_delta_callback(None)
                    except Exception:
                        pass

                agent._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)

                if agent._tool_guardrail_halt_decision is not None:
                    decision = agent._tool_guardrail_halt_decision
                    _turn_exit_reason = "guardrail_halt"
                    final_response = agent._toolguard_controlled_halt_response(decision)
                    agent._emit_status(
                        f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}"
                    )
                    messages.append({"role": "assistant", "content": final_response})
                    break

                # Reset per-turn retry counters after successful tool
                # execution so a single truncation doesn't poison the
                # entire conversation.
                truncated_tool_call_retries = 0

                # Signal that a paragraph break is needed before the next
                # streamed text.  We don't emit it immediately because
                # multiple consecutive tool iterations would stack up
                # redundant blank lines.  Instead, _fire_stream_delta()
                # will prepend a single "\n\n" the next time real text
                # arrives.
                agent._stream_needs_break = True

                # Refund the iteration if the ONLY tool(s) called were
                # execute_code (programmatic tool calling).  These are
                # cheap RPC-style calls that shouldn't eat the budget.
                _tc_names = {tc.function.name for tc in assistant_message.tool_calls}
                if _tc_names == {"execute_code"}:
                    agent.iteration_budget.refund()

                # Use real token counts from the API response to decide
                # compression.  prompt_tokens + completion_tokens is the
                # actual context size the provider reported plus the
                # assistant turn — a tight lower bound for the next prompt.
                # Tool results appended above aren't counted yet, but the
                # threshold (default 50%) leaves ample headroom; if tool
                # results push past it, the next API call will report the
                # real total and trigger compression then.
                #
                # If last_prompt_tokens is 0 (stale after API disconnect
                # or provider returned no usage data), fall back to rough
                # estimate to avoid missing compression.  Without this,
                # a session can grow unbounded after disconnects because
                # should_compress(0) never fires.  (#2153)
                _compressor = agent.context_compressor
                if _compressor.last_prompt_tokens > 0:
                    # Only use prompt_tokens — completion/reasoning
                    # tokens don't consume context window space.
                    # Thinking models (GLM-5.1, QwQ, DeepSeek R1)
                    # inflate completion_tokens with reasoning,
                    # causing premature compression.  (#12026)
                    _real_tokens = _compressor.last_prompt_tokens
                else:
                    # Include tool schemas — with 50+ tools enabled
                    # these add 20-30K tokens the messages-only
                    # estimate misses, which can skip compression
                    # past the configured threshold (#14695).
                    _real_tokens = estimate_request_tokens_rough(
                        messages, tools=agent.tools or None
                    )

                if agent.compression_enabled and _compressor.should_compress(_real_tokens):
                    agent._safe_print("  ⟳ compacting context…")
                    messages, active_system_prompt = agent._compress_context(
                        messages, system_message,
                        approx_tokens=agent.context_compressor.last_prompt_tokens,
                        task_id=effective_task_id,
                    )
                    # Compression created a new session — clear history so
                    # _flush_messages_to_session_db writes compressed messages
                    # to the new session (see preflight compression comment).
                    conversation_history = None

                # Save session log incrementally (so progress is visible even if interrupted)
                agent._session_messages = messages
                agent._save_session_log(messages)

                # Continue loop for next response
                continue

            else:
                # No tool calls - this is the final response
                final_response = assistant_message.content or ""

                # Fix: unmute output when entering the no-tool-call branch
                # so the user can see empty-response warnings and recovery
                # status messages.  _mute_post_response was set during a
                # prior housekeeping tool turn and should not silence the
                # final response path.
                agent._mute_post_response = False

                # Check if response only has think block with no actual content after it
                if not agent._has_content_after_think_block(final_response):
                    # ── Partial stream recovery ─────────────────────
                    # If content was already streamed to the user before
                    # the connection died, use it as the final response
                    # instead of falling through to prior-turn fallback
                    # or wasting API calls on retries.
                    _partial_streamed = (
                        getattr(agent, "_current_streamed_assistant_text", "") or ""
                    )
                    if agent._has_content_after_think_block(_partial_streamed):
                        _turn_exit_reason = "partial_stream_recovery"
                        _recovered = agent._strip_think_blocks(_partial_streamed).strip()
                        logger.info(
                            "Partial stream content delivered (%d chars) "
                            "— using as final response",
                            len(_recovered),
                        )
                        agent._emit_status(
                            "↻ Stream interrupted — using delivered content "
                            "as final response"
                        )
                        final_response = _recovered
                        agent._response_was_previewed = True
                        break

                    # If the previous turn already delivered real content alongside
                    # HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save),
                    # the model has nothing more to say. Use the earlier content
                    # immediately instead of wasting API calls on retries.
                    # NOTE: Only use this shortcut when ALL tools in that turn were
                    # housekeeping (memory, todo, etc.).  When substantive tools
                    # were called (terminal, search_files, etc.), the content was
                    # likely mid-task narration ("I'll scan the directory...") and
                    # the empty follow-up means the model choked — let the
                    # post-tool nudge below handle that instead of exiting early.
                    fallback = getattr(agent, '_last_content_with_tools', None)
                    if fallback and getattr(agent, '_last_content_tools_all_housekeeping', False):
                        _turn_exit_reason = "fallback_prior_turn_content"
                        logger.info("Empty follow-up after tool calls — using prior turn content as final response")
                        agent._emit_status("↻ Empty response after tool calls — using earlier content as final answer")
                        agent._last_content_with_tools = None
                        agent._last_content_tools_all_housekeeping = False
                        agent._empty_content_retries = 0
                        # Do NOT modify the assistant message content — the
                        # old code injected "Calling the X tools..." which
                        # poisoned the conversation history.  Just use the
                        # fallback text as the final response and break.
                        final_response = agent._strip_think_blocks(fallback).strip()
                        agent._response_was_previewed = True
                        break

                    # ── Post-tool-call empty response nudge ───────────
                    # The model returned empty after executing tool calls.
                    # This covers two cases:
                    #  (a) No prior-turn content at all — model went silent
                    #  (b) Prior turn had content + SUBSTANTIVE tools (the
                    #      fallback above was skipped because the content
                    #      was mid-task narration, not a final answer)
                    # Instead of giving up, nudge the model to continue by
                    # appending a user-level hint.  This is the #9400 case:
                    # weaker models (mimo-v2-pro, GLM-5, etc.) sometimes
                    # return empty after tool results instead of continuing
                    # to the next step.  One retry with a nudge usually
                    # fixes it.
                    _prior_was_tool = any(
                        m.get("role") == "tool"
                        for m in messages[-5:]  # check recent messages
                    )
                    # Detect Qwen3/Ollama-style in-content thinking blocks.
                    # Ollama puts <think> in the content field (not in
                    # reasoning_content), so _has_structured below would
                    # miss it.  We check here so thinking-only responses
                    # after tool calls route to prefill instead of nudge.
                    _has_inline_thinking = bool(
                        re.search(
                            r'<think>|<thinking>|<reasoning>',
                            final_response or "",
                            re.IGNORECASE,
                        )
                    )
                    if (
                        _prior_was_tool
                        and not getattr(agent, "_post_tool_empty_retried", False)
                        and not _has_inline_thinking  # thinking model still working — let prefill handle
                    ):
                        agent._post_tool_empty_retried = True
                        # Clear stale narration so it doesn't resurface
                        # on a later empty response after the nudge.
                        agent._last_content_with_tools = None
                        agent._last_content_tools_all_housekeeping = False
                        logger.info(
                            "Empty response after tool calls — nudging model "
                            "to continue processing"
                        )
                        agent._emit_status(
                            "⚠️ Model returned empty after tool calls — "
                            "nudging to continue"
                        )
                        # Append the empty assistant message first so the
                        # message sequence stays valid:
                        #   tool(result) → assistant("(empty)") → user(nudge)
                        # Without this, we'd have tool → user which most
                        # APIs reject as an invalid sequence.
                        _nudge_msg = agent._build_assistant_message(assistant_message, finish_reason)
                        _nudge_msg["content"] = "(empty)"
                        _nudge_msg["_empty_recovery_synthetic"] = True
                        messages.append(_nudge_msg)
                        messages.append({
                            "role": "user",
                            "content": (
                                "You just executed tool calls but returned an "
                                "empty response. Please process the tool "
                                "results above and continue with the task."
                            ),
                            "_empty_recovery_synthetic": True,
                        })
                        continue

                    # ── Thinking-only prefill continuation ──────────
                    # The model produced structured reasoning (via API
                    # fields) but no visible text content.  Rather than
                    # giving up, append the assistant message as-is and
                    # continue — the model will see its own reasoning
                    # on the next turn and produce the text portion.
                    # Inspired by clawdbot's "incomplete-text" recovery.
                    # Also covers Qwen3/Ollama in-content <think> blocks
                    # (detected above as _has_inline_thinking).
                    _has_structured = bool(
                        getattr(assistant_message, "reasoning", None)
                        or getattr(assistant_message, "reasoning_content", None)
                        or getattr(assistant_message, "reasoning_details", None)
                        or _has_inline_thinking
                    )
                    if _has_structured and agent._thinking_prefill_retries < 2:
                        agent._thinking_prefill_retries += 1
                        logger.info(
                            "Thinking-only response (no visible content) — "
                            "prefilling to continue (%d/2)",
                            agent._thinking_prefill_retries,
                        )
                        agent._emit_status(
                            f"↻ Thinking-only response — prefilling to continue "
                            f"({agent._thinking_prefill_retries}/2)"
                        )
                        interim_msg = agent._build_assistant_message(
                            assistant_message, "incomplete"
                        )
                        interim_msg["_thinking_prefill"] = True
                        messages.append(interim_msg)
                        agent._session_messages = messages
                        agent._save_session_log(messages)
                        continue

                    # ── Empty response retry ──────────────────────
                    # Model returned nothing usable.  Retry up to 3
                    # times before attempting fallback.  This covers
                    # both truly empty responses (no content, no
                    # reasoning) AND reasoning-only responses after
                    # prefill exhaustion — models like mimo-v2-pro
                    # always populate reasoning fields via OpenRouter,
                    # so the old `not _has_structured` guard blocked
                    # retries for every reasoning model after prefill.
                    _truly_empty = not agent._strip_think_blocks(
                        final_response
                    ).strip()
                    _prefill_exhausted = (
                        _has_structured
                        and agent._thinking_prefill_retries >= 2
                    )
                    if _truly_empty and (not _has_structured or _prefill_exhausted) and agent._empty_content_retries < 3:
                        agent._empty_content_retries += 1
                        logger.warning(
                            "Empty response (no content or reasoning) — "
                            "retry %d/3 (model=%s)",
                            agent._empty_content_retries, agent.model,
                        )
                        agent._emit_status(
                            f"⚠️ Empty response from model — retrying "
                            f"({agent._empty_content_retries}/3)"
                        )
                        continue

                    # ── Exhausted retries — try fallback provider ──
                    # Before giving up with "(empty)", attempt to
                    # switch to the next provider in the fallback
                    # chain.  This covers the case where a model
                    # (e.g. GLM-4.5-Air) consistently returns empty
                    # due to context degradation or provider issues.
                    if _truly_empty and agent._fallback_chain:
                        logger.warning(
                            "Empty response after %d retries — "
                            "attempting fallback (model=%s, provider=%s)",
                            agent._empty_content_retries, agent.model,
                            agent.provider,
                        )
                        agent._emit_status(
                            "⚠️ Model returning empty responses — "
                            "switching to fallback provider..."
                        )
                        if agent._try_activate_fallback():
                            agent._empty_content_retries = 0
                            agent._emit_status(
                                f"↻ Switched to fallback: {agent.model} "
                                f"({agent.provider})"
                            )
                            logger.info(
                                "Fallback activated after empty responses: "
                                "now using %s on %s",
                                agent.model, agent.provider,
                            )
                            continue

                    # Exhausted retries and fallback chain (or no
                    # fallback configured).  Fall through to the
                    # "(empty)" terminal.
                    _turn_exit_reason = "empty_response_exhausted"
                    reasoning_text = agent._extract_reasoning(assistant_message)
                    agent._drop_trailing_empty_response_scaffolding(messages)
                    assistant_msg = agent._build_assistant_message(assistant_message, finish_reason)
                    assistant_msg["content"] = "(empty)"
                    # This is a user-facing failure sentinel for the gateway,
                    # not real assistant content. Persisting it makes later
                    # "continue" turns replay assistant("(empty)") as if it
                    # were a meaningful model response, which can keep long
                    # tool-heavy sessions stuck in empty-response loops.
                    assistant_msg["_empty_terminal_sentinel"] = True
                    messages.append(assistant_msg)

                    if reasoning_text:
                        reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
                        logger.warning(
                            "Reasoning-only response (no visible content) "
                            "after exhausting retries and fallback. "
                            "Reasoning: %s", reasoning_preview,
                        )
                        agent._emit_status(
                            "⚠️ Model produced reasoning but no visible "
                            "response after all retries. Returning empty."
                        )
                    else:
                        logger.warning(
                            "Empty response (no content or reasoning) "
                            "after %d retries. No fallback available. "
                            "model=%s provider=%s",
                            agent._empty_content_retries, agent.model,
                            agent.provider,
                        )
                        agent._emit_status(
                            "❌ Model returned no content after all retries"
                            + (" and fallback attempts." if agent._fallback_chain else
                               ". No fallback providers configured.")
                        )

                    final_response = "(empty)"
                    break

                # Reset retry counter/signature on successful content
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0

                if (
                    agent.api_mode == "codex_responses"
                    and agent.valid_tool_names
                    and codex_ack_continuations < 2
                    and agent._looks_like_codex_intermediate_ack(
                        user_message=user_message,
                        assistant_content=final_response,
                        messages=messages,
                    )
                ):
                    codex_ack_continuations += 1
                    interim_msg = agent._build_assistant_message(assistant_message, "incomplete")
                    messages.append(interim_msg)
                    agent._emit_interim_assistant_message(interim_msg)

                    continue_msg = {
                        "role": "user",
                        "content": (
                            "[System: Continue now. Execute the required tool calls and only "
                            "send your final answer after completing the task.]"
                        ),
                    }
                    messages.append(continue_msg)
                    agent._session_messages = messages
                    agent._save_session_log(messages)
                    continue

                codex_ack_continuations = 0

                if truncated_response_parts:
                    final_response = "".join(truncated_response_parts) + final_response
                    truncated_response_parts = []
                    length_continue_retries = 0

                final_response = agent._strip_think_blocks(final_response).strip()

                final_msg = agent._build_assistant_message(assistant_message, finish_reason)

                # Pop thinking-only prefill and empty-response retry
                # scaffolding before appending the final response.  These
                # internal turns are only for the next API retry and should
                # not become durable transcript context.
                while (
                    messages
                    and isinstance(messages[-1], dict)
                    and (
                        messages[-1].get("_thinking_prefill")
                        or messages[-1].get("_empty_recovery_synthetic")
                        or messages[-1].get("_empty_terminal_sentinel")
                    )
                ):
                    messages.pop()

                messages.append(final_msg)

                _turn_exit_reason = f"text_response(finish_reason={finish_reason})"
                if not agent.quiet_mode:
                    agent._safe_print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
                break

        except Exception as e:
            error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}"
            try:
                print(f"❌ {error_msg}")
            except (OSError, ValueError):
                logger.error(error_msg)

            logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True)

            # If an assistant message with tool_calls was already appended,
            # the API expects a role="tool" result for every tool_call_id.
            # Fill in error results for any that weren't answered yet.
            for idx in range(len(messages) - 1, -1, -1):
                msg = messages[idx]
                if not isinstance(msg, dict):
                    break
                if msg.get("role") == "tool":
                    continue
                if msg.get("role") == "assistant" and msg.get("tool_calls"):
                    answered_ids = {
                        m["tool_call_id"]
                        for m in messages[idx + 1:]
                        if isinstance(m, dict) and m.get("role") == "tool"
                    }
                    for tc in msg["tool_calls"]:
                        if not tc or not isinstance(tc, dict): continue
                        if tc["id"] not in answered_ids:
                            err_msg = {
                                "role": "tool",
                                "name": _ra().AIAgent._get_tool_call_name_static(tc),
                                "tool_call_id": tc["id"],
                                "content": f"Error executing tool: {error_msg}",
                            }
                            messages.append(err_msg)
                break

            # Non-tool errors don't need a synthetic message injected.
            # The error is already printed to the user (line above), and
            # the retry loop continues.  Injecting a fake user/assistant
            # message pollutes history, burns tokens, and risks violating
            # role-alternation invariants.

            # If we're near the limit, break to avoid infinite loops
            if api_call_count >= agent.max_iterations - 1:
                _turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})"
                final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
                # Append as assistant so the history stays valid for
                # session resume (avoids consecutive user messages).
                messages.append({"role": "assistant", "content": final_response})
                break

    if final_response is None and (
        api_call_count >= agent.max_iterations
        or agent.iteration_budget.remaining <= 0
    ):
        # Budget exhausted — ask the model for a summary via one extra
        # API call with tools stripped.  _handle_max_iterations injects a
        # user message and makes a single toolless request.
        _turn_exit_reason = f"max_iterations_reached({api_call_count}/{agent.max_iterations})"
        agent._emit_status(
            f"⚠️ Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) "
            "— asking model to summarise"
        )
        if not agent.quiet_mode:
            agent._safe_print(
                f"\n⚠️  Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) "
                "— requesting summary..."
            )
        final_response = agent._handle_max_iterations(messages, api_call_count)

        # If running as a kanban worker, block the task so the dispatcher
        # knows the worker could not complete (rather than treating it as a
        # protocol violation).  The agent loop strips tools before calling
        # _handle_max_iterations, so the model cannot call kanban_block
        # itself — we must do it on its behalf.
        _kanban_task = os.environ.get("HERMES_KANBAN_TASK")
        if _kanban_task:
            try:
                _ra().handle_function_call(
                    "kanban_block",
                    {
                        "task_id": _kanban_task,
                        "reason": (
                            f"Iteration budget exhausted "
                            f"({api_call_count}/{agent.max_iterations}) — "
                            "task could not complete within the allowed "
                            "iterations"
                        ),
                    },
                    task_id=effective_task_id,
                )
                logger.info(
                    "kanban_block called for task %s after iteration "
                    "exhaustion (%d/%d)",
                    _kanban_task, api_call_count, agent.max_iterations,
                )
            except Exception:
                logger.warning(
                    "Failed to call kanban_block after iteration "
                    "exhaustion for task %s",
                    _kanban_task,
                    exc_info=True,
                )

    # Determine if conversation completed successfully
    completed = final_response is not None and api_call_count < agent.max_iterations

    # Save trajectory if enabled.  ``user_message`` may be a multimodal
    # list of parts; the trajectory format wants a plain string.
    agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed)

    # Clean up VM and browser for this task after conversation completes
    agent._cleanup_task_resources(effective_task_id)

    # Persist session to both JSON log and SQLite only after private retry
    # scaffolding has been removed. Otherwise a later user "continue" turn
    # can replay assistant("(empty)") / recovery nudges and fall into the
    # same empty-response loop again.
    agent._drop_trailing_empty_response_scaffolding(messages)
    agent._persist_session(messages, conversation_history)

    # ── Turn-exit diagnostic log ─────────────────────────────────────
    # Always logged at INFO so agent.log captures WHY every turn ended.
    # When the last message is a tool result (agent was mid-work), log
    # at WARNING — this is the "just stops" scenario users report.
    _last_msg_role = messages[-1].get("role") if messages else None
    _last_tool_name = None
    if _last_msg_role == "tool":
        # Walk back to find the assistant message with the tool call
        for _m in reversed(messages):
            if _m.get("role") == "assistant" and _m.get("tool_calls"):
                _tcs = _m["tool_calls"]
                if _tcs and isinstance(_tcs[0], dict):
                    _last_tool_name = _tcs[-1].get("function", {}).get("name")
                break

    _turn_tool_count = sum(
        1 for m in messages
        if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls")
    )
    _resp_len = len(final_response) if final_response else 0
    _budget_used = agent.iteration_budget.used if agent.iteration_budget else 0
    _budget_max = agent.iteration_budget.max_total if agent.iteration_budget else 0

    _diag_msg = (
        "Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d "
        "tool_turns=%d last_msg_role=%s response_len=%d session=%s"
    )
    _diag_args = (
        _turn_exit_reason, agent.model, api_call_count, agent.max_iterations,
        _budget_used, _budget_max,
        _turn_tool_count, _last_msg_role, _resp_len,
        agent.session_id or "none",
    )

    if _last_msg_role == "tool" and not interrupted:
        # Agent was mid-work — this is the "just stops" case.
        logger.warning(
            "Turn ended with pending tool result (agent may appear stuck). "
            + _diag_msg + " last_tool=%s",
            *_diag_args, _last_tool_name,
        )
    else:
        logger.info(_diag_msg, *_diag_args)

    # File-mutation verifier footer.
    # If one or more ``write_file`` / ``patch`` calls failed during this
    # turn and were never superseded by a successful write to the same
    # path, append an advisory footer to the assistant response.  This
    # catches the specific case — reported by Ben Eng (#15524-adjacent)
    # — where a model issues a batch of parallel patches, half of them
    # fail with "Could not find old_string", and the model summarises
    # the turn claiming every file was edited.  The user then has to
    # manually run ``git status`` to catch the lie.  With this footer
    # the truth is surfaced on every turn, so over-claiming is
    # structurally impossible past the model.
    #
    # Gate: only applied when a real text response exists for this
    # turn and the user didn't interrupt.  Empty/interrupted turns
    # already have other surface text that shouldn't be augmented.
    if final_response and not interrupted:
        try:
            _failed = getattr(agent, "_turn_failed_file_mutations", None) or {}
            if _failed and agent._file_mutation_verifier_enabled():
                footer = agent._format_file_mutation_failure_footer(_failed)
                if footer:
                    final_response = final_response.rstrip() + "\n\n" + footer
        except Exception as _ver_err:
            logger.debug("file-mutation verifier footer failed: %s", _ver_err)

    # Plugin hook: transform_llm_output
    # Fired once per turn after the tool-calling loop completes.
    # Plugins can transform the LLM's output text before it's returned.
    # First hook to return a string wins; None/empty return leaves text unchanged.
    if final_response and not interrupted:
        try:
            from hermes_cli.plugins import invoke_hook as _invoke_hook
            _transform_results = _invoke_hook(
                "transform_llm_output",
                response_text=final_response,
                session_id=agent.session_id or "",
                model=agent.model,
                platform=getattr(agent, "platform", None) or "",
            )
            for _hook_result in _transform_results:
                if isinstance(_hook_result, str) and _hook_result:
                    final_response = _hook_result
                    break  # First non-empty string wins
        except Exception as exc:
            logger.warning("transform_llm_output hook failed: %s", exc)

    # Plugin hook: post_llm_call
    # Fired once per turn after the tool-calling loop completes.
    # Plugins can use this to persist conversation data (e.g. sync
    # to an external memory system).
    if final_response and not interrupted:
        try:
            from hermes_cli.plugins import invoke_hook as _invoke_hook
            _invoke_hook(
                "post_llm_call",
                session_id=agent.session_id,
                user_message=original_user_message,
                assistant_response=final_response,
                conversation_history=list(messages),
                model=agent.model,
                platform=getattr(agent, "platform", None) or "",
            )
        except Exception as exc:
            logger.warning("post_llm_call hook failed: %s", exc)

    # Extract reasoning from the CURRENT turn only.  Walk backwards
    # but stop at the user message that started this turn — anything
    # earlier is from a prior turn and must not leak into the reasoning
    # box (confusing stale display; #17055).  Within the current turn
    # we still want the *most recent* non-empty reasoning: many
    # providers (Claude thinking, DeepSeek v4, Codex Responses) emit
    # reasoning on the tool-call step and leave the final-answer step
    # with reasoning=None, so picking only the last assistant would
    # silently drop legitimate same-turn reasoning.
    last_reasoning = None
    for msg in reversed(messages):
        if msg.get("role") == "user":
            break  # turn boundary — don't cross into prior turns
        if msg.get("role") == "assistant" and msg.get("reasoning"):
            last_reasoning = msg["reasoning"]
            break

    # Build result with interrupt info if applicable
    result = {
        "final_response": final_response,
        "last_reasoning": last_reasoning,
        "messages": messages,
        "api_calls": api_call_count,
        "completed": completed,
        "turn_exit_reason": _turn_exit_reason,
        "partial": False,  # True only when stopped due to invalid tool calls
        "interrupted": interrupted,
        "response_previewed": getattr(agent, "_response_was_previewed", False),
        "model": agent.model,
        "provider": agent.provider,
        "base_url": agent.base_url,
        "input_tokens": agent.session_input_tokens,
        "output_tokens": agent.session_output_tokens,
        "cache_read_tokens": agent.session_cache_read_tokens,
        "cache_write_tokens": agent.session_cache_write_tokens,
        "reasoning_tokens": agent.session_reasoning_tokens,
        "prompt_tokens": agent.session_prompt_tokens,
        "completion_tokens": agent.session_completion_tokens,
        "total_tokens": agent.session_total_tokens,
        "last_prompt_tokens": getattr(agent.context_compressor, "last_prompt_tokens", 0) or 0,
        "estimated_cost_usd": agent.session_estimated_cost_usd,
        "cost_status": agent.session_cost_status,
        "cost_source": agent.session_cost_source,
    }
    if agent._tool_guardrail_halt_decision is not None:
        result["guardrail"] = agent._tool_guardrail_halt_decision.to_metadata()
    # If a /steer landed after the final assistant turn (no more tool
    # batches to drain into), hand it back to the caller so it can be
    # delivered as the next user turn instead of being silently lost.
    _leftover_steer = agent._drain_pending_steer()
    if _leftover_steer:
        result["pending_steer"] = _leftover_steer
    agent._response_was_previewed = False

    # Include interrupt message if one triggered the interrupt
    if interrupted and agent._interrupt_message:
        result["interrupt_message"] = agent._interrupt_message

    # Clear interrupt state after handling
    agent.clear_interrupt()

    # Clear stream callback so it doesn't leak into future calls
    agent._stream_callback = None

    # Check skill trigger NOW — based on how many tool iterations THIS turn used.
    _should_review_skills = False
    if (agent._skill_nudge_interval > 0
            and agent._iters_since_skill >= agent._skill_nudge_interval
            and "skill_manage" in agent.valid_tool_names):
        _should_review_skills = True
        agent._iters_since_skill = 0

    # External memory provider: sync the completed turn + queue next prefetch.
    agent._sync_external_memory_for_turn(
        original_user_message=original_user_message,
        final_response=final_response,
        interrupted=interrupted,
    )

    # Background memory/skill review — runs AFTER the response is delivered
    # so it never competes with the user's task for model attention.
    if final_response and not interrupted and (_should_review_memory or _should_review_skills):
        try:
            agent._spawn_background_review(
                messages_snapshot=list(messages),
                review_memory=_should_review_memory,
                review_skills=_should_review_skills,
            )
        except Exception:
            pass  # Background review is best-effort

    # Note: Memory provider on_session_end() + shutdown_all() are NOT
    # called here — run_conversation() is called once per user message in
    # multi-turn sessions. Shutting down after every turn would kill the
    # provider before the second message. Actual session-end cleanup is
    # handled by the CLI (atexit / /reset) and gateway (session expiry /
    # _reset_session).

    # Plugin hook: on_session_end
    # Fired at the very end of every run_conversation call.
    # Plugins can use this for cleanup, flushing buffers, etc.
    try:
        from hermes_cli.plugins import invoke_hook as _invoke_hook
        _invoke_hook(
            "on_session_end",
            session_id=agent.session_id,
            completed=completed,
            interrupted=interrupted,
            model=agent.model,
            platform=getattr(agent, "platform", None) or "",
        )
    except Exception as exc:
        logger.warning("on_session_end hook failed: %s", exc)

    return result


__all__ = ["run_conversation"]