Stage 393: PR #2615

2026-05-24 18:50:15 +00:00 · 2026-05-20 22:23:53 +00:00
parent f3b8d57c99 98106c809b
commit e35c94bf55
5 changed files with 1362 additions and 23 deletions
@@ -3,6 +3,10 @@

 ## [Unreleased]

+### Fixed
+
+- **Sidecar repair / streaming recovery** — Lazily retry run-journal recovery when sidecar repair runs before the journaled tokens are visible on disk (mainly affects WSL2 / network FS setups where the run-journal `.jsonl` is written by the dead worker but the WebUI process reads it through a cache that has not yet seen the writes). The interrupted-turn marker now self-heals on the next session read: it carries a `_pending_journal_recovery` flag that re-runs `_append_journaled_partial_output` from `get_session()`, promotes the marker to the recovered-output wording on success, and demotes to neutral wording after 12 failed retries or 24h, so users no longer see a permanent "no agent output was recovered" banner when the journaled tokens are present on disk. New regression test `tests/test_session_lost_response_regression.py` and additional coverage in `tests/test_session_sidecar_repair.py`.
+

 ## [v0.51.99] — 2026-05-20 — Release BW (stage-392 — 5-PR batch — compact tool activity grouping + CLI sidebar scan cap + title-generation API key forwarding + post-compression replay dedup + clarify popup stability)

@@ -741,27 +741,69 @@ def _get_profile_home(profile) -> Path:
        return Path(os.environ.get('HERMES_HOME') or '~/.hermes').expanduser()


-def _interrupted_recovery_marker(*, recovered_output: bool = False) -> dict:
+_INTERRUPTED_RECOVERED_WORDING = (
+    '**Response interrupted.**\n\n'
+    'The WebUI process restarted before this turn finished. '
+    'The partial output above was recovered from the run journal, '
+    'but the interrupted agent process could not continue.'
+)
+_INTERRUPTED_NO_OUTPUT_WORDING = (
+    '**Response interrupted.**\n\n'
+    'The WebUI process restarted before this turn finished. '
+    'The user message above was preserved, but no agent output was recovered.'
+)
+_INTERRUPTED_PENDING_RETRY_WORDING = (
+    '**Response interrupted.**\n\n'
+    'The WebUI process restarted before this turn finished. '
+    'Recovering the partial output from the run journal — '
+    'reload this session to retry.'
+)
+# Neutral wording used when the lazy retry path gives up (max attempts reached
+# or the marker has been pending longer than _JOURNAL_RETRY_GIVEUP_SECONDS).
+_INTERRUPTED_NEUTRAL_WORDING = (
+    '**Response interrupted.**\n\n'
+    'The WebUI process restarted before this turn finished. '
+    'Partial output may have been lost.'
+)
+
+
+def _interrupted_recovery_marker(
+    *,
+    recovered_output: bool = False,
+    pending_retry: bool = False,
+) -> dict:
+    """Build the standard interrupted-turn marker.
+
+    ``recovered_output=True`` means the run journal already yielded visible
+    text on this repair pass — the marker advertises that the partial output
+    has been recovered.
+
+    ``pending_retry=True`` is the lazy-retry hook: the journal was unreadable
+    on this pass (page-cache loss, un-fsynced writes on slow FS, etc.). The
+    marker carries a ``_pending_journal_recovery`` flag so a later
+    ``get_session()`` can re-attempt recovery without baking a permanent
+    "no output" claim into the transcript.
+
+    The two are mutually exclusive; ``recovered_output`` wins if both are
+    set so the caller cannot accidentally re-arm retry on a successful
+    repair.
+    """
    if recovered_output:
-        content = (
-            '**Response interrupted.**\n\n'
-            'The WebUI process restarted before this turn finished. '
-            'The partial output above was recovered from the run journal, '
-            'but the interrupted agent process could not continue.'
-        )
+        content = _INTERRUPTED_RECOVERED_WORDING
+    elif pending_retry:
+        content = _INTERRUPTED_PENDING_RETRY_WORDING
    else:
-        content = (
-            '**Response interrupted.**\n\n'
-            'The WebUI process restarted before this turn finished. '
-            'The user message above was preserved, but no agent output was recovered.'
-        )
-    return {
+        content = _INTERRUPTED_NO_OUTPUT_WORDING
+    marker = {
        'role': 'assistant',
        'content': content,
        'timestamp': int(time.time()),
        '_error': True,
        'type': 'interrupted',
    }
+    if pending_retry and not recovered_output:
+        marker['_pending_journal_recovery'] = True
+    return marker


 def _truncate_journal_tool_args(args, limit: int = 4) -> dict:
@@ -848,9 +890,33 @@ def _find_existing_assistant_for_journal_content(session, content: str) -> int |
    return None


-def _journal_tool_already_present(session, name: str, preview: str) -> bool:
+def _journal_tool_already_present(
+    session,
+    name: str,
+    preview: str,
+    *,
+    stream_id: str | None = None,
+) -> bool:
+    """Return True when an equivalent tool card already exists.
+
+    Matching rule:
+
+    * If the existing tool card carries ``_recovered_stream_id``, that means a
+      previous journal-recovery run materialized it.  The retry can safely
+      collapse against it only when both stream ids match — otherwise a
+      legitimately-repeated tool (e.g. a second ``terminal: ls`` in a
+      different turn) would be dropped.
+    * If the existing tool card has no ``_recovered_stream_id`` (a live tool
+      card, or a tool card carried over from a core transcript that pre-dates
+      stream-id tagging), the legacy name+preview match still wins.  This
+      preserves the "core transcript already has this tool, don't duplicate
+      it" invariant the original repair path established.
+    * When ``stream_id`` is omitted, the helper degrades cleanly to its
+      pre-fix session-wide behaviour.
+    """
    candidate_name = str(name or '')
    candidate_preview = _normalize_journal_recovery_text(preview)
+    candidate_stream = str(stream_id) if stream_id else None
    for tool_call in session.tool_calls or []:
        if not isinstance(tool_call, dict):
            continue
@@ -859,8 +925,17 @@ def _journal_tool_already_present(session, name: str, preview: str) -> bool:
        existing_preview = _normalize_journal_recovery_text(
            tool_call.get('preview') or tool_call.get('snippet') or ''
        )
-        if existing_preview == candidate_preview:
-            return True
+        if existing_preview != candidate_preview:
+            continue
+        if candidate_stream is not None:
+            existing_stream = tool_call.get('_recovered_stream_id')
+            # A tool card explicitly tagged with a recovered_stream_id that
+            # differs from ours belongs to another retry's turn — don't let
+            # it pre-empt this retry.  Untagged tool cards (live or carried
+            # over from the core transcript) still match.
+            if existing_stream and str(existing_stream) != candidate_stream:
+                continue
+        return True
    return False


@@ -889,6 +964,39 @@ def _run_journal_has_visible_output(session, stream_id: str | None) -> bool:
    return False


+def _journal_is_still_arriving(session, stream_id: str | None) -> bool:
+    """Return True for journals that may become visible on a later read.
+
+    `read_run_events()` deliberately collapses missing files and empty files
+    into an empty event list, so the lazy retry path needs a small filesystem
+    visibility check to avoid burning all retry attempts while WSL2 / network
+    filesystems are still surfacing the journal.  Non-empty journals are treated
+    as sealed enough for retry-budget accounting; if they contain no visible
+    output, the normal capped give-up path handles them.
+    """
+    if not stream_id:
+        return False
+    try:
+        from api.run_journal import _run_path, latest_run_summary
+
+        path = _run_path(session.session_id, stream_id)
+        summary = latest_run_summary(session.session_id, stream_id)
+        if summary.get('terminal'):
+            return False
+        try:
+            return (not path.exists()) or path.stat().st_size == 0
+        except OSError:
+            return True
+    except Exception:
+        logger.debug(
+            "Session %s: failed to classify journal visibility for stream %s",
+            getattr(session, 'session_id', '?'),
+            stream_id,
+            exc_info=True,
+        )
+        return False
+
+
 def _append_journaled_partial_output(
    session,
    stream_id: str | None,
@@ -1003,7 +1111,9 @@ def _append_journaled_partial_output(
                anchor_idx = ensure_assistant_anchor(created_at)
            name = str(payload.get('name') or 'tool')
            preview = str(payload.get('preview') or '')
-            if dedupe_existing and _journal_tool_already_present(session, name, preview):
+            if dedupe_existing and _journal_tool_already_present(
+                session, name, preview, stream_id=stream_id,
+            ):
                current_assistant_idx = anchor_idx
                continue
            recovered_tool_calls.append({
@@ -1045,6 +1155,273 @@ def _append_journaled_partial_output(
    return appended_any


+# ── Lazy run-journal recovery (read-side self-heal) ─────────────────────────
+#
+# When sidecar repair runs before the run-journal for the dead stream is
+# visible on disk (page-cache loss on WSL2 9p / DrvFs, an un-fsynced journal
+# tail, a slow network FS, …), `_append_journaled_partial_output` returns
+# False even though the journaled events will appear on disk shortly. Without
+# the helpers below the repair path baked a permanent "no agent output was
+# recovered" claim into the marker, and a later session read could never
+# correct it.
+#
+# The contract is:
+#
+#   * Sidecar repair (`_apply_core_sync_or_error_marker`) writes a marker
+#     with `_pending_journal_recovery=True` whenever it could not recover
+#     visible output AND the stream id is known. Three retry-meta keys go
+#     onto the marker: `_journal_retry_stream_id`, `_journal_retry_attempts`,
+#     `_journal_retry_first_seen_ts`.
+#   * Every `get_session()` call that returns the full session checks the
+#     latest assistant marker; if the flag is set it re-runs
+#     `_append_journaled_partial_output` with `dedupe_existing=True`. On
+#     success the marker is promoted in place to the recovered-output
+#     wording, the journaled rows are reordered to sit above the marker,
+#     and all retry meta is stripped. If the journal is still missing or
+#     zero-byte, the retry is a no-op and does not consume attempt budget.
+#     Terminal/non-useful journals consume attempt budget and can demote
+#     immediately at the max-attempt cap.
+#   * After `_JOURNAL_RETRY_MAX_ATTEMPTS` failed retries or
+#     `_JOURNAL_RETRY_GIVEUP_SECONDS` of wall-clock age, the marker is
+#     demoted to the neutral wording ("Partial output may have been lost.")
+#     so users do not see "reload to retry" prompts forever.
+_JOURNAL_RETRY_MAX_ATTEMPTS = 12
+_JOURNAL_RETRY_GIVEUP_SECONDS = 24 * 3600
+_JOURNAL_RETRY_LOCKS: dict[str, threading.Lock] = {}
+_JOURNAL_RETRY_LOCKS_GUARD = threading.Lock()
+
+
+def _journal_retry_lock_for_sid(sid: str) -> threading.Lock:
+    with _JOURNAL_RETRY_LOCKS_GUARD:
+        return _JOURNAL_RETRY_LOCKS.setdefault(str(sid), threading.Lock())
+
+
+def _build_recovery_marker_with_retry_hook(
+    *, recovered_output: bool, stream_id: str | None,
+) -> dict:
+    """Build an interrupted-turn marker, arming the lazy-retry hook when
+    visible output was not recovered yet but a stream id is available."""
+    if recovered_output:
+        return _interrupted_recovery_marker(recovered_output=True)
+    if not stream_id:
+        return _interrupted_recovery_marker(recovered_output=False)
+    marker = _interrupted_recovery_marker(pending_retry=True)
+    marker['_journal_retry_stream_id'] = str(stream_id)
+    marker['_journal_retry_attempts'] = 0
+    marker['_journal_retry_first_seen_ts'] = int(time.time())
+    return marker
+
+
+def _session_has_pending_journal_retry(session) -> bool:
+    """Cheap short-circuit: scan from the tail until the most recent normal
+    assistant turn. Any `_pending_journal_recovery` flag found before then
+    means a retry is queued.
+    """
+    messages = getattr(session, 'messages', None) or []
+    for msg in reversed(messages):
+        if not isinstance(msg, dict):
+            continue
+        if msg.get('_pending_journal_recovery'):
+            return True
+        if msg.get('role') == 'assistant' and not msg.get('_error'):
+            # A normal assistant turn after any pending marker — nothing to
+            # retry above this point.
+            return False
+    return False
+
+
+def _strip_journal_retry_meta(marker: dict) -> None:
+    marker.pop('_pending_journal_recovery', None)
+    marker.pop('_journal_retry_stream_id', None)
+    marker.pop('_journal_retry_attempts', None)
+    marker.pop('_journal_retry_first_seen_ts', None)
+
+
+def _reorder_journal_tail_above_marker(session, marker_idx: int) -> None:
+    """Move `_recovered_from_run_journal=True` rows appended *after*
+    ``marker_idx`` to sit immediately above the marker so chronological
+    order is preserved (journaled output happened during the turn, marker
+    annotates its end).
+    """
+    messages = session.messages
+    if marker_idx < 0 or marker_idx >= len(messages):
+        return
+    tail = messages[marker_idx + 1 :]
+    if not tail:
+        return
+    journaled = [
+        m for m in tail
+        if isinstance(m, dict) and m.get('_recovered_from_run_journal')
+    ]
+    if not journaled:
+        return
+    rest = [
+        m for m in tail
+        if not (isinstance(m, dict) and m.get('_recovered_from_run_journal'))
+    ]
+    marker = messages[marker_idx]
+    new_messages = (
+        messages[:marker_idx]
+        + journaled
+        + [marker]
+        + rest
+    )
+    # Rebase any tool_calls.assistant_msg_idx values that pointed into the
+    # journaled rows when they were appended at the tail.
+    old_journaled_idx_base = marker_idx + 1
+    new_journaled_idx_base = marker_idx
+    shift = new_journaled_idx_base - old_journaled_idx_base  # = -1
+    for tool_call in session.tool_calls or []:
+        if not isinstance(tool_call, dict):
+            continue
+        idx = tool_call.get('assistant_msg_idx')
+        if isinstance(idx, int) and idx >= old_journaled_idx_base \
+                and idx < old_journaled_idx_base + len(journaled):
+            tool_call['assistant_msg_idx'] = idx + shift
+    session.messages = new_messages
+
+
+def _try_retry_journal_recovery_in_place(session) -> bool:
+    sid = str(getattr(session, 'session_id', '') or '')
+    lock = _journal_retry_lock_for_sid(sid)
+    if not lock.acquire(blocking=False):
+        logger.debug("lazy journal-retry already running for session %s", sid)
+        return False
+    try:
+        return _retry_journal_recovery_in_place(
+            session, preserve_arriving_budget=True,
+        )
+    finally:
+        lock.release()
+        with _JOURNAL_RETRY_LOCKS_GUARD:
+            if _JOURNAL_RETRY_LOCKS.get(sid) is lock:
+                _JOURNAL_RETRY_LOCKS.pop(sid, None)
+
+
+def _retry_journal_recovery_in_place(
+    session,
+    *,
+    preserve_arriving_budget: bool = False,
+) -> bool:
+    """Re-attempt run-journal recovery for the most recent pending marker.
+
+    Returns True if the marker was promoted to the recovered-output wording.
+    Never raises — caller is best-effort.
+    """
+    try:
+        messages = session.messages or []
+        for idx in range(len(messages) - 1, -1, -1):
+            msg = messages[idx]
+            if not isinstance(msg, dict):
+                continue
+            if msg.get('role') == 'assistant' and not msg.get('_error') \
+                    and not msg.get('_pending_journal_recovery'):
+                # Walked past the pending marker without finding it.
+                return False
+            if not (
+                msg.get('type') == 'interrupted'
+                and msg.get('_pending_journal_recovery')
+            ):
+                continue
+            stream_id = msg.get('_journal_retry_stream_id')
+            first_seen = msg.get('_journal_retry_first_seen_ts') or 0
+            attempts = int(msg.get('_journal_retry_attempts') or 0)
+            now = time.time()
+            give_up = (
+                attempts >= _JOURNAL_RETRY_MAX_ATTEMPTS
+                or (
+                    first_seen
+                    and now - float(first_seen) > _JOURNAL_RETRY_GIVEUP_SECONDS
+                )
+            )
+            if not stream_id:
+                # No stream id to retry against; demote immediately.
+                msg['content'] = _INTERRUPTED_NEUTRAL_WORDING
+                _strip_journal_retry_meta(msg)
+                try:
+                    session.save(touch_updated_at=False)
+                except Exception:
+                    logger.debug(
+                        "save() failed while demoting marker for session %s",
+                        getattr(session, 'session_id', '?'),
+                        exc_info=True,
+                    )
+                return False
+            if give_up:
+                msg['content'] = _INTERRUPTED_NEUTRAL_WORDING
+                _strip_journal_retry_meta(msg)
+                try:
+                    session.save(touch_updated_at=False)
+                except Exception:
+                    logger.debug(
+                        "save() failed while demoting marker for session %s",
+                        getattr(session, 'session_id', '?'),
+                        exc_info=True,
+                    )
+                return False
+            tail_len_before = len(session.messages)
+            ok = _append_journaled_partial_output(
+                session, stream_id, dedupe_existing=True,
+            )
+            if ok:
+                msg['content'] = _INTERRUPTED_RECOVERED_WORDING
+                _strip_journal_retry_meta(msg)
+                # The journaled rows were appended at the end of messages;
+                # only the rows past the previous tail count as "newly
+                # journaled" and need to move above the marker.
+                _ = tail_len_before  # informational; helper below scans
+                _reorder_journal_tail_above_marker(session, idx)
+                try:
+                    session.save(touch_updated_at=False)
+                except Exception:
+                    logger.debug(
+                        "save() failed while promoting marker for session %s",
+                        getattr(session, 'session_id', '?'),
+                        exc_info=True,
+                    )
+                logger.info(
+                    "Session %s: lazy journal-recovery promoted marker for "
+                    "stream %s after %d attempts",
+                    getattr(session, 'session_id', '?'),
+                    stream_id,
+                    attempts,
+                )
+                return True
+            if (
+                preserve_arriving_budget
+                and _journal_is_still_arriving(session, stream_id)
+            ):
+                logger.debug(
+                    "Session %s: journal for stream %s still arriving; "
+                    "preserving retry budget",
+                    getattr(session, 'session_id', '?'),
+                    stream_id,
+                )
+                return False
+            next_attempts = attempts + 1
+            if next_attempts >= _JOURNAL_RETRY_MAX_ATTEMPTS:
+                msg['content'] = _INTERRUPTED_NEUTRAL_WORDING
+                _strip_journal_retry_meta(msg)
+            else:
+                msg['_journal_retry_attempts'] = next_attempts
+            try:
+                session.save(touch_updated_at=False)
+            except Exception:
+                logger.debug(
+                    "save() failed while updating retry counter for session %s",
+                    getattr(session, 'session_id', '?'),
+                    exc_info=True,
+                )
+            return False
+        return False
+    except Exception:
+        logger.exception(
+            "_retry_journal_recovery_in_place failed for session %s",
+            getattr(session, 'session_id', '?'),
+        )
+        return False
+
+
 def _apply_core_sync_or_error_marker(
    session,
    core_path,
@@ -1112,11 +1489,16 @@ def _apply_core_sync_or_error_marker(
            session,
            stream_id_for_recheck or session.active_stream_id,
        )
+        _stream_id = stream_id_for_recheck or session.active_stream_id
        session.active_stream_id = None
        session.pending_user_message = None
        session.pending_attachments = []
        session.pending_started_at = None
-        session.messages.append(_interrupted_recovery_marker(recovered_output=recovered_output))
+        session.messages.append(
+            _build_recovery_marker_with_retry_hook(
+                recovered_output=recovered_output, stream_id=_stream_id,
+            )
+        )
        session.save(touch_updated_at=touch_updated_at)
        logger.info(
            "Session %s: recovered pending user turn (messages non-empty), added error marker",
@@ -1167,6 +1549,17 @@ def _apply_core_sync_or_error_marker(
                session.messages.append(
                    _interrupted_recovery_marker(recovered_output=True)
                )
+            # NOTE: when the core transcript was synced in but the run journal
+            # is not yet visible, intentionally do NOT append a lazy-retry
+            # marker here. In this branch the canonical history is the core
+            # transcript itself (which has already been written to s.messages
+            # above) and the marker is purely advisory — the existing contract
+            # is "marker only when there is a recovered partial turn to
+            # annotate". Adding a pending-retry marker on every empty-journal
+            # core-sync would surface a spurious "reload to retry" banner on
+            # sessions whose journal is legitimately absent (e.g. archived
+            # streams). The first and third branches handle the lost-response
+            # case where the marker is the only signal the user gets.
            session.save(touch_updated_at=touch_updated_at)
            logger.info(
                "Session %s: synced %d messages from core transcript%s",
@@ -1189,11 +1582,16 @@ def _apply_core_sync_or_error_marker(
        session,
        stream_id_for_recheck or session.active_stream_id,
    )
+    _stream_id = stream_id_for_recheck or session.active_stream_id
    session.active_stream_id = None
    session.pending_user_message = None
    session.pending_attachments = []
    session.pending_started_at = None
-    session.messages.append(_interrupted_recovery_marker(recovered_output=recovered_output))
+    session.messages.append(
+        _build_recovery_marker_with_retry_hook(
+            recovered_output=recovered_output, stream_id=_stream_id,
+        )
+    )
    session.save(touch_updated_at=touch_updated_at)
    logger.info("Session %s: no core transcript found, added error marker", sid)
    return True
@@ -1313,9 +1711,19 @@ def get_session(sid, metadata_only=False):
    actual message history (e.g., for fast sidebar switching).
    """
    with LOCK:
-        if sid in SESSIONS:
+        cached = SESSIONS.get(sid)
+        if cached is not None:
            SESSIONS.move_to_end(sid)  # LRU: mark as recently used
-            return SESSIONS[sid]
+    if cached is not None:
+        if not metadata_only and _session_has_pending_journal_retry(cached):
+            try:
+                _try_retry_journal_recovery_in_place(cached)
+            except Exception:
+                logger.debug(
+                    "lazy journal-retry failed on cache hit for session %s",
+                    sid, exc_info=True,
+                )
+        return cached
    if metadata_only:
        s = Session.load_metadata_only(sid)
        if s:
@@ -1331,6 +1739,18 @@ def get_session(sid, metadata_only=False):
        if not metadata_only:
            try:
                repaired = _repair_stale_pending(s)
+                # If the stale-pending repair did not fire but the session
+                # already carries a pending-journal-retry marker (e.g. set on
+                # a previous repair pass), give the lazy-retry path one
+                # chance to self-heal on this read.
+                if not repaired and _session_has_pending_journal_retry(s):
+                    try:
+                        _try_retry_journal_recovery_in_place(s)
+                    except Exception:
+                        logger.debug(
+                            "lazy journal-retry failed on cold load for session %s",
+                            sid, exc_info=True,
+                        )
                # If repair had to bail because the per-session lock was held,
                # do not pin the still-stale sidecar in the LRU cache forever.
                # Leaving it cached would prevent future get_session() calls from
@@ -85,6 +85,34 @@ If after running steps 1-4 the import still fails *and* `pip install -e .` succe

 ---

+## "Response interrupted." marker keeps saying "no agent output was recovered"
+
+**Symptom.** After the WebUI process restarts mid-turn (manual restart, OOM, crash, …), the affected chat shows an `**Response interrupted.**` marker with the wording *"The user message above was preserved, but no agent output was recovered."*, even though the run-journal for that turn is present on disk and contains the partial tokens the agent had already streamed.
+
+**Why.** Sidecar repair re-checks the run-journal at restart and uses the result as a one-shot signal. On WSL2 (9p / DrvFs) and on some network-backed setups, the run-journal `.jsonl` is written by the dead worker but the WebUI process reads it through a page-cache state that has not yet seen those writes — recovery returns "empty" and the marker is baked permanently. The fix introduces a *lazy* retry path: when sidecar repair cannot read visible output but knows the stream id, it stores a `_pending_journal_recovery` flag on the marker and re-attempts recovery from `get_session()` until the journal becomes readable (or the retry budget is exhausted).
+
+**Diagnostic.**
+
+The on-disk locations below assume the default `~/.hermes/webui` state directory. If you override it via `HERMES_WEBUI_STATE_DIR`, substitute that path for `~/.hermes/webui` in every step.
+
+1. Identify the affected session id and stream id from the marker. The marker JSON lives at `~/.hermes/webui/sessions/<sid>.json`; after the fix it shows them on the `_journal_retry_stream_id` key. Pre-fix sessions only carry the legacy wording, with no retry meta.
+2. Check whether the run-journal contains real events:
+   ```bash
+   ls -la ~/.hermes/webui/sessions/_run_journal/<sid>/<stream_id>.jsonl
+   head -2 ~/.hermes/webui/sessions/_run_journal/<sid>/<stream_id>.jsonl
+   ```
+   If the file exists and contains `token` / `tool` events, the lazy-retry path will pick them up the next time the session is opened.
+
+**Fix.** Reload the session in the browser. On the next `get_session()` call the marker is re-evaluated; if the journaled events are visible on disk the marker promotes to *"The partial output above was recovered from the run journal …"* wording and the journaled assistant text + tool cards land above the marker in chronological order. No manual sidecar editing is required.
+
+**Trigger.** Sidebar metadata polling is intentionally not enough to run this self-heal. Requests such as `/api/session?messages=0&resolve_model=0` load the session with `metadata_only=True`, skip the full messages array, and therefore skip the lazy journal retry helper. Click/open the affected conversation so the message panel performs a full `messages=1` load; that full render is what re-checks the journal and can promote the marker.
+
+**Caps.** The lazy retry path gives up after 12 failed attempts or 24h of wall-clock age, at which point the marker is demoted to a neutral *"Partial output may have been lost."* wording so the "reload to retry" prompt doesn't linger forever for genuinely lost journals.
+
+**When to file a bug.** If, after the fix, you see the lazy-retry wording (*"Recovering the partial output from the run journal — reload this session to retry."*) but reloading the session never promotes it to the recovered wording even though the `.jsonl` clearly contains `token` events, capture the marker JSON and the run-journal file and file a bug.
+
+---
+
 ## Other troubleshooting

 This document grows over time. If a recurring failure mode isn't covered here yet, add it via PR. The format for each entry: **Symptom → Why → Diagnostic commands → Fix → When to file a bug**.
@@ -0,0 +1,339 @@
+"""Regression: lazy-retry run-journal recovery across multiple session reads.
+
+The scenario this test pins down:
+
+1. A WebUI process restarts mid-stream. On the first sidecar repair attempt
+   the run-journal for the dead stream is NOT visible yet (page-cache loss,
+   un-fsynced writes, slow network FS, etc.) so
+   `_append_journaled_partial_output` returns False.
+2. Pre-fix the repair path baked a permanent "no agent output was recovered"
+   marker into the session and never looked at the journal again — even
+   after the journaled tokens appeared on disk on a later read.
+3. With the fix, the repair instead leaves a `_pending_journal_recovery`
+   flag on the marker; the next `get_session()` call lazily re-runs the
+   recovery, promotes the marker wording, and threads the journaled
+   assistant text/tools into the transcript in the correct chronological
+   position.
+"""
+from concurrent.futures import ThreadPoolExecutor
+import threading
+
+import pytest
+
+import api.models as models
+import api.config as config
+import api.profiles as profiles
+import api.streaming as streaming  # noqa: F401  imported for fixture parity
+from api.models import (
+    Session,
+    _apply_core_sync_or_error_marker,
+)
+from api.run_journal import append_run_event
+
+
+# ── Fixtures (shape mirrors test_session_sidecar_repair.py) ────────────────
+
+
+@pytest.fixture(autouse=True)
+def _isolate_session_dir(tmp_path, monkeypatch):
+    session_dir = tmp_path / "sessions"
+    session_dir.mkdir()
+    index_file = session_dir / "_index.json"
+    monkeypatch.setattr(models, "SESSION_DIR", session_dir)
+    monkeypatch.setattr(models, "SESSION_INDEX_FILE", index_file)
+    models.SESSIONS.clear()
+    yield session_dir, index_file
+    models.SESSIONS.clear()
+
+
+@pytest.fixture(autouse=True)
+def _isolate_stream_state():
+    config.STREAMS.clear()
+    config.CANCEL_FLAGS.clear()
+    config.AGENT_INSTANCES.clear()
+    config.STREAM_PARTIAL_TEXT.clear()
+    yield
+    config.STREAMS.clear()
+    config.CANCEL_FLAGS.clear()
+    config.AGENT_INSTANCES.clear()
+    config.STREAM_PARTIAL_TEXT.clear()
+
+
+@pytest.fixture(autouse=True)
+def _isolate_agent_locks():
+    config.SESSION_AGENT_LOCKS.clear()
+    models._JOURNAL_RETRY_LOCKS.clear()
+    yield
+    config.SESSION_AGENT_LOCKS.clear()
+    models._JOURNAL_RETRY_LOCKS.clear()
+
+
+@pytest.fixture()
+def hermes_home(tmp_path, monkeypatch):
+    home = tmp_path / "hermes_home"
+    home.mkdir()
+    (home / "sessions").mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    monkeypatch.setattr(profiles, "_DEFAULT_HERMES_HOME", home)
+    return home
+
+
+def _make_dead_stream_session(
+    session_id: str,
+    *,
+    stream_id: str,
+    existing_msgs_count: int = 96,
+    pending_text: str = (
+        "[IMPORTANT: Background process polling. "
+        "Continue the user's prior request.]"
+    ),
+):
+    """Build a session that mirrors the production bug: lots of prior history,
+    pending_user_message set, an active_stream_id pointing at a dead stream,
+    and pending_started_at populated."""
+    messages = []
+    for i in range(existing_msgs_count // 2):
+        messages.append({"role": "user", "content": f"q{i}"})
+        messages.append({"role": "assistant", "content": f"a{i}"})
+    s = Session(session_id=session_id, title="Lost-response repro", messages=messages)
+    s.pending_user_message = pending_text
+    s.pending_started_at = 1779237637  # production-shaped value
+    s.active_stream_id = stream_id
+    return s
+
+
+def _make_pending_retry_session(session_id: str, *, stream_id: str):
+    s = Session(session_id=session_id, title="Pending retry", messages=[
+        {"role": "user", "content": "q", "timestamp": 1},
+        {
+            "role": "assistant",
+            "content": models._INTERRUPTED_PENDING_RETRY_WORDING,
+            "timestamp": 2,
+            "_error": True,
+            "type": "interrupted",
+            "_pending_journal_recovery": True,
+            "_journal_retry_stream_id": stream_id,
+            "_journal_retry_attempts": 0,
+            "_journal_retry_first_seen_ts": int(models.time.time()),
+        },
+    ])
+    s.save()
+    return s
+
+
+def _assert_retry_meta_removed(marker):
+    assert "_pending_journal_recovery" not in marker
+    assert "_journal_retry_stream_id" not in marker
+    assert "_journal_retry_attempts" not in marker
+    assert "_journal_retry_first_seen_ts" not in marker
+
+
+# ── The regression test ────────────────────────────────────────────────────
+
+
+def test_lost_response_recovered_on_second_read(hermes_home):
+    sid = "9f14583f0e4e4444aaaa111122223333"
+    stream_id = "7c8b4108d52b4aba9af362d3a54f47ac"
+
+    # ── Stage 1: simulate page-cache loss — sidecar repair runs while the
+    # run-journal for this stream is empty/absent on disk.
+    s = _make_dead_stream_session(sid, stream_id=stream_id)
+    s.save()
+    core_path = hermes_home / "sessions" / f"session_{sid}.json"
+
+    result = _apply_core_sync_or_error_marker(
+        s, core_path, stream_id_for_recheck=stream_id,
+    )
+    assert result is True
+
+    # Marker should carry the lazy-retry flag and *not* the permanent
+    # "no agent output was recovered" wording yet.
+    last = s.messages[-1]
+    assert last.get("_error") is True
+    assert last.get("type") == "interrupted"
+    assert last.get("_pending_journal_recovery") is True, (
+        "First repair pass should defer the recovery decision via a "
+        "_pending_journal_recovery flag so a later read can self-heal."
+    )
+    assert last.get("_journal_retry_stream_id") == stream_id
+    assert last.get("_journal_retry_attempts") == 0
+    assert isinstance(last.get("_journal_retry_first_seen_ts"), int)
+    assert "no agent output was recovered" not in last["content"]
+    # pending fields cleared regardless of journal visibility
+    assert s.pending_user_message is None
+    assert s.active_stream_id is None
+    assert s.pending_started_at is None
+
+    # ── Stage 2: the journaled events become visible on disk.
+    append_run_event(sid, stream_id, "token", {"text": "Checking GitHub first."})
+    append_run_event(
+        sid,
+        stream_id,
+        "tool",
+        {
+            "name": "terminal",
+            "preview": "gh pr list --repo nesquena/hermes-webui",
+            "args": {"command": "gh pr list --repo nesquena/hermes-webui"},
+        },
+    )
+    append_run_event(
+        sid,
+        stream_id,
+        "tool_complete",
+        {"name": "terminal", "duration": 1.2, "is_error": False},
+    )
+    append_run_event(
+        sid, stream_id, "token", {"text": " The first PR scan completed."},
+    )
+
+    # Pin session into the LRU cache and call get_session — this is the
+    # production path that triggers lazy retry.
+    models.SESSIONS[sid] = s
+    reloaded = models.get_session(sid)
+    assert reloaded is s
+
+    contents = [m.get("content", "") for m in s.messages]
+    # The marker self-healed:
+    assert any("recovered from the run journal" in c for c in contents), (
+        "After journaled tokens become readable, the marker must promote to "
+        "the recovered-output wording."
+    )
+    assert not any("no agent output was recovered" in c for c in contents)
+
+    # The journaled assistant text and tool card landed BEFORE the marker
+    # so chronological order in the transcript is preserved.
+    marker_idx = next(
+        i for i, m in enumerate(s.messages)
+        if m.get("type") == "interrupted" and m.get("_error")
+    )
+    recovered_msgs = [
+        m for m in s.messages[:marker_idx]
+        if m.get("_recovered_from_run_journal") is True
+    ]
+    assert recovered_msgs, "recovered assistant content must sit above the marker"
+    recovered_text = " ".join(m.get("content", "") for m in recovered_msgs)
+    assert "Checking GitHub first." in recovered_text
+    assert "first PR scan completed" in recovered_text
+
+    # Tool card lives in session.tool_calls and points at one of the
+    # recovered assistant indices.
+    assert s.tool_calls, "journaled tool should be materialized"
+    assert s.tool_calls[-1]["name"] == "terminal"
+    assert s.tool_calls[-1]["done"] is True
+
+    # Flag and meta cleaned up after promotion.
+    promoted = s.messages[marker_idx]
+    assert "_pending_journal_recovery" not in promoted
+    assert "_journal_retry_stream_id" not in promoted
+    assert "_journal_retry_attempts" not in promoted
+    assert "_journal_retry_first_seen_ts" not in promoted
+
+
+def test_concurrent_get_session_serializes_lazy_journal_retry(hermes_home, monkeypatch):
+    sid = "retry_lock_sid"
+    stream_id = "retry_lock_stream"
+    s = _make_pending_retry_session(sid, stream_id=stream_id)
+    models.SESSIONS[sid] = s
+
+    entered = threading.Event()
+    release = threading.Event()
+    counter_lock = threading.Lock()
+    calls = 0
+
+    def slow_retry(session, *, preserve_arriving_budget=False):
+        nonlocal calls
+        with counter_lock:
+            calls += 1
+        entered.set()
+        assert release.wait(timeout=2), "test timed out waiting to release retry body"
+        return False
+
+    monkeypatch.setattr(models, "_retry_journal_recovery_in_place", slow_retry)
+
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        first = executor.submit(models.get_session, sid)
+        assert entered.wait(timeout=2), "first caller did not enter retry helper"
+        second = executor.submit(models.get_session, sid)
+        assert second.result(timeout=2) is s
+        release.set()
+        assert first.result(timeout=2) is s
+
+    assert calls == 1
+
+
+def test_still_arriving_journal_does_not_consume_retry_budget(hermes_home, monkeypatch):
+    sid = "retry_arriving_sid"
+    stream_id = "retry_arriving_stream"
+    s = _make_pending_retry_session(sid, stream_id=stream_id)
+    models.SESSIONS[sid] = s
+
+    monkeypatch.setattr(models, "_append_journaled_partial_output", lambda *a, **kw: False)
+    monkeypatch.setattr(models, "_journal_is_still_arriving", lambda *a, **kw: True)
+
+    for _ in range(20):
+        assert models.get_session(sid) is s
+
+    marker = s.messages[-1]
+    assert marker["_journal_retry_attempts"] == 0
+    assert marker["_pending_journal_recovery"] is True
+    assert marker["content"] == models._INTERRUPTED_PENDING_RETRY_WORDING
+
+
+def test_sealed_empty_journal_consumes_retry_budget_and_demotes_at_max(hermes_home, monkeypatch):
+    sid = "retry_sealed_sid"
+    stream_id = "retry_sealed_stream"
+    s = _make_pending_retry_session(sid, stream_id=stream_id)
+    s.messages[-1]["_journal_retry_attempts"] = models._JOURNAL_RETRY_MAX_ATTEMPTS - 1
+    append_run_event(sid, stream_id, "stream_end", {})
+    models.SESSIONS[sid] = s
+
+    assert models.get_session(sid) is s
+
+    marker = s.messages[-1]
+    assert marker["content"] == models._INTERRUPTED_NEUTRAL_WORDING
+    _assert_retry_meta_removed(marker)
+    assert not any(m.get("_recovered_from_run_journal") for m in s.messages)
+
+
+def test_marker_demotes_after_max_attempts_with_sealed_empty_journal(hermes_home, monkeypatch):
+    sid = "retry_max_sid"
+    stream_id = "retry_max_stream"
+    s = _make_pending_retry_session(sid, stream_id=stream_id)
+    s.messages[-1]["_journal_retry_attempts"] = models._JOURNAL_RETRY_MAX_ATTEMPTS - 1
+    append_run_event(sid, stream_id, "stream_end", {})
+    models.SESSIONS[sid] = s
+
+    assert models.get_session(sid) is s
+
+    marker = s.messages[-1]
+    assert marker["content"] == models._INTERRUPTED_NEUTRAL_WORDING
+    _assert_retry_meta_removed(marker)
+    assert not any(m.get("_recovered_from_run_journal") for m in s.messages)
+
+
+def test_marker_demotes_after_giveup_seconds(hermes_home, monkeypatch):
+    base = 1_779_000_000
+    monkeypatch.setattr(models.time, "time", lambda: base)
+    sid = "retry_age_sid"
+    stream_id = "retry_age_stream"
+    s = _make_pending_retry_session(sid, stream_id=stream_id)
+    s.messages[-1]["_journal_retry_first_seen_ts"] = (
+        base - models._JOURNAL_RETRY_GIVEUP_SECONDS - 1
+    )
+    models.SESSIONS[sid] = s
+
+    append_calls = 0
+
+    def append_should_not_run(*args, **kwargs):
+        nonlocal append_calls
+        append_calls += 1
+        return False
+
+    monkeypatch.setattr(models, "_append_journaled_partial_output", append_should_not_run)
+
+    assert models.get_session(sid) is s
+
+    marker = s.messages[-1]
+    assert marker["content"] == models._INTERRUPTED_NEUTRAL_WORDING
+    _assert_retry_meta_removed(marker)
+    assert append_calls == 0
@@ -313,7 +313,15 @@ class TestDraftRecovery:
        )
        assert "Response interrupted" in content
        assert "WebUI process restarted" in content
-        assert "user message above was preserved" in content
+        # The marker now arms the lazy-retry hook when a stream id is known
+        # ("Recovering the partial output… reload to retry."). The legacy
+        # "user message above was preserved" wording is reserved for the
+        # no-stream-id repair case; the post-retry-give-up case demotes to
+        # the neutral "Partial output may have been lost." wording instead.
+        assert (
+            "user message above was preserved" in content
+            or "Recovering the partial output" in content
+        )
        assert error_msgs[0].get("type") == "interrupted"

    def test_pending_attachments_recovered(self, hermes_home, monkeypatch):
@@ -705,7 +713,14 @@ class TestNonEmptyMessagesPendingCleared:
        assert not any("private scratchpad text" in c for c in contents)
        error_msgs = [m for m in s.messages if m.get("_error")]
        assert len(error_msgs) == 1
-        assert "no agent output was recovered" in error_msgs[0]["content"]
+        # Reasoning-only events do not count as visible output, so the marker
+        # arms the lazy-retry hook (stream id is known, journal may have more
+        # events appear on a later read). The legacy "no agent output was
+        # recovered" wording is now reserved for the no-stream-id case.
+        assert error_msgs[0].get("_pending_journal_recovery") is True
+        assert error_msgs[0].get("_journal_retry_stream_id") == "reasoning_only_stream"
+        assert "no agent output was recovered" not in error_msgs[0]["content"]
+        assert "Recovering the partial output" in error_msgs[0]["content"]

    def test_journal_recovery_keeps_consecutive_tools_on_one_anchor(self, hermes_home, monkeypatch):
        """Consecutive journaled tools without an intervening visible update
@@ -1153,3 +1168,536 @@ class TestCoreSyncMetadata:
        assert len(user_msgs) == 1
        assert user_msgs[0]["content"] == "My question"
        assert user_msgs[0].get("_recovered") is True
+
+
+# ── Lazy run-journal recovery (read-side self-heal) ─────────────────────────
+
+class TestInterruptedRecoveryMarker:
+    """Pure-function tests for _interrupted_recovery_marker(pending_retry=…)."""
+
+    def test_marker_recovered_output_excludes_pending_retry_flag(self):
+        marker = models._interrupted_recovery_marker(recovered_output=True)
+        assert marker["_error"] is True
+        assert marker["type"] == "interrupted"
+        assert "_pending_journal_recovery" not in marker
+        assert "recovered from the run journal" in marker["content"]
+
+    def test_marker_pending_retry_sets_flag_and_wording(self):
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        assert marker.get("_pending_journal_recovery") is True
+        assert "Recovering the partial output" in marker["content"]
+        assert "no agent output was recovered" not in marker["content"]
+
+    def test_marker_recovered_output_beats_pending_retry(self):
+        marker = models._interrupted_recovery_marker(
+            recovered_output=True, pending_retry=True,
+        )
+        assert "_pending_journal_recovery" not in marker
+        assert "recovered from the run journal" in marker["content"]
+
+    def test_marker_default_wording_unchanged_for_no_output_no_retry(self):
+        marker = models._interrupted_recovery_marker()
+        assert "_pending_journal_recovery" not in marker
+        assert "no agent output was recovered" in marker["content"]
+
+
+class TestRetryJournalRecoveryInPlace:
+    """In-place retry helper: promote / increment / give up."""
+
+    def _make_pending_session(self, hermes_home, stream_id="lazy_stream",
+                              attempts=0, first_seen_ts=None):
+        s = _make_session(messages=[
+            {"role": "user", "content": "earlier turn"},
+            {"role": "assistant", "content": "earlier reply"},
+            {"role": "user", "content": "Stuck draft", "_recovered": True},
+        ])
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = stream_id
+        marker["_journal_retry_attempts"] = attempts
+        marker["_journal_retry_first_seen_ts"] = (
+            first_seen_ts if first_seen_ts is not None else int(time.time())
+        )
+        s.messages.append(marker)
+        s.save()
+        return s
+
+    def test_promotes_marker_when_journal_now_available(self, hermes_home, monkeypatch):
+        stream_id = "lazy_stream_promote"
+        s = self._make_pending_session(hermes_home, stream_id=stream_id)
+        marker_before = s.messages[-1]
+
+        append_run_event(s.session_id, stream_id, "token", {"text": "Late tokens."})
+        append_run_event(
+            s.session_id, stream_id, "tool",
+            {"name": "terminal", "preview": "echo", "args": {"cmd": "echo"}},
+        )
+
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is True
+
+        # Marker promoted in place and meta stripped
+        marker_idx = next(
+            i for i, m in enumerate(s.messages)
+            if m.get("type") == "interrupted" and m.get("_error")
+        )
+        promoted = s.messages[marker_idx]
+        assert promoted is marker_before
+        assert "recovered from the run journal" in promoted["content"]
+        assert "_pending_journal_recovery" not in promoted
+        assert "_journal_retry_stream_id" not in promoted
+        assert "_journal_retry_attempts" not in promoted
+        assert "_journal_retry_first_seen_ts" not in promoted
+
+        # Journaled rows reordered ABOVE the marker, preserving order
+        before_marker = s.messages[:marker_idx]
+        recovered = [m for m in before_marker if m.get("_recovered_from_run_journal")]
+        assert recovered, "journaled assistant rows must sit above the marker"
+        assert any("Late tokens." in m.get("content", "") for m in recovered)
+        # tool_call.assistant_msg_idx still points into the valid range
+        for tc in s.tool_calls or []:
+            idx = tc.get("assistant_msg_idx")
+            if isinstance(idx, int):
+                assert 0 <= idx < len(s.messages)
+
+    def test_increments_attempts_when_journal_still_empty(self, hermes_home, monkeypatch):
+        stream_id = "lazy_stream_increment"
+        s = self._make_pending_session(hermes_home, stream_id=stream_id)
+        # No append_run_event — journal stays empty.
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is False
+        marker = s.messages[-1]
+        assert marker.get("_pending_journal_recovery") is True
+        assert marker.get("_journal_retry_attempts") == 1
+        assert "Recovering the partial output" in marker["content"]
+
+    def test_demotes_to_neutral_after_max_attempts(self, hermes_home, monkeypatch):
+        stream_id = "lazy_stream_giveup_attempts"
+        s = self._make_pending_session(
+            hermes_home, stream_id=stream_id,
+            attempts=models._JOURNAL_RETRY_MAX_ATTEMPTS,
+        )
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is False
+        marker = s.messages[-1]
+        assert "_pending_journal_recovery" not in marker
+        assert "_journal_retry_stream_id" not in marker
+        assert "_journal_retry_attempts" not in marker
+        assert "_journal_retry_first_seen_ts" not in marker
+        assert "Partial output may have been lost" in marker["content"]
+        assert "Recovering the partial output" not in marker["content"]
+
+    def test_demotes_to_neutral_after_giveup_seconds(self, hermes_home, monkeypatch):
+        stream_id = "lazy_stream_giveup_age"
+        first_seen = int(time.time()) - (models._JOURNAL_RETRY_GIVEUP_SECONDS + 60)
+        s = self._make_pending_session(
+            hermes_home, stream_id=stream_id, first_seen_ts=first_seen,
+        )
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is False
+        marker = s.messages[-1]
+        assert "_pending_journal_recovery" not in marker
+        assert "Partial output may have been lost" in marker["content"]
+
+    def test_noop_when_no_pending_marker(self, hermes_home, monkeypatch):
+        s = _make_session(messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+        ])
+        s.save()
+        spy = []
+        original = models._append_journaled_partial_output
+
+        def _spy(*a, **kw):
+            spy.append(1)
+            return original(*a, **kw)
+
+        monkeypatch.setattr(models, "_append_journaled_partial_output", _spy)
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is False
+        assert spy == [], "no pending marker → must not call recovery"
+
+    def test_short_circuit_helper_detects_pending_marker(self, hermes_home, monkeypatch):
+        s = _make_session(messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+        ])
+        assert models._session_has_pending_journal_retry(s) is False
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = "abc"
+        marker["_journal_retry_attempts"] = 0
+        marker["_journal_retry_first_seen_ts"] = int(time.time())
+        s.messages.append(marker)
+        assert models._session_has_pending_journal_retry(s) is True
+
+    def test_short_circuit_helper_stops_at_normal_assistant(self, hermes_home, monkeypatch):
+        s = _make_session(messages=[
+            {"role": "user", "content": "x"},
+        ])
+        # An old, already-promoted marker followed by a normal assistant turn —
+        # the pending flag belongs to a prior turn that was healed; helper must
+        # not loop back into it.
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = "abc"
+        marker["_journal_retry_attempts"] = 0
+        marker["_journal_retry_first_seen_ts"] = int(time.time())
+        s.messages.append(marker)
+        s.messages.append({"role": "user", "content": "later"})
+        s.messages.append({"role": "assistant", "content": "later reply"})
+        assert models._session_has_pending_journal_retry(s) is False
+
+
+class TestGetSessionLazyRetryHook:
+    """get_session() must trigger _retry_journal_recovery_in_place on both
+    cache-hit and cold-load paths when a pending marker exists, and skip
+    quickly when nothing is pending."""
+
+    def _make_session_with_pending_marker(self, sid="lazy_get", stream_id="st"):
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "u"},
+            {"role": "assistant", "content": "a"},
+            {"role": "user", "content": "u2", "_recovered": True},
+        ])
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = stream_id
+        marker["_journal_retry_attempts"] = 0
+        marker["_journal_retry_first_seen_ts"] = int(time.time())
+        s.messages.append(marker)
+        return s
+
+    def test_triggers_retry_on_cache_hit(self, hermes_home, monkeypatch):
+        sid = "lazy_get_cache"
+        stream_id = "stream_cache"
+        s = self._make_session_with_pending_marker(sid=sid, stream_id=stream_id)
+        s.save()
+        models.SESSIONS[sid] = s
+        append_run_event(sid, stream_id, "token", {"text": "Late."})
+
+        reloaded = models.get_session(sid)
+        assert reloaded is s
+        marker = s.messages[-1] if "Recovering" in s.messages[-1]["content"] else next(
+            m for m in s.messages
+            if m.get("type") == "interrupted" and m.get("_error")
+        )
+        assert "recovered from the run journal" in marker["content"]
+        assert "_pending_journal_recovery" not in marker
+
+    def test_triggers_retry_on_cold_load(self, hermes_home, monkeypatch):
+        sid = "lazy_get_cold"
+        stream_id = "stream_cold"
+        s = self._make_session_with_pending_marker(sid=sid, stream_id=stream_id)
+        s.save()
+        models.SESSIONS.pop(sid, None)
+        append_run_event(sid, stream_id, "token", {"text": "Late."})
+
+        reloaded = models.get_session(sid)
+        marker = next(
+            m for m in reloaded.messages
+            if m.get("type") == "interrupted" and m.get("_error")
+        )
+        assert "recovered from the run journal" in marker["content"]
+        assert "_pending_journal_recovery" not in marker
+
+    def test_short_circuit_when_no_pending_marker(self, hermes_home, monkeypatch):
+        sid = "lazy_get_no_pending"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+        ])
+        s.save()
+        models.SESSIONS[sid] = s
+        spy = []
+        monkeypatch.setattr(
+            models, "_retry_journal_recovery_in_place",
+            lambda session: spy.append(1) or False,
+        )
+        models.get_session(sid)
+        assert spy == []
+
+    def test_metadata_only_skips_retry(self, hermes_home, monkeypatch):
+        sid = "lazy_get_meta"
+        stream_id = "stream_meta"
+        s = self._make_session_with_pending_marker(sid=sid, stream_id=stream_id)
+        s.save()
+        models.SESSIONS[sid] = s
+        spy = []
+        monkeypatch.setattr(
+            models, "_retry_journal_recovery_in_place",
+            lambda session: spy.append(1) or False,
+        )
+        models.get_session(sid, metadata_only=True)
+        assert spy == [], "metadata_only must skip the lazy-retry helper"
+
+
+class TestLazyRetryBackwardsCompat:
+    """Pre-fix session shapes must continue to work."""
+
+    def test_legacy_marker_without_flag_unchanged(self, hermes_home, monkeypatch):
+        """An old session whose marker carries the legacy 'no agent output'
+        wording (no flag) must not be touched by get_session()."""
+        sid = "legacy_marker_sid"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+            {"role": "user", "content": "later", "_recovered": True},
+        ])
+        legacy = models._interrupted_recovery_marker(recovered_output=False)
+        s.messages.append(legacy)
+        s.save()
+        models.SESSIONS.pop(sid, None)
+        spy = []
+        original = models._append_journaled_partial_output
+        monkeypatch.setattr(
+            models, "_append_journaled_partial_output",
+            lambda *a, **kw: spy.append(1) or original(*a, **kw),
+        )
+        models.get_session(sid)
+        assert spy == [], "legacy marker (no flag) must not re-trigger recovery"
+
+    def test_pending_retry_marker_round_trips_through_session_save_and_load(
+            self, hermes_home, monkeypatch):
+        """All four retry meta keys must survive Session.save() / Session.load()."""
+        sid = "round_trip_sid"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+        ])
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = "abc"
+        marker["_journal_retry_attempts"] = 3
+        marker["_journal_retry_first_seen_ts"] = 1779200000
+        s.messages.append(marker)
+        s.save()
+        models.SESSIONS.pop(sid, None)
+
+        reloaded = Session.load(sid)
+        last = reloaded.messages[-1]
+        assert last.get("_pending_journal_recovery") is True
+        assert last.get("_journal_retry_stream_id") == "abc"
+        assert last.get("_journal_retry_attempts") == 3
+        assert last.get("_journal_retry_first_seen_ts") == 1779200000
+
+
+class TestJournalToolDedupeScoping:
+    """`_journal_tool_already_present` must only collapse against tool cards
+    recovered from the same stream — a repeated tool (e.g. ``terminal: ls``)
+    in a previous turn must NOT pre-empt this turn's recovery."""
+
+    def test_repeated_tool_in_earlier_turn_does_not_block_recovery(self, hermes_home, monkeypatch):
+        sid = "dedupe_scope_sid"
+        stream_id = "dedupe_scope_stream"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "earlier turn"},
+            {"role": "assistant", "content": "earlier reply"},
+            {"role": "user", "content": "later turn", "_recovered": True},
+        ])
+        # Pre-existing tool card from an earlier turn with same (name, preview)
+        # but a different recovered-stream-id. The retry must NOT see this as a
+        # hit when dedupe_existing is asked to scope to ``stream_id``.
+        s.tool_calls = [
+            {
+                "name": "terminal",
+                "preview": "ls",
+                "snippet": "ls",
+                "tid": "old-1",
+                "_recovered_from_run_journal": True,
+                "_recovered_stream_id": "earlier_stream",
+                "done": True,
+            }
+        ]
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = stream_id
+        marker["_journal_retry_attempts"] = 0
+        marker["_journal_retry_first_seen_ts"] = int(time.time())
+        s.messages.append(marker)
+        s.save()
+        append_run_event(sid, stream_id, "token", {"text": "Listing."})
+        append_run_event(
+            sid, stream_id, "tool",
+            {"name": "terminal", "preview": "ls", "args": {"cmd": "ls"}},
+        )
+
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is True
+
+        # Two tool cards now: the old one untouched, plus a new one for this
+        # stream. If dedupe were session-wide the new one would be dropped.
+        scoped = [
+            tc for tc in s.tool_calls
+            if tc.get("_recovered_stream_id") == stream_id
+        ]
+        assert len(scoped) == 1
+        assert scoped[0]["name"] == "terminal"
+        assert scoped[0]["preview"] == "ls"
+        # Old tool card is preserved.
+        assert any(
+            tc.get("_recovered_stream_id") == "earlier_stream"
+            for tc in s.tool_calls
+        )
+
+    def test_untagged_tool_still_matches_for_core_transcript_invariant(self, hermes_home, monkeypatch):
+        """Tool cards without ``_recovered_stream_id`` (live tools, or tools
+        carried over from the core transcript) match regardless of the
+        ``stream_id`` argument. This preserves the "core transcript already
+        contains this tool, don't duplicate it" invariant the original repair
+        path relies on."""
+        s = _make_session(messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+        ])
+        s.tool_calls = [
+            {"name": "terminal", "preview": "ls", "snippet": "ls"},
+        ]
+        # No stream_id → legacy session-wide check returns True.
+        assert models._journal_tool_already_present(s, "terminal", "ls") is True
+        # With a stream_id, untagged tool cards still match (different stream
+        # ids only override the match decision when the existing card itself
+        # is tagged with a stream id that disagrees).
+        assert models._journal_tool_already_present(
+            s, "terminal", "ls", stream_id="some_stream",
+        ) is True
+
+    def test_tagged_tool_with_different_stream_does_not_match(self, hermes_home, monkeypatch):
+        """A tool card tagged with a different recovered_stream_id must NOT
+        be considered a duplicate when the retry is scoped to a different
+        stream."""
+        s = _make_session(messages=[
+            {"role": "user", "content": "x"},
+        ])
+        s.tool_calls = [
+            {
+                "name": "terminal",
+                "preview": "ls",
+                "snippet": "ls",
+                "_recovered_stream_id": "other_stream",
+            },
+        ]
+        assert models._journal_tool_already_present(
+            s, "terminal", "ls", stream_id="this_stream",
+        ) is False
+        # But scoping to the same stream id matches.
+        assert models._journal_tool_already_present(
+            s, "terminal", "ls", stream_id="other_stream",
+        ) is True
+
+
+class TestWslPageCacheRace:
+    """Cover the WSL2 / network-FS shape: read_run_events returns empty / errors
+    first, recovers on a later call."""
+
+    def test_first_read_raises_oserror_second_read_succeeds(self, hermes_home, monkeypatch):
+        sid = "wsl_race_sid"
+        stream_id = "wsl_race_stream"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+        ])
+        s.pending_user_message = "Keep going"
+        s.pending_started_at = time.time() - 120
+        s.active_stream_id = stream_id
+
+        # Simulate first read raising IOError, then succeeding.
+        import api.run_journal as run_journal
+        real = run_journal.read_run_events
+        attempts = {"n": 0}
+
+        def flaky_read(sid_, run_id, **kw):
+            attempts["n"] += 1
+            if attempts["n"] == 1:
+                raise OSError("EIO simulated by test")
+            return real(sid_, run_id, **kw)
+
+        monkeypatch.setattr(run_journal, "read_run_events", flaky_read)
+
+        core_path = hermes_home / "sessions" / f"session_{sid}.json"
+        result = _apply_core_sync_or_error_marker(
+            s, core_path, stream_id_for_recheck=stream_id,
+        )
+        assert result is True
+        marker = next(m for m in s.messages if m.get("type") == "interrupted")
+        assert marker.get("_pending_journal_recovery") is True
+
+        # Now write journal events; the next retry call will read them.
+        append_run_event(sid, stream_id, "token", {"text": "Came back."})
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is True
+        marker_after = next(m for m in s.messages if m.get("type") == "interrupted")
+        assert "recovered from the run journal" in marker_after["content"]
+        assert "_pending_journal_recovery" not in marker_after
+
+    def test_journal_grows_between_reads(self, hermes_home, monkeypatch):
+        sid = "wsl_grow_sid"
+        stream_id = "wsl_grow_stream"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+        ])
+        s.pending_user_message = "Keep going"
+        s.pending_started_at = time.time() - 120
+        s.active_stream_id = stream_id
+
+        # First repair pass: nothing visible yet.
+        core_path = hermes_home / "sessions" / f"session_{sid}.json"
+        result = _apply_core_sync_or_error_marker(
+            s, core_path, stream_id_for_recheck=stream_id,
+        )
+        assert result is True
+        marker = next(m for m in s.messages if m.get("type") == "interrupted")
+        assert marker.get("_pending_journal_recovery") is True
+
+        # Journal grows.
+        append_run_event(sid, stream_id, "token", {"text": "Partial 1."})
+        append_run_event(sid, stream_id, "token", {"text": " Partial 2."})
+        append_run_event(
+            sid, stream_id, "tool",
+            {"name": "terminal", "preview": "ls", "args": {"cmd": "ls"}},
+        )
+
+        ok = models._retry_journal_recovery_in_place(s)
+        assert ok is True
+        marker_after = next(m for m in s.messages if m.get("type") == "interrupted")
+        assert "recovered from the run journal" in marker_after["content"]
+        # Both tokens recovered, in order, before the marker.
+        marker_idx = s.messages.index(marker_after)
+        recovered_text = " ".join(
+            m.get("content", "") for m in s.messages[:marker_idx]
+            if m.get("_recovered_from_run_journal")
+        )
+        assert "Partial 1." in recovered_text and "Partial 2." in recovered_text
+
+    def test_concurrent_get_session_calls_idempotent(self, hermes_home, monkeypatch):
+        sid = "wsl_concurrent_sid"
+        stream_id = "wsl_concurrent_stream"
+        s = _make_session(session_id=sid, messages=[
+            {"role": "user", "content": "x"},
+            {"role": "assistant", "content": "y"},
+            {"role": "user", "content": "later", "_recovered": True},
+        ])
+        marker = models._interrupted_recovery_marker(pending_retry=True)
+        marker["_journal_retry_stream_id"] = stream_id
+        marker["_journal_retry_attempts"] = 0
+        marker["_journal_retry_first_seen_ts"] = int(time.time())
+        s.messages.append(marker)
+        s.save()
+        append_run_event(sid, stream_id, "token", {"text": "ConcurrentTokens"})
+        models.SESSIONS[sid] = s
+
+        results = []
+
+        def _worker():
+            try:
+                models.get_session(sid)
+                results.append("ok")
+            except Exception as exc:  # noqa: BLE001
+                results.append(exc)
+
+        threads = [threading.Thread(target=_worker, daemon=True) for _ in range(2)]
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join(timeout=5)
+
+        assert results.count("ok") == 2, f"both workers must succeed: {results}"
+
+        # Exactly one interrupted marker remains, exactly one journal-recovered
+        # body (deduped by dedupe_existing=True).
+        interrupted_markers = [m for m in s.messages if m.get("type") == "interrupted"]
+        assert len(interrupted_markers) == 1
+        recovered = [m for m in s.messages if m.get("_recovered_from_run_journal")]
+        assert sum(1 for m in recovered if "ConcurrentTokens" in m.get("content", "")) == 1