Merge remote-tracking branch 'origin/bb/gui' into austin/bb/gui

2026-05-21 03:39:54 +00:00 · 2026-05-13 10:18:22 -04:00
parent 6070941eb0 ca2c3d4ab4
commit bf196bb47b
48 changed files with 976 additions and 763 deletions
@@ -94,9 +94,13 @@ RUN cd web && npm run build && \
 # hermes_cli/main.py succeeds (see #18800). /opt/hermes/web is build-time
 # only (HERMES_WEB_DIST points at hermes_cli/web_dist) and is intentionally
 # not chowned here.
+# The .venv MUST be hermes-writable so lazy_deps.py can install platform
+# packages (discord.py, telegram, slack, etc.) at first gateway boot.
+# Without this, `uv pip install` fails with EACCES and all messaging
+# adapters silently fail to load.  See tools/lazy_deps.py.
 USER root
 RUN chmod -R a+rX /opt/hermes && \
-    chown -R hermes:hermes /opt/hermes/ui-tui /opt/hermes/node_modules
+    chown -R hermes:hermes /opt/hermes/.venv /opt/hermes/ui-tui /opt/hermes/node_modules
 # Start as root so the entrypoint can usermod/groupmod + gosu.
 # If HERMES_UID is unset, the entrypoint drops to the default hermes user (10000).

@@ -1305,9 +1305,8 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
            ),
        }
        # Forward cache_control marker when present on the OpenAI-format
-        # tool dict (set by ``mark_tools_for_long_lived_cache``). Anthropic's
-        # tools array supports cache_control on the last tool to cache the
-        # entire schema cross-session.
+        # tool dict. Anthropic's tools array supports cache_control on the
+        # last tool to cache the entire schema cross-session.
        cache_control = t.get("cache_control")
        if isinstance(cache_control, dict):
            anthropic_tool["cache_control"] = dict(cache_control)
@@ -382,7 +382,28 @@ _AI_GATEWAY_HEADERS = {
 # Nous Portal extra_body for product attribution.
 # Callers should pass this as extra_body in chat.completions.create()
 # when the auxiliary client is backed by Nous Portal.
-NOUS_EXTRA_BODY = {"tags": ["product=hermes-agent", "client=aux"]}
+#
+# The tags are computed from agent.portal_tags so the client= marker stays
+# in lockstep with hermes_cli.__version__ across every Portal call site
+# (main loop, aux, compression, web_extract). Do not inline a literal here;
+# see agent/portal_tags.py for the rationale.
+from agent.portal_tags import nous_portal_tags as _nous_portal_tags
+
+
+def _nous_extra_body() -> dict:
+    """Return a fresh Nous Portal ``extra_body`` dict.
+
+    Computed at call time so a hot-reloaded ``hermes_cli.__version__`` is
+    reflected without restarting long-running processes.
+
+    Returns:
+        A newly built dict with a single ``"tags"`` key whose value is the
+        list returned by ``nous_portal_tags()`` for this call.
+    """
+    return {"tags": _nous_portal_tags()}
+
+
+# Backwards-compatible module attribute. Some callers (tests, third-party
+# plugins) read ``NOUS_EXTRA_BODY`` directly; keep it as a snapshot of the
+# tags taken once, when this module is imported. Callers that need the
+# freshest value should call ``_nous_extra_body()`` or import
+# ``nous_portal_tags`` directly.
+NOUS_EXTRA_BODY = _nous_extra_body()

 # Set at resolve time — True if the auxiliary client points to Nous Portal
 auxiliary_is_nous: bool = False
@@ -3437,7 +3458,7 @@ def get_auxiliary_extra_body() -> dict:
    Includes Nous Portal product tags when the auxiliary client is backed
    by Nous Portal. Returns empty dict otherwise.
    """
-    return dict(NOUS_EXTRA_BODY) if auxiliary_is_nous else {}
+    return _nous_extra_body() if auxiliary_is_nous else {}


 def auxiliary_max_tokens_param(value: int) -> dict:
@@ -4026,7 +4047,7 @@ def _build_call_kwargs(
    # Provider-specific extra_body
    merged_extra = dict(extra_body or {})
    if provider == "nous" or auxiliary_is_nous:
-        merged_extra.setdefault("tags", []).extend(NOUS_EXTRA_BODY["tags"])
+        merged_extra.setdefault("tags", []).extend(_nous_portal_tags())
    if merged_extra:
        kwargs["extra_body"] = merged_extra

@@ -0,0 +1,64 @@
+"""Centralized Nous Portal request tags.
+
+Every Hermes request that hits the Nous Portal — main agent loop, auxiliary
+client (compression / titles / vision / web_extract / session_search / etc.),
+and any future code path — must carry the same product-attribution tags so
+Nous can attribute usage to Hermes Agent and bucket it by client release.
+
+Tag shape (sent in OpenAI-compatible ``extra_body['tags']``):
+
+    [
+        "product=hermes-agent",
+        "client=hermes-client-v<__version__>",
+    ]
+
+The version is sourced live from ``hermes_cli.__version__`` so it auto-aligns
+to whatever release is installed; the release script
+(``scripts/release.py``) regex-bumps that single string, and every Portal
+request picks up the new tag on the next process start.
+
+Why one helper instead of inlining the literal at each site:
+* Four call sites (main loop profile, aux client, run_agent compression
+  fallback, web_tools fallback) used to drift apart — see PR #24194 which
+  only got the aux site, leaving the main loop sending a different tag set.
+* Tests should assert the same tag list everywhere; centralizing makes that
+  assertion a one-liner against this module.
+
+Do NOT pre-compute these as module-level constants in the consumers. The
+version can change at runtime (editable installs, hot-reload tooling), and
+``hermes_cli.__version__`` is the canonical source of truth.
+"""
+
+from __future__ import annotations
+
+from typing import List
+
+
+def _hermes_version() -> str:
+    """Return the current Hermes release version, e.g. ``"0.13.0"``.
+
+    The import statement runs inside the function, so
+    ``hermes_cli.__version__`` is looked up on every call rather than being
+    captured once when this module is imported.
+
+    Returns:
+        The value of ``hermes_cli.__version__``, or ``"unknown"`` when the
+        import raises any ``Exception`` (should never happen in a real
+        install — guarded for defensive testing).
+    """
+    try:
+        from hermes_cli import __version__
+        return __version__
+    except Exception:
+        # Broad catch: any exception raised by the import above degrades to
+        # the placeholder instead of propagating to the caller.
+        return "unknown"
+
+
+def hermes_client_tag() -> str:
+    """Return the ``client=...`` tag for Nous Portal requests.
+
+    Format: ``client=hermes-client-v<version>``, where ``<version>`` is
+    whatever ``_hermes_version()`` returns — normally
+    ``<MAJOR>.<MINOR>.<PATCH>``, or ``unknown`` when the version lookup
+    falls back (yielding ``client=hermes-client-vunknown``).
+    """
+    return f"client=hermes-client-v{_hermes_version()}"
+
+
+def nous_portal_tags() -> List[str]:
+    """Return the canonical list of Nous Portal product tags.
+
+    Always returns a fresh list so callers can mutate it freely
+    (e.g. ``merged_extra.setdefault("tags", []).extend(nous_portal_tags())``).
+
+    Returns:
+        A two-element list: ``"product=hermes-agent"`` followed by the
+        ``client=...`` tag produced by ``hermes_client_tag()``.
+    """
+    return ["product=hermes-agent", hermes_client_tag()]
@@ -268,7 +268,7 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = (

 # Model name substrings that trigger tool-use enforcement guidance.
 # Add new patterns here when a model family needs explicit steering.
-TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok")
+TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm")

 # OpenAI GPT/Codex-specific execution guidance.  Addresses known failure modes
 # where GPT models abandon work on partial results, skip prerequisite lookups,
@@ -1,25 +1,15 @@
-"""Anthropic prompt caching strategies.
+"""Anthropic prompt caching strategy.

-Two layouts:
-
-* ``system_and_3`` (default, used everywhere except the long-lived path):
-  4 cache_control breakpoints — system prompt + last 3 non-system messages.
-  All at the same TTL (5m or 1h). Reduces input token costs by ~75% on
-  multi-turn conversations within a single session.
-
-* ``prefix_and_2`` (Claude on Anthropic / OpenRouter / Nous Portal):
-  4 breakpoints split across two TTL tiers — tools[-1] (1h) +
-  stable system prefix (1h) + last 2 non-system messages (5m). The
-  long-lived prefix is byte-stable across sessions for a given user
-  config, so every fresh session reads the cached system+tools instead
-  of re-paying for them. Within-session rolling window shrinks from 3
-  messages to 2 to free the breakpoint budget.
+Single layout: ``system_and_3``. 4 cache_control breakpoints — system
+prompt + last 3 non-system messages, all at the same TTL (5m or 1h).
+Reduces input token costs by ~75% on multi-turn conversations within a
+single session.

 Pure functions -- no class state, no AIAgent dependency.
 """

 import copy
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List


 def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = False) -> None:
@@ -87,115 +77,3 @@ def apply_anthropic_cache_control(
        _apply_cache_marker(messages[idx], marker, native_anthropic=native_anthropic)

    return messages
-
-
-def _mark_system_stable_block(
-    messages: List[Dict[str, Any]],
-    long_lived_marker: Dict[str, str],
-) -> bool:
-    """Mark the *first* content block of the system message with the 1h marker.
-
-    The system message is expected to have been split into multiple content
-    blocks beforehand by the caller — block[0] is the cross-session-stable
-    prefix, subsequent blocks carry context files + volatile suffix.
-    Falls back to marking the whole system message as a single block when
-    the message hasn't been split (preserves correctness on the fallback path).
-
-    Returns True when a marker was placed.
-    """
-    if not messages or messages[0].get("role") != "system":
-        return False
-
-    sys_msg = messages[0]
-    content = sys_msg.get("content")
-
-    # Already a list of blocks → mark the first block.
-    if isinstance(content, list) and content:
-        first = content[0]
-        if isinstance(first, dict):
-            first["cache_control"] = long_lived_marker
-            return True
-        return False
-
-    # String content (no split) → cannot place a stable-prefix breakpoint
-    # without changing the byte content.  Caller is responsible for
-    # splitting; if they didn't, fall through to envelope marker so we still
-    # cache *something* for this turn.
-    if isinstance(content, str) and content:
-        sys_msg["content"] = [
-            {"type": "text", "text": content, "cache_control": long_lived_marker}
-        ]
-        return True
-
-    return False
-
-
-def apply_anthropic_cache_control_long_lived(
-    api_messages: List[Dict[str, Any]],
-    long_lived_ttl: str = "1h",
-    rolling_ttl: str = "5m",
-    native_anthropic: bool = False,
-) -> List[Dict[str, Any]]:
-    """Apply prefix_and_2 caching: long-lived stable prefix + rolling window.
-
-    Layout (4 breakpoints total):
-      * Stable system prefix (block[0]) → ``long_lived_ttl`` TTL
-      * Last 2 non-system messages → ``rolling_ttl`` TTL each
-
-    NOTE: this function does NOT mark the tools array. Tools cache_control
-    is attached separately (see ``mark_tools_for_long_lived_cache``) because
-    tools live outside the messages list in the API payload.
-
-    The caller MUST have split the system message into ordered content
-    blocks where block[0] is the cross-session-stable portion. If the system
-    message is still a single string, it is wrapped into a single block and
-    marked — this is correct, just less effective (the volatile suffix is
-    not isolated, so the prefix invalidates per-session).
-
-    Returns:
-        Deep copy of messages with cache_control breakpoints injected.
-    """
-    messages = copy.deepcopy(api_messages)
-    if not messages:
-        return messages
-
-    long_marker = _build_marker(long_lived_ttl)
-    rolling_marker = _build_marker(rolling_ttl)
-
-    placed_prefix = _mark_system_stable_block(messages, long_marker)
-
-    # Reserve 1 breakpoint for the system prefix (when placed); spend the
-    # remaining 3 on the rolling tail.  Anthropic max is 4 total —
-    # tools[-1] (when marked) consumes the 4th, so we cap rolling at 2 here.
-    rolling_budget = 2 if placed_prefix else 3
-    non_sys = [i for i in range(len(messages)) if messages[i].get("role") != "system"]
-    for idx in non_sys[-rolling_budget:]:
-        _apply_cache_marker(messages[idx], rolling_marker, native_anthropic=native_anthropic)
-
-    return messages
-
-
-def mark_tools_for_long_lived_cache(
-    tools: Optional[List[Dict[str, Any]]],
-    long_lived_ttl: str = "1h",
-) -> Optional[List[Dict[str, Any]]]:
-    """Attach cache_control to the last tool in the OpenAI-format tools list.
-
-    Anthropic prefix-cache order is ``tools → system → messages``.  Marking
-    the last tool dict caches the entire tools array (Anthropic's docs:
-    "the marker is placed on the last block you want included in the cached
-    prefix").  Marker is preserved across the OpenAI-wire boundary on
-    OpenRouter and Nous Portal (which proxies to OpenRouter); on native
-    Anthropic the marker is forwarded by ``convert_tools_to_anthropic``.
-
-    Returns a deep copy of the tools list with the marker attached, or the
-    input unchanged when tools is empty/None.  Pure function — does not
-    mutate the input.
-    """
-    if not tools:
-        return tools
-    out = copy.deepcopy(tools)
-    last = out[-1]
-    if isinstance(last, dict):
-        last["cache_control"] = _build_marker(long_lived_ttl)
-    return out
@@ -1,6 +1,6 @@
 import { Button } from '@/components/ui/button'
 import { triggerHaptic } from '@/lib/haptics'
-import { ArrowUp, AudioLines, Loader2, Mic, MicOff, Square } from '@/lib/icons'
+import { ArrowUp, AudioLines, Layers3, Loader2, Mic, MicOff, Square } from '@/lib/icons'
 import { cn } from '@/lib/utils'

 import type { ConversationStatus } from './hooks/use-voice-conversation'
@@ -31,6 +31,7 @@ interface ConversationProps {

 export function ComposerControls({
  busy,
+  busyAction,
  canSubmit,
  conversation,
  disabled,
@@ -40,6 +41,7 @@ export function ComposerControls({
  onDictate
 }: {
  busy: boolean
+  busyAction: 'queue' | 'stop'
  canSubmit: boolean
  conversation: ConversationProps
  disabled: boolean
@@ -74,12 +76,21 @@ export function ComposerControls({
        </Button>
      ) : (
        <Button
-          aria-label={busy ? 'Stop' : 'Send'}
+          aria-label={busy ? (busyAction === 'queue' ? 'Queue message' : 'Stop') : 'Send'}
          className={PRIMARY_ICON_BTN}
          disabled={disabled || !canSubmit}
+          title={busy ? (busyAction === 'queue' ? 'Queue message' : 'Stop') : 'Send'}
          type="submit"
        >
-          {busy ? <span className="block size-3 rounded-[0.1875rem] bg-current" /> : <ArrowUp size={18} />}
+          {busy ? (
+            busyAction === 'queue' ? (
+              <Layers3 size={16} />
+            ) : (
+              <span className="block size-3 rounded-[0.1875rem] bg-current" />
+            )
+          ) : (
+            <ArrowUp size={18} />
+          )}
        </Button>
      )}
    </div>
@@ -13,6 +13,7 @@ import {
 } from 'react'

 import { formatRefValue, hermesDirectiveFormatter } from '@/components/assistant-ui/directive-text'
+import { Button } from '@/components/ui/button'
 import { useMediaQuery } from '@/hooks/use-media-query'
 import { useResizeObserver } from '@/hooks/use-resize-observer'
 import { chatMessageText } from '@/lib/chat-messages'
@@ -20,7 +21,19 @@ import { contextPath } from '@/lib/chat-runtime'
 import { DATA_IMAGE_URL_RE } from '@/lib/embedded-images'
 import { triggerHaptic } from '@/lib/haptics'
 import { cn } from '@/lib/utils'
-import { $composerAttachments, $composerDraft } from '@/store/composer'
+import {
+  $composerAttachments,
+  $composerDraft,
+  clearComposerAttachments,
+  type ComposerAttachment
+} from '@/store/composer'
+import {
+  $queuedPromptsBySession,
+  enqueueQueuedPrompt,
+  removeQueuedPrompt,
+  type QueuedPromptEntry,
+  updateQueuedPrompt
+} from '@/store/composer-queue'
 import { $messages } from '@/store/session'
 import { $threadScrolledUp } from '@/store/thread-scroll'

@@ -41,6 +54,7 @@ import {
  renderComposerContents,
  RICH_INPUT_SLOT
 } from './rich-editor'
+import { QueuePanel } from './queue-panel'
 import { SkinSlashPopover } from './skin-slash-popover'
 import { detectTrigger, extractClipboardImageBlobs, textBeforeCaret, type TriggerState } from './text-utils'
 import { ComposerTriggerPopover } from './trigger-popover'
@@ -53,6 +67,15 @@ const COMPOSER_STACK_BREAKPOINT_PX = 320
 const COMPOSER_FADE_BACKGROUND =
  'linear-gradient(to bottom, transparent, color-mix(in srgb, var(--dt-background) 10%, transparent))'

+interface QueueEditState {
+  attachments: ComposerAttachment[]
+  draft: string
+  entryId: string
+  sessionKey: string
+}
+
+const cloneAttachments = (attachments: ComposerAttachment[]) => attachments.map(a => ({ ...a }))
+
 export function ChatBar({
  busy,
  cwd,
@@ -60,6 +83,7 @@ export function ChatBar({
  focusKey,
  gateway,
  maxRecordingSeconds = 120,
+  queueSessionKey,
  sessionId,
  state,
  onCancel,
@@ -77,12 +101,17 @@ export function ChatBar({
  const aui = useAui()
  const draft = useAuiState(s => s.composer.text)
  const attachments = useStore($composerAttachments)
+  const queuedPromptsBySession = useStore($queuedPromptsBySession)
  const scrolledUp = useStore($threadScrolledUp)
+  const activeQueueSessionKey = queueSessionKey || sessionId || null
+  const queuedPrompts = activeQueueSessionKey ? (queuedPromptsBySession[activeQueueSessionKey] ?? []) : []

  const composerRef = useRef<HTMLFormElement | null>(null)
  const composerSurfaceRef = useRef<HTMLDivElement | null>(null)
  const editorRef = useRef<HTMLDivElement | null>(null)
  const draftRef = useRef(draft)
+  const previousBusyRef = useRef(busy)
+  const drainingQueueRef = useRef(false)
  const urlInputRef = useRef<HTMLInputElement | null>(null)

  const [urlOpen, setUrlOpen] = useState(false)
@@ -91,6 +120,7 @@ export function ChatBar({
  const [voiceConversationActive, setVoiceConversationActive] = useState(false)
  const [tight, setTight] = useState(false)
  const [dragActive, setDragActive] = useState(false)
+  const [queueEdit, setQueueEdit] = useState<QueueEditState | null>(null)
  const dragDepthRef = useRef(0)
  const lastSpokenIdRef = useRef<string | null>(null)

@@ -102,6 +132,8 @@ export function ChatBar({
  const stacked = expanded || narrow || tight
  const hasComposerPayload = draft.trim().length > 0 || attachments.length > 0
  const canSubmit = busy || hasComposerPayload
+  const editingQueuedPrompt = queueEdit ? queuedPrompts.find(entry => entry.id === queueEdit.entryId) ?? null : null
+  const busyAction = busy && hasComposerPayload ? 'queue' : 'stop'
  const showHelpHint = draft === '?'

  const placeholder = disabled ? 'Starting Hermes…' : 'Ask anything'
@@ -463,6 +495,14 @@ export function ChatBar({
  }

  const handleEditorKeyDown = (event: KeyboardEvent<HTMLDivElement>) => {
+    if ((event.metaKey || event.ctrlKey) && !event.altKey && !event.shiftKey && event.key.toLowerCase() === 'k') {
+      event.preventDefault()
+
+      if (!busy) void drainNextQueued()
+
+      return
+    }
+
    if (trigger && triggerItems.length > 0) {
      if (event.key === 'ArrowDown') {
        event.preventDefault()
@@ -499,6 +539,13 @@ export function ChatBar({

    if (event.key === 'Enter' && !event.shiftKey) {
      event.preventDefault()
+
+      if (!busy && !hasComposerPayload && queuedPrompts.length > 0) {
+        void drainNextQueued()
+
+        return
+      }
+
      submitDraft()
    }
  }
@@ -635,10 +682,147 @@ export function ChatBar({
    }
  }

-  const submitDraft = () => {
-    if (busy) {
+  const loadIntoComposer = (text: string, attachments: ComposerAttachment[]) => {
+    draftRef.current = text
+    aui.composer().setText(text)
+    $composerAttachments.set(cloneAttachments(attachments))
+
+    const editor = editorRef.current
+
+    if (editor) {
+      renderComposerContents(editor, text)
+      placeCaretEnd(editor)
+    }
+  }
+
+  const beginQueuedEdit = (entry: QueuedPromptEntry) => {
+    if (!activeQueueSessionKey || queueEdit) return
+
+    setQueueEdit({
+      attachments: cloneAttachments($composerAttachments.get()),
+      draft: draftRef.current,
+      entryId: entry.id,
+      sessionKey: activeQueueSessionKey
+    })
+    loadIntoComposer(entry.text, entry.attachments)
+    triggerHaptic('selection')
+    focusInput()
+  }
+
+  const exitQueuedEdit = (action: 'cancel' | 'save'): boolean => {
+    if (!queueEdit) return false
+
+    if (action === 'save') {
+      const text = draftRef.current
+      const next = cloneAttachments($composerAttachments.get())
+
+      if (!text.trim() && next.length === 0) return false
+
+      const saved = updateQueuedPrompt(queueEdit.sessionKey, queueEdit.entryId, { attachments: next, text })
+      triggerHaptic(saved ? 'success' : 'selection')
+    } else {
      triggerHaptic('cancel')
-      onCancel()
+    }
+
+    loadIntoComposer(queueEdit.draft, queueEdit.attachments)
+    setQueueEdit(null)
+    focusInput()
+
+    return true
+  }
+
+  const queueCurrentDraft = useCallback(() => {
+    if (!activeQueueSessionKey || (!draft.trim() && attachments.length === 0)) return false
+    if (!enqueueQueuedPrompt(activeQueueSessionKey, { text: draft, attachments })) return false
+
+    clearDraft()
+    clearComposerAttachments()
+    triggerHaptic('selection')
+
+    return true
+  }, [activeQueueSessionKey, attachments, draft])
+
+  // All queue drain paths share one lock + send-then-remove sequence.
+  // `pickEntry` lets each caller choose head, by-id, or skip-edited.
+  const runDrain = useCallback(
+    async (pickEntry: (entries: QueuedPromptEntry[]) => QueuedPromptEntry | undefined): Promise<boolean> => {
+      if (drainingQueueRef.current || !activeQueueSessionKey) return false
+
+      const entry = pickEntry(queuedPrompts)
+
+      if (!entry) return false
+
+      drainingQueueRef.current = true
+
+      try {
+        const accepted = await Promise.resolve(onSubmit(entry.text, { attachments: entry.attachments, fromQueue: true }))
+
+        if (accepted === false) return false
+
+        removeQueuedPrompt(activeQueueSessionKey, entry.id)
+
+        return true
+      } finally {
+        drainingQueueRef.current = false
+      }
+    },
+    [activeQueueSessionKey, onSubmit, queuedPrompts]
+  )
+
+  const drainNextQueued = useCallback(
+    () =>
+      runDrain(entries => {
+        const skip = queueEdit?.entryId
+
+        return skip ? entries.find(e => e.id !== skip) : entries[0]
+      }),
+    [queueEdit, runDrain]
+  )
+
+  const sendQueuedNow = useCallback(
+    (id: string) => runDrain(entries => entries.find(e => e.id === id && id !== queueEdit?.entryId)),
+    [queueEdit, runDrain]
+  )
+
+  const interruptAndSendNextQueued = useCallback(async () => {
+    if (queuedPrompts.length === 0) return false
+
+    await Promise.resolve(onCancel())
+
+    return drainNextQueued()
+  }, [drainNextQueued, onCancel, queuedPrompts.length])
+
+  // Auto-drain on busy → false (turn settled).
+  useEffect(() => {
+    const wasBusy = previousBusyRef.current
+    previousBusyRef.current = busy
+
+    if (busy || !wasBusy || queuedPrompts.length === 0) return
+
+    void drainNextQueued()
+  }, [busy, drainNextQueued, queuedPrompts.length])
+
+  // Clean up queue edit when its target disappears (session swap or external delete).
+  useEffect(() => {
+    if (!queueEdit) return
+    if (queueEdit.sessionKey === activeQueueSessionKey && editingQueuedPrompt) return
+
+    loadIntoComposer(queueEdit.draft, queueEdit.attachments)
+    setQueueEdit(null)
+  }, [activeQueueSessionKey, editingQueuedPrompt, queueEdit]) // eslint-disable-line react-hooks/exhaustive-deps
+
+  const submitDraft = () => {
+    if (queueEdit) {
+      exitQueuedEdit('save')
+    } else if (busy) {
+      if (hasComposerPayload) queueCurrentDraft()
+      else if (queuedPrompts.length > 0) void interruptAndSendNextQueued()
+      else {
+        triggerHaptic('cancel')
+        void Promise.resolve(onCancel())
+      }
+    } else if (!hasComposerPayload && queuedPrompts.length > 0) {
+      void drainNextQueued()
    } else if (draft.trim() || attachments.length > 0) {
      const submitted = draft
      triggerHaptic('submit')
@@ -742,6 +926,7 @@ export function ChatBar({
  const controls = (
    <ComposerControls
      busy={busy}
+      busyAction={busyAction}
      canSubmit={canSubmit}
      conversation={{
        active: voiceConversationActive,
@@ -824,6 +1009,22 @@ export function ChatBar({
            />
          )}
          <SkinSlashPopover draft={draft} onSelect={selectSkinSlashCommand} />
+          {activeQueueSessionKey && queuedPrompts.length > 0 && (
+            <div className="relative z-6 mb-1 px-0.5">
+              <QueuePanel
+                busy={busy}
+                editingId={queueEdit?.entryId ?? null}
+                entries={queuedPrompts}
+                onDelete={id => {
+                  if (removeQueuedPrompt(activeQueueSessionKey, id) && queueEdit?.entryId === id) {
+                    exitQueuedEdit('cancel')
+                  }
+                }}
+                onEdit={beginQueuedEdit}
+                onSendNow={id => void sendQueuedNow(id)}
+              />
+            </div>
+          )}
          <div
            className="pointer-events-none absolute inset-0 rounded-[inherit]"
            style={{ background: COMPOSER_FADE_BACKGROUND }}
@@ -871,6 +1072,28 @@ export function ChatBar({
              >
                <VoiceActivity state={voiceActivityState} />
                <VoicePlaybackActivity />
+                {queueEdit && editingQueuedPrompt && (
+                  <div className="flex items-center justify-between gap-2 rounded-lg border border-[color-mix(in_srgb,var(--dt-composer-ring)_32%,transparent)] bg-accent/18 px-2 py-1">
+                    <div className="min-w-0 text-[0.7rem] text-muted-foreground/88">Editing queued turn in composer</div>
+                    <div className="flex shrink-0 items-center gap-1">
+                      <Button
+                        className="h-6 rounded-md px-2 text-[0.68rem]"
+                        onClick={() => exitQueuedEdit('cancel')}
+                        type="button"
+                        variant="ghost"
+                      >
+                        Cancel
+                      </Button>
+                      <Button
+                        className="h-6 rounded-md px-2 text-[0.68rem]"
+                        onClick={() => exitQueuedEdit('save')}
+                        type="button"
+                      >
+                        Save
+                      </Button>
+                    </div>
+                  </div>
+                )}
                {attachments.length > 0 && <AttachmentList attachments={attachments} onRemove={onRemoveAttachment} />}
                <div
                  className={cn(
@@ -0,0 +1,123 @@
+import { useState } from 'react'
+
+import { Button } from '@/components/ui/button'
+import { ArrowUp, ChevronDown, Pencil, Trash2 } from '@/lib/icons'
+import { cn } from '@/lib/utils'
+import type { QueuedPromptEntry } from '@/store/composer-queue'
+
+// Props accepted by QueuePanel.
+interface QueuePanelProps {
+  // While true, each row's "send now" button is disabled.
+  busy: boolean
+  // Id of the entry shown as "Editing in composer", or null when none is.
+  // A non-null value also disables the edit button on every other row.
+  editingId: null | string
+  // Queued turns to list, rendered in array order.
+  entries: QueuedPromptEntry[]
+  // Called with the entry id when a row's delete button is clicked.
+  onDelete: (id: string) => void
+  // Called with the full entry when a row's edit button is clicked.
+  onEdit: (entry: QueuedPromptEntry) => void
+  // Called with the entry id when a row's "send now" button is clicked.
+  onSendNow: (id: string) => void
+}
+
+// One-line label for a queued entry: its trimmed text when non-empty,
+// otherwise 'Attachment-only turn' if it has attachments, else 'Empty turn'.
+const entryPreview = (entry: QueuedPromptEntry) =>
+  entry.text.trim() || (entry.attachments.length > 0 ? 'Attachment-only turn' : 'Empty turn')
+
+/**
+ * Collapsible list of queued turns with per-row edit / send-now / delete actions.
+ *
+ * Renders nothing when `entries` is empty. The list starts expanded; the header
+ * button toggles the local `collapsed` state. Row action buttons are transparent
+ * until the row is hovered or contains focus, except on the row being edited,
+ * where they stay visible.
+ *
+ * NOTE(review): the `setCollapsed` updater names its parameter `open`, but the
+ * value it receives is the previous `collapsed` flag. The header button also
+ * sets no `aria-expanded` attribute, so its expanded/collapsed state is only
+ * conveyed visually by the chevron rotation — consider adding it.
+ */
+export function QueuePanel({ busy, editingId, entries, onDelete, onEdit, onSendNow }: QueuePanelProps) {
+  // Local UI state only; `false` means the entry list is shown.
+  const [collapsed, setCollapsed] = useState(false)
+
+  // Placed after the hook call so the hook runs on every render.
+  if (entries.length === 0) return null
+
+  return (
+    <div className="rounded-2xl border border-border/65 bg-[color-mix(in_srgb,var(--dt-card)_70%,transparent)] py-0.5 shadow-[0_0_0_1px_color-mix(in_srgb,var(--dt-card)_30%,transparent)_inset]">
+      <button
+        className="flex w-full items-center gap-1.5 px-2.5 py-1 text-left text-[0.72rem] font-medium text-muted-foreground/92 transition-colors hover:text-foreground/90"
+        onClick={() => setCollapsed(open => !open)}
+        type="button"
+      >
+        <ChevronDown className={cn('shrink-0 transition-transform', collapsed && '-rotate-90')} size={14} />
+        <span className="truncate">{entries.length} Queued</span>
+      </button>
+
+      {!collapsed && (
+        <div className="space-y-0.5 px-1.5 pb-0.5">
+          {entries.map(entry => {
+            // The row being edited is highlighted, keeps its actions visible,
+            // and cannot be sent; while any row is being edited, the edit
+            // button on every other row is disabled. Delete is always enabled.
+            const isEditing = editingId === entry.id
+            const attachmentsCount = entry.attachments.length
+
+            return (
+              <div
+                className={cn(
+                  'group/queue-row flex items-center gap-1.5 rounded-lg border border-transparent px-1.5 py-1',
+                  'transition-colors duration-300 ease-out hover:bg-(--chrome-action-hover) hover:transition-none',
+                  isEditing && 'border-[color-mix(in_srgb,var(--dt-composer-ring)_40%,transparent)] bg-accent/25'
+                )}
+                key={entry.id}
+              >
+                <span
+                  aria-hidden
+                  className="h-3.5 w-3.5 shrink-0 rounded-full border border-foreground/35 bg-transparent"
+                />
+                <div className="min-w-0 flex-1">
+                  <p className="truncate text-[0.73rem] leading-4 text-foreground/92">{entryPreview(entry)}</p>
+                  {(attachmentsCount > 0 || isEditing) && (
+                    <div className="mt-0.5 flex items-center gap-1.5 text-[0.64rem] text-muted-foreground/75">
+                      {attachmentsCount > 0 && (
+                        <span>
+                          {attachmentsCount} attachment{attachmentsCount === 1 ? '' : 's'}
+                        </span>
+                      )}
+                      {isEditing && (
+                        <span className="text-[color-mix(in_srgb,var(--dt-composer-ring)_78%,var(--muted-foreground))]">
+                          Editing in composer
+                        </span>
+                      )}
+                    </div>
+                  )}
+                </div>
+                <div
+                  className={cn(
+                    'flex shrink-0 items-center gap-0 transition-opacity',
+                    isEditing
+                      ? 'opacity-100'
+                      : 'opacity-0 group-hover/queue-row:opacity-100 group-focus-within/queue-row:opacity-100'
+                  )}
+                >
+                  <Button
+                    aria-label="Edit queued turn"
+                    className="h-5 w-5 rounded-md"
+                    disabled={Boolean(editingId) && !isEditing}
+                    onClick={() => onEdit(entry)}
+                    size="icon-xs"
+                    title="Edit queued turn"
+                    type="button"
+                    variant="ghost"
+                  >
+                    <Pencil size={11} />
+                  </Button>
+                  <Button
+                    aria-label="Send queued turn now"
+                    className="h-5 w-5 rounded-md"
+                    disabled={busy || isEditing}
+                    onClick={() => onSendNow(entry.id)}
+                    size="icon-xs"
+                    title="Send queued turn now"
+                    type="button"
+                    variant="ghost"
+                  >
+                    <ArrowUp size={11} />
+                  </Button>
+                  <Button
+                    aria-label="Delete queued turn"
+                    className="h-5 w-5 rounded-md"
+                    onClick={() => onDelete(entry.id)}
+                    size="icon-xs"
+                    title="Delete queued turn"
+                    type="button"
+                    variant="ghost"
+                  >
+                    <Trash2 size={11} />
+                  </Button>
+                </div>
+              </div>
+            )
+          })}
+        </div>
+      )}
+    </div>
+  )
+}
@@ -1,4 +1,5 @@
 import type { HermesGateway } from '@/hermes'
+import type { ComposerAttachment } from '@/store/composer'

 import type { DroppedFile } from '../hooks/use-composer-actions'

@@ -33,9 +34,10 @@ export interface ChatBarProps {
  maxRecordingSeconds?: number
  state: ChatBarState
  gateway?: HermesGateway | null
+  queueSessionKey?: string | null
  sessionId?: string | null
  cwd?: string | null
-  onCancel: () => void
+  onCancel: () => Promise<void> | void
  onAddContextRef?: (refText: string, label?: string, detail?: string) => void
  onAddUrl?: (url: string) => void
  onAttachImageBlob?: (blob: Blob) => Promise<boolean | void> | boolean | void
@@ -45,7 +47,10 @@ export interface ChatBarProps {
  onPickFolders?: () => void
  onPickImages?: () => void
  onRemoveAttachment?: (id: string) => void
-  onSubmit: (value: string) => Promise<void> | void
+  onSubmit: (
+    value: string,
+    options?: { attachments?: ComposerAttachment[]; fromQueue?: boolean }
+  ) => Promise<boolean> | boolean
  onTranscribeAudio?: (audio: Blob) => Promise<string>
 }

@@ -20,6 +20,7 @@ import { ChevronDown } from '@/lib/icons'
 import { useIncrementalExternalStoreRuntime } from '@/lib/incremental-external-store-runtime'
 import { cn } from '@/lib/utils'
 import { $pinnedSessionIds } from '@/store/layout'
+import type { ComposerAttachment } from '@/store/composer'
 import {
  $activeSessionId,
  $awaitingResponse,
@@ -51,7 +52,7 @@ interface ChatViewProps extends Omit<React.ComponentProps<'div'>, 'onSubmit'> {
  gateway: HermesGateway | null
  onToggleSelectedPin: () => void
  onDeleteSelectedSession: () => void
-  onCancel: () => void
+  onCancel: () => Promise<void> | void
  onAddContextRef: (refText: string, label?: string, detail?: string) => void
  onAddUrl: (url: string) => void
  onBranchInNewChat: (messageId: string) => void
@@ -63,7 +64,10 @@ interface ChatViewProps extends Omit<React.ComponentProps<'div'>, 'onSubmit'> {
  onPickFolders: () => void
  onPickImages: () => void
  onRemoveAttachment: (id: string) => void
-  onSubmit: (text: string) => Promise<void> | void
+  onSubmit: (
+    text: string,
+    options?: { attachments?: ComposerAttachment[]; fromQueue?: boolean }
+  ) => Promise<boolean> | boolean
  onThreadMessagesChange: (messages: readonly ThreadMessage[]) => void
  onEdit: (message: AppendMessage) => Promise<void>
  onReload: (parentId: string | null) => Promise<void>
@@ -311,6 +315,7 @@ export function ChatView({
                onRemoveAttachment={onRemoveAttachment}
                onSubmit={onSubmit}
                onTranscribeAudio={onTranscribeAudio}
+                queueSessionKey={selectedSessionId || activeSessionId}
                sessionId={activeSessionId}
                state={chatBarState}
              />
@@ -472,7 +472,7 @@ export function DesktopController() {
      onAttachDroppedItems={composer.attachDroppedItems}
      onAttachImageBlob={composer.attachImageBlob}
      onBranchInNewChat={messageId => void branchInNewChat(messageId)}
-      onCancel={() => void cancelRun()}
+      onCancel={cancelRun}
      onDeleteSelectedSession={() => {
        if (selectedStoredSessionId) {
          void removeSession(selectedStoredSessionId)
@@ -71,6 +71,11 @@ interface PromptActionsOptions {
  ) => ClientSessionState
 }

+interface SubmitTextOptions {
+  attachments?: ComposerAttachment[] // when set, submitted instead of the composer's current attachments
+  fromQueue?: boolean // NOTE(review): presumably marks a submit replayed from the prompt queue; confirm against callers
+}
+
 function renderCommandsCatalog(catalog: CommandsCatalogLike): string {
  const desktopCatalog = filterDesktopCommandsCatalog(catalog)

@@ -153,7 +158,12 @@ export function usePromptActions({
  )

  const syncImageAttachmentsForSubmit = useCallback(
-    async (sessionId: string, attachments: ComposerAttachment[]) => {
+    async (
+      sessionId: string,
+      attachments: ComposerAttachment[],
+      options: { updateComposerAttachments?: boolean } = {}
+    ) => {
+      const updateComposerAttachments = options.updateComposerAttachments ?? true
      const images = attachments.filter(attachment => attachment.kind === 'image' && attachment.path)

      for (const attachment of images) {
@@ -173,22 +183,25 @@ export function usePromptActions({

        const attachedPath = result.path || attachment.path

-        addComposerAttachment({
-          ...attachment,
-          id: attachment.id,
-          label: attachedPath ? pathLabel(attachedPath) : attachment.label,
-          path: attachedPath,
-          attachedSessionId: sessionId
-        })
+        if (updateComposerAttachments) {
+          addComposerAttachment({
+            ...attachment,
+            id: attachment.id,
+            label: attachedPath ? pathLabel(attachedPath) : attachment.label,
+            path: attachedPath,
+            attachedSessionId: sessionId
+          })
+        }
      }
    },
    [requestGateway]
  )

  const submitPromptText = useCallback(
-    async (rawText: string) => {
+    async (rawText: string, options?: SubmitTextOptions) => {
      const visibleText = rawText.trim()
-      const attachments = $composerAttachments.get()
+      const usingComposerAttachments = !options?.attachments
+      const attachments = options?.attachments ?? $composerAttachments.get()
      const contextRefs = attachments
        .map(a => a.refText)
        .filter(Boolean)
@@ -200,7 +213,7 @@ export function usePromptActions({
        [contextRefs, visibleText].filter(Boolean).join('\n\n') || (hasImage ? 'What do you see in this image?' : '')

      if (!text || busyRef.current) {
-        return
+        return false
      }

      const optimisticId = `user-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
@@ -232,7 +245,7 @@ export function usePromptActions({
            awaitingResponse: true,
            pendingBranchGroup: null,
            sawAssistantPayload: false,
-            interrupted: false
+            interrupted: state.interrupted
          }),
          selectedStoredSessionIdRef.current
        )
@@ -278,7 +291,7 @@ export function usePromptActions({
          releaseBusy()
          notifyError(err, 'Session unavailable')

-          return
+          return false
        }

        if (!sessionId) {
@@ -286,16 +299,21 @@ export function usePromptActions({
          releaseBusy()
          notify({ kind: 'error', title: 'Session unavailable', message: 'Could not create a new session' })

-          return
+          return false
        }

        seedOptimistic(sessionId)
      }

      try {
-        await syncImageAttachmentsForSubmit(sessionId, attachments)
+        await syncImageAttachmentsForSubmit(sessionId, attachments, {
+          updateComposerAttachments: usingComposerAttachments
+        })
        await requestGateway('prompt.submit', { session_id: sessionId, text })
-        clearComposerAttachments()
+
+        if (usingComposerAttachments) clearComposerAttachments()
+
+        return true
      } catch (err) {
        releaseBusy()
        updateSessionState(sessionId, state => ({ ...state, busy: false, awaitingResponse: false }))
@@ -303,10 +321,11 @@ export function usePromptActions({
        if (isProviderSetupError(err)) {
          requestDesktopOnboarding('Add a provider credential before sending your first message.')

-          return
+          return false
        }

        notifyError(err, 'Prompt failed')
+        return false
      }
    },
    [
@@ -477,18 +496,18 @@ export function usePromptActions({
  )

  const submitText = useCallback(
-    async (rawText: string) => {
+    async (rawText: string, options?: SubmitTextOptions) => {
      const visibleText = rawText.trim()
-      const attachments = $composerAttachments.get()
+      const attachments = options?.attachments ?? $composerAttachments.get()

      if (!attachments.length && SLASH_COMMAND_RE.test(visibleText)) {
        triggerHaptic('selection')
        await executeSlashCommand(visibleText)

-        return
+        return true
      }

-      await submitPromptText(rawText)
+      return await submitPromptText(rawText, options)
    },
    [executeSlashCommand, submitPromptText]
  )
@@ -7,6 +7,7 @@ import { type ChatMessage, chatMessageText, toChatMessages } from '@/lib/chat-me
 import { normalizePersonalityValue } from '@/lib/chat-runtime'
 import { embeddedImageUrls, textWithoutEmbeddedImages } from '@/lib/embedded-images'
 import { clearComposerAttachments, clearComposerDraft } from '@/store/composer'
+import { clearQueuedPrompts } from '@/store/composer-queue'
 import { $pinnedSessionIds } from '@/store/layout'
 import { clearNotifications, notify, notifyError } from '@/store/notifications'
 import { requestDesktopOnboarding } from '@/store/onboarding'
@@ -649,6 +650,11 @@ export function useSessionActions({
        }

        await deleteSession(storedSessionId)
+        clearQueuedPrompts(storedSessionId)
+
+        if (closingRuntimeId) {
+          clearQueuedPrompts(closingRuntimeId)
+        }
      } catch (err) {
        if (removed) {
          setSessions(prev => [removed, ...prev])
@@ -95,6 +95,10 @@ function messageContentText(content: unknown): string {
  return Array.isArray(content) ? content.map(partText).join('').trim() : ''
 }

+const INTERRUPTED_ONLY_RE = /^_?\[interrupted\]_?$/i
+// True when the trimmed text is only the "[interrupted]" marker (any case), optionally wrapped in underscores.
+const isInterruptedOnlyMessage = (text: string) => INTERRUPTED_ONLY_RE.test(text.trim())
+
 function resetStickyState(state: StickyStateFlags) {
  state.escapedFromLock = false
  state.isAtBottom = true
@@ -368,6 +372,7 @@ const AssistantMessage: FC<{ onBranchInNewChat?: (messageId: string) => void }>

  const messageStatus = useAuiState(s => s.message.status?.type)
  const isPlaceholder = messageStatus === 'running' && content.length === 0
+  const interruptedOnly = useMemo(() => isInterruptedOnlyMessage(messageText), [messageText])

  if (isPlaceholder) {
    return null
@@ -380,7 +385,10 @@ const AssistantMessage: FC<{ onBranchInNewChat?: (messageId: string) => void }>
      data-slot="aui_assistant-message-root"
    >
      <div
-        className="wrap-anywhere min-w-0 max-w-full overflow-hidden text-pretty text-base leading-(--dt-line-height) text-foreground"
+        className={cn(
+          'wrap-anywhere min-w-0 max-w-full overflow-hidden text-pretty text-base leading-(--dt-line-height) text-foreground',
+          interruptedOnly && 'text-[0.8rem] leading-5 text-muted-foreground/82'
+        )}
        data-slot="aui_assistant-message-content"
      >
        {hoistedTodos.length > 0 && <HoistedTodoPanel todos={hoistedTodos} />}
@@ -401,7 +409,7 @@ const AssistantMessage: FC<{ onBranchInNewChat?: (messageId: string) => void }>
          </ErrorPrimitive.Root>
        </MessagePrimitive.Error>
      </div>
-      {messageText.trim().length > 0 && (
+      {messageText.trim().length > 0 && !interruptedOnly && (
        <AssistantFooter messageId={messageId} messageText={messageText} onBranchInNewChat={onBranchInNewChat} />
      )}
    </MessagePrimitive.Root>
@@ -0,0 +1,102 @@
+import { beforeEach, describe, expect, it } from 'vitest'
+
+import type { ComposerAttachment } from './composer'
+import {
+  $queuedPromptsBySession,
+  clearQueuedPrompts,
+  dequeueQueuedPrompt,
+  enqueueQueuedPrompt,
+  getQueuedPrompts,
+  removeQueuedPrompt,
+  updateQueuedPrompt,
+  updateQueuedPromptText
+} from './composer-queue'
+
+const SESSION_KEY = 'session-abc'
+const QUEUE_STORAGE_KEY = 'hermes.desktop.composerQueue.v1'
+
+// Build a minimal attachment fixture: the id doubles as the label and feeds the `@file:` ref text.
+function attachment(id: string, kind: ComposerAttachment['kind'] = 'file'): ComposerAttachment {
+  const label = id
+  const refText = `@file:${id}`
+  const fixture: ComposerAttachment = { id, kind, label, refText }
+
+  return fixture
+}
+
+describe('composer queue store', () => {
+  beforeEach(() => {
+    window.localStorage.removeItem(QUEUE_STORAGE_KEY) // reset persisted state between tests
+    $queuedPromptsBySession.set({}) // reset the in-memory atom between tests
+  })
+
+  it('queues prompts in FIFO order', () => {
+    enqueueQueuedPrompt(SESSION_KEY, { attachments: [], text: 'first' })
+    enqueueQueuedPrompt(SESSION_KEY, { attachments: [], text: 'second' })
+
+    expect(dequeueQueuedPrompt(SESSION_KEY)?.text).toBe('first')
+    expect(dequeueQueuedPrompt(SESSION_KEY)?.text).toBe('second')
+    expect(dequeueQueuedPrompt(SESSION_KEY)).toBeNull() // an empty queue dequeues null
+  })
+
+  it('clones attachments when queueing', () => {
+    const source = [attachment('a-1')]
+    const queued = enqueueQueuedPrompt(SESSION_KEY, { attachments: source, text: 'check clones' })
+
+    expect(queued).not.toBeNull()
+    expect(getQueuedPrompts(SESSION_KEY)[0]?.attachments[0]).toEqual(source[0]) // same value...
+    expect(getQueuedPrompts(SESSION_KEY)[0]?.attachments[0]).not.toBe(source[0]) // ...but a distinct object
+  })
+
+  it('updates and removes queued entries by id', () => {
+    const first = enqueueQueuedPrompt(SESSION_KEY, { attachments: [], text: 'draft one' })
+    const second = enqueueQueuedPrompt(SESSION_KEY, { attachments: [], text: 'draft two' })
+
+    expect(first).not.toBeNull()
+    expect(second).not.toBeNull()
+
+    expect(updateQueuedPromptText(SESSION_KEY, first!.id, 'draft one edited')).toBe(true)
+    expect(getQueuedPrompts(SESSION_KEY).map(entry => entry.text)).toEqual(['draft one edited', 'draft two'])
+
+    expect(removeQueuedPrompt(SESSION_KEY, first!.id)).toBe(true)
+    expect(getQueuedPrompts(SESSION_KEY).map(entry => entry.text)).toEqual(['draft two'])
+  })
+
+  it('updates queued text and attachment snapshot', () => {
+    const first = enqueueQueuedPrompt(SESSION_KEY, { attachments: [attachment('f-1')], text: 'draft one' })
+    const editedAttachments = [attachment('f-2'), attachment('f-3', 'image')]
+
+    expect(first).not.toBeNull()
+    expect(
+      updateQueuedPrompt(SESSION_KEY, first!.id, {
+        attachments: editedAttachments,
+        text: 'edited text'
+      })
+    ).toBe(true)
+
+    const queue = getQueuedPrompts(SESSION_KEY)
+    expect(queue[0]?.text).toBe('edited text')
+    expect(queue[0]?.attachments).toEqual(editedAttachments)
+    expect(queue[0]?.attachments[0]).not.toBe(editedAttachments[0]) // update stores copies as well
+  })
+
+  it('clears queue state for a session', () => {
+    enqueueQueuedPrompt(SESSION_KEY, { attachments: [attachment('img-1', 'image')], text: 'queued' })
+
+    clearQueuedPrompts(SESSION_KEY)
+
+    expect(getQueuedPrompts(SESSION_KEY)).toEqual([])
+    expect($queuedPromptsBySession.get()[SESSION_KEY]).toBeUndefined() // the session key is deleted, not left empty
+    expect(window.localStorage.getItem(QUEUE_STORAGE_KEY)).toBeNull() // no sessions left, so the storage key is removed
+  })
+
+  it('persists queue entries into local storage', () => {
+    enqueueQueuedPrompt(SESSION_KEY, { attachments: [], text: 'persist me' })
+
+    const raw = window.localStorage.getItem(QUEUE_STORAGE_KEY)
+    expect(raw).toBeTruthy()
+
+    const parsed = JSON.parse(String(raw)) as Record<string, { text: string }[]> // stored shape: session key -> entries
+    expect(parsed[SESSION_KEY]?.[0]?.text).toBe('persist me')
+  })
+})
@@ -0,0 +1,158 @@
+import { atom } from 'nanostores'
+
+import type { ComposerAttachment } from './composer'
+
+export interface QueuedPromptEntry {
+  id: string // generated at enqueue time: `queued-<epoch ms>-<random base-36 suffix>`
+  text: string // prompt text exactly as supplied by the caller
+  attachments: ComposerAttachment[] // per-entry copies made when the prompt was queued or updated
+  queuedAt: number // Date.now() at enqueue time (epoch milliseconds)
+}
+
+type QueueState = Record<string, QueuedPromptEntry[]>
+
+const STORAGE_KEY = 'hermes.desktop.composerQueue.v1'
+
+// Restore persisted queues; non-array session values are dropped so corrupted storage cannot break queue spreads.
+const load = (): QueueState => {
+  const state: QueueState = {}
+  if (typeof window === 'undefined') return state
+  try {
+    const parsed: unknown = JSON.parse(window.localStorage.getItem(STORAGE_KEY) ?? 'null')
+    const source = parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}
+    for (const [sid, queue] of Object.entries(source)) if (Array.isArray(queue)) state[sid] = queue
+  } catch { /* unreadable or malformed storage: start with an empty queue state */ }
+  return state
+}
+
+// Mirror the queue state into localStorage (best-effort); an empty state removes the key entirely.
+const save = (state: QueueState) => {
+  if (typeof window === 'undefined') return
+  const hasQueues = Object.keys(state).length > 0
+  try {
+    if (hasQueues) window.localStorage.setItem(STORAGE_KEY, JSON.stringify(state))
+    else window.localStorage.removeItem(STORAGE_KEY)
+  } catch { /* storage may be unavailable; the in-memory queue keeps working */ }
+}
+
+export const $queuedPromptsBySession = atom<QueueState>(load())
+
+// Publish a new queue for one session: copy the state, set or drop the session key, then
+// update the atom and persist. Empty queues delete the key so no empty arrays are stored.
+const writeSession = (sid: string, queue: QueuedPromptEntry[]) => {
+  const next: QueueState = { ...$queuedPromptsBySession.get() }
+  if (queue.length > 0) next[sid] = queue
+  else delete next[sid]
+
+  $queuedPromptsBySession.set(next)
+  save(next)
+}
+
+// Normalize a session key: the trimmed text, or null when the key is missing or blank.
+const sidOf = (key: string | null | undefined): null | string => {
+  const normalized = (key ?? '').trim()
+  return normalized.length > 0 ? normalized : null
+}
+// Current queue for an already-normalized session id; an empty array when none is stored.
+const queueFor = (sid: string) => $queuedPromptsBySession.get()[sid] ?? []
+// Entry id of the form `queued-<epoch ms>-<up to 6 base-36 chars>`; Math.random-based, so not guaranteed unique.
+const nextId = () => `queued-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+// Copy each attachment one level deep (object spread); nested values are still shared with the source.
+const cloneAttachments = (attachments: ComposerAttachment[]) => attachments.map(a => ({ ...a }))
+
+/** Queued prompts for a session key, oldest first; an empty list when the key is missing or blank. */
+export const getQueuedPrompts = (key: string | null | undefined): QueuedPromptEntry[] => {
+  const sid = sidOf(key)
+  return sid === null ? [] : queueFor(sid)
+}
+
+/**
+ * Append a prompt to the tail of a session's queue and return the stored entry.
+ * Attachments are copied one level deep so the queue keeps its own snapshot objects.
+ * Returns null (and stores nothing) when the session key is missing or blank.
+ */
+export const enqueueQueuedPrompt = (
+  key: string | null | undefined,
+  payload: { text: string; attachments: ComposerAttachment[] }
+): null | QueuedPromptEntry => {
+  const sid = sidOf(key)
+  if (sid === null) return null
+
+  const id = nextId()
+  const snapshot = cloneAttachments(payload.attachments)
+  const queued: QueuedPromptEntry = { id, text: payload.text, attachments: snapshot, queuedAt: Date.now() }
+  writeSession(sid, [...queueFor(sid), queued])
+
+  return queued
+}
+
+/**
+ * Remove and return the oldest queued prompt for a session (FIFO).
+ * Returns null when the key is missing or blank, or when the queue is empty.
+ */
+export const dequeueQueuedPrompt = (key: string | null | undefined): null | QueuedPromptEntry => {
+  const sid = sidOf(key)
+  const pending = sid === null ? [] : queueFor(sid)
+  const oldest = pending[0]
+  if (sid === null || !oldest) return null
+
+  writeSession(sid, pending.slice(1))
+  return oldest
+}
+
+/**
+ * Delete the queued prompt with the given id from a session's queue.
+ * Returns true when an entry was removed; false when the key is missing or blank, or no entry matched.
+ */
+export const removeQueuedPrompt = (key: string | null | undefined, id: string): boolean => {
+  const sid = sidOf(key)
+  if (sid === null) return false
+
+  const before = queueFor(sid)
+  const remaining = before.filter(entry => entry.id !== id)
+  const removedAny = remaining.length < before.length
+  if (removedAny) writeSession(sid, remaining)
+  return removedAny
+}
+
+/**
+ * Replace the text (and optionally the attachment snapshot) of one queued prompt.
+ * Supplying `attachments` always counts as a change and stores fresh copies;
+ * a text-only update equal to the stored text is a no-op.
+ * Returns true only when the queue was rewritten (false for a blank key, unknown id, or no-op).
+ */
+export const updateQueuedPrompt = (
+  key: string | null | undefined,
+  id: string,
+  update: { text: string; attachments?: ComposerAttachment[] }
+): boolean => {
+  const sid = sidOf(key)
+  if (sid === null) return false
+
+  const { text, attachments: replacement } = update
+  let touched = false
+
+  const rewritten = queueFor(sid).map(entry => {
+    const isNoop = entry.text === text && !replacement
+    if (entry.id !== id || isNoop) return entry
+
+    touched = true
+    const attachments = replacement ? cloneAttachments(replacement) : entry.attachments
+    return { ...entry, text, attachments }
+  })
+
+  if (touched) writeSession(sid, rewritten)
+
+  return touched
+}
+/** Text-only variant of updateQueuedPrompt: the entry's stored attachments are left untouched. */
+export const updateQueuedPromptText = (key: string | null | undefined, id: string, text: string): boolean =>
+  updateQueuedPrompt(key, id, { text })
+
+/** Drop every queued prompt for a session; does nothing when the key is blank or has no stored queue. */
+export const clearQueuedPrompts = (key: string | null | undefined) => {
+  const sid = sidOf(key)
+  if (sid === null) return
+
+  if (sid in $queuedPromptsBySession.get()) writeSession(sid, [])
+}
@@ -39,6 +39,10 @@ if [ "$(id -u)" = "0" ]; then
        # by the mapped user on the host side.
        chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \
            echo "Warning: chown failed (rootless container?) — continuing anyway"
+        # The .venv must also be re-chowned when UID is remapped, otherwise
+        # lazy_deps.py cannot install platform packages (discord.py, etc.).
+        chown -R hermes:hermes "$INSTALL_DIR/.venv" 2>/dev/null || \
+            echo "Warning: chown .venv failed (rootless container?) — continuing anyway"
    fi

    # Ensure config.yaml is readable by the hermes runtime user even if it was
@@ -446,7 +446,9 @@ class SignalAdapter(BasePlatformAdapter):
                if sent_msg and isinstance(sent_msg, dict):
                    dest = sent_msg.get("destinationNumber") or sent_msg.get("destination")
                    sent_ts = sent_msg.get("timestamp")
-                    if dest == self._account_normalized:
+                    sent_msg_group_info = sent_msg.get("groupInfo") or {}
+                    sent_msg_group_id = sent_msg_group_info.get("groupId") if sent_msg_group_info else None
+                    if dest == self._account_normalized or sent_msg_group_id:
                        # Check if this is an echo of our own outbound reply
                        if sent_ts and sent_ts in self._recent_sent_timestamps:
                            self._recent_sent_timestamps.discard(sent_ts)
@@ -2772,7 +2772,7 @@ class TelegramAdapter(BasePlatformAdapter):
                                    {"thread_id": str(thread_id)},
                                )
                            )
-                        await self._bot.send_message(**send_kwargs)
+                        await self._send_message_with_thread_fallback(**send_kwargs)
                except Exception as exc:
                    logger.error("[%s] slash-confirm callback failed: %s", self.name, exc, exc_info=True)
            return
@@ -345,6 +345,7 @@ class WeComAdapter(BasePlatformAdapter):
                try:
                    await self._open_connection()
                    backoff_idx = 0
+                    self._mark_connected()
                    logger.info("[%s] Reconnected", self.name)
                except Exception as reconnect_exc:
                    logger.warning("[%s] Reconnect failed: %s", self.name, reconnect_exc)
@@ -494,12 +494,15 @@ class WhatsAppAdapter(BasePlatformAdapter):
                # plain executable path.
                _npm_bin = shutil.which("npm") or "npm"
                try:
+                    # Read timeout from environment variable, default to 300 seconds (5 minutes)
+                    # to accommodate slower systems like Unraid NAS
+                    npm_install_timeout = int(os.environ.get("WHATSAPP_NPM_INSTALL_TIMEOUT", "300"))
                    install_result = subprocess.run(
                        [_npm_bin, "install", "--silent"],
                        cwd=str(bridge_dir),
                        capture_output=True,
                        text=True,
-                        timeout=60,
+                        timeout=npm_install_timeout,
                    )
                    if install_result.returncode != 0:
                        print(f"[{self.name}] npm install failed: {install_result.stderr}")
@@ -7543,6 +7543,7 @@ class GatewayRunner:
            hook_ctx = {
                "platform": source.platform.value if source.platform else "",
                "user_id": source.user_id,
+                "chat_id": source.chat_id or "",
                "session_id": session_entry.session_id,
                "message": message_text[:500],
            }
@@ -284,7 +284,7 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
    ),
    "alibaba": ProviderConfig(
        id="alibaba",
-        name="Alibaba Cloud (DashScope)",
+        name="Qwen Cloud",
        auth_type="api_key",
        inference_base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        api_key_env_vars=("DASHSCOPE_API_KEY",),
@@ -735,15 +735,8 @@ DEFAULT_CONFIG = {

    # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
    # cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
-    # long_lived_prefix: when true (default), Claude on Anthropic / OpenRouter / Nous
-    #   Portal uses a split layout: tools[-1] + stable system prefix at long_lived_ttl
-    #   (cross-session cache), last 2 messages at cache_ttl (within-session rolling).
-    #   Set false to keep the legacy "system + last 3 messages" single-tier layout.
-    # long_lived_ttl: TTL for the cross-session prefix tier ("5m" or "1h"; default "1h").
    "prompt_caching": {
        "cache_ttl": "5m",
-        "long_lived_prefix": True,
-        "long_lived_ttl": "1h",
    },

    # OpenRouter-specific settings.
@@ -307,7 +307,7 @@ def judge_goal(
        return "continue", "empty response (nothing to evaluate)", False

    try:
-        from agent.auxiliary_client import get_text_auxiliary_client
+        from agent.auxiliary_client import get_auxiliary_extra_body, get_text_auxiliary_client
    except Exception as exc:
        logger.debug("goal judge: auxiliary client import failed: %s", exc)
        return "continue", "auxiliary client unavailable", False
@@ -336,6 +336,7 @@ def judge_goal(
            temperature=0,
            max_tokens=200,
            timeout=timeout,
+            extra_body=get_auxiliary_extra_body() or None,
        )
    except Exception as exc:
        logger.info("goal judge: API call failed (%s) — falling through to continue", exc)
@@ -155,7 +155,7 @@ def specify_task(
        )

    try:
-        from agent.auxiliary_client import get_text_auxiliary_client
+        from agent.auxiliary_client import get_auxiliary_extra_body, get_text_auxiliary_client
    except Exception as exc:  # pragma: no cover — import smoke test
        logger.debug("specify: auxiliary client import failed: %s", exc)
        return SpecifyOutcome(task_id, False, "auxiliary client unavailable")
@@ -187,6 +187,7 @@ def specify_task(
            temperature=0.3,
            max_tokens=1500,
            timeout=timeout or 120,
+            extra_body=get_auxiliary_extra_body() or None,
        )
    except Exception as exc:
        logger.info(
@@ -908,10 +908,10 @@ CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("lmstudio",       "LM Studio",                "LM Studio (local desktop app with built-in model server)"),
    ProviderEntry("anthropic",      "Anthropic",                "Anthropic (Claude models — API key or Claude Code)"),
    ProviderEntry("openai-codex",   "OpenAI Codex",             "OpenAI Codex"),
+    ProviderEntry("alibaba",        "Qwen Cloud",               "Qwen Cloud / DashScope Coding (Qwen + multi-provider)"),
    ProviderEntry("xiaomi",         "Xiaomi MiMo",              "Xiaomi MiMo (MiMo-V2.5 and V2 models — pro, omni, flash)"),
    ProviderEntry("tencent-tokenhub", "Tencent TokenHub",       "Tencent TokenHub (Hy3 Preview — direct API via tokenhub.tencentmaas.com)"),
    ProviderEntry("nvidia",         "NVIDIA NIM",               "NVIDIA NIM (Nemotron models — build.nvidia.com or local NIM)"),
-    ProviderEntry("qwen-oauth",     "Qwen OAuth (Portal)",      "Qwen OAuth (reuses local Qwen CLI login)"),
    ProviderEntry("copilot",        "GitHub Copilot",           "GitHub Copilot (uses GITHUB_TOKEN or gh auth token)"),
    ProviderEntry("copilot-acp",    "GitHub Copilot ACP",       "GitHub Copilot ACP (spawns `copilot --acp --stdio`)"),
    ProviderEntry("huggingface",    "Hugging Face",             "Hugging Face Inference Providers (20+ open models)"),
@@ -926,7 +926,6 @@ CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("minimax",        "MiniMax",                  "MiniMax (global direct API)"),
    ProviderEntry("minimax-oauth",  "MiniMax (OAuth)",          "MiniMax via OAuth browser login (Coding Plan, minimax.io)"),
    ProviderEntry("minimax-cn",     "MiniMax (China)",          "MiniMax China (domestic direct API)"),
-    ProviderEntry("alibaba",        "Alibaba Cloud (DashScope)","Alibaba Cloud / DashScope Coding (Qwen + multi-provider)"),
    ProviderEntry("ollama-cloud",   "Ollama Cloud",             "Ollama Cloud (cloud-hosted open models — ollama.com)"),
    ProviderEntry("arcee",          "Arcee AI",                 "Arcee AI (Trinity models — direct API)"),
    ProviderEntry("gmi",            "GMI Cloud",                "GMI Cloud (multi-model direct API)"),
@@ -936,6 +935,7 @@ CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("bedrock",        "AWS Bedrock",              "AWS Bedrock (Claude, Nova, Llama, DeepSeek — IAM or API key)"),
    ProviderEntry("azure-foundry",  "Azure Foundry",            "Azure Foundry (OpenAI-style or Anthropic-style endpoint — your Azure AI deployment)"),
    ProviderEntry("ai-gateway",     "Vercel AI Gateway",        "Vercel AI Gateway"),
+    ProviderEntry("qwen-oauth",     "Qwen OAuth (Portal)",      "Qwen OAuth (reuses local Qwen CLI login)"),
 ]

 # Auto-extend CANONICAL_PROVIDERS with any provider registered in providers/
@@ -2,6 +2,7 @@

 from typing import Any

+from agent.portal_tags import nous_portal_tags
 from providers import register_provider
 from providers.base import ProviderProfile

@@ -12,7 +13,7 @@ class NousProfile(ProviderProfile):
    def build_extra_body(
        self, *, session_id: str | None = None, **context
    ) -> dict[str, Any]:
-        return {"tags": ["product=hermes-agent"]}
+        return {"tags": nous_portal_tags()}

    def build_api_kwargs_extras(
        self,
@@ -959,7 +959,7 @@ class LineAdapter(BasePlatformAdapter):
        if chat_type == "dm" and self._client:
            asyncio.create_task(self._client.loading(chat_id))

-        source_obj = self.create_source(
+        source_obj = self.build_source(
            chat_id=chat_id,
            chat_type=chat_type,
            user_id=user_id,
@@ -1454,15 +1454,6 @@ class AIAgent:
        # 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long
        # sessions with >5-minute pauses between turns (#14971).
        self._cache_ttl = "5m"
-        # Long-lived prefix caching: when enabled and supported by the
-        # current provider, splits the system prompt into a stable prefix
-        # (cached cross-session at 1h TTL) and a volatile suffix
-        # (memory/timestamp — never cached), and attaches a 1h cache_control
-        # marker to the last tool in the schema array.  Restricted to
-        # Claude on Anthropic / OpenRouter / Nous Portal; see
-        # ``_supports_long_lived_anthropic_cache``.
-        self._use_long_lived_prefix_cache = False
-        self._long_lived_cache_ttl = "1h"
        try:
            from hermes_cli.config import load_config as _load_pc_cfg

@@ -1470,12 +1461,6 @@ class AIAgent:
            _ttl = _pc_cfg.get("cache_ttl", "5m")
            if _ttl in {"5m", "1h"}:
                self._cache_ttl = _ttl
-            _ll_enabled = _pc_cfg.get("long_lived_prefix", True)
-            _ll_ttl = _pc_cfg.get("long_lived_ttl", "1h")
-            if _ll_ttl in ("5m", "1h"):
-                self._long_lived_cache_ttl = _ll_ttl
-            if _ll_enabled and self._use_prompt_caching and self._supports_long_lived_anthropic_cache():
-                self._use_long_lived_prefix_cache = True
        except Exception:
            pass

@@ -2480,7 +2465,6 @@ class AIAgent:
            "client_kwargs": dict(self._client_kwargs),
            "use_prompt_caching": self._use_prompt_caching,
            "use_native_cache_layout": self._use_native_cache_layout,
-            "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
            # Context engine state that _try_activate_fallback() overwrites.
            # Use getattr for model/base_url/api_key/provider since plugin
            # engines may not have these (they're ContextCompressor-specific).
@@ -2647,6 +2631,11 @@ class AIAgent:
        old_model = self.model
        old_provider = self.provider

+        # Clear the per-config context_length override so the new model's
+        # actual context window is resolved via get_model_context_length()
+        # instead of inheriting the stale value from the previous model.
+        self._config_context_length = None
+
        # ── Swap core runtime fields ──
        self.model = new_model
        self.provider = new_provider
@@ -2711,15 +2700,6 @@ class AIAgent:
                model=new_model,
            )
        )
-        self._use_long_lived_prefix_cache = bool(
-            self._use_prompt_caching
-            and self._supports_long_lived_anthropic_cache(
-                provider=new_provider,
-                base_url=self.base_url,
-                api_mode=api_mode,
-                model=new_model,
-            )
-        )

        # ── LM Studio: preload before probing context length ──
        self._ensure_lmstudio_runtime_loaded()
@@ -2768,7 +2748,6 @@ class AIAgent:
            "client_kwargs": dict(self._client_kwargs),
            "use_prompt_caching": self._use_prompt_caching,
            "use_native_cache_layout": self._use_native_cache_layout,
-            "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
            "compressor_model": getattr(_cc, "model", self.model) if _cc else self.model,
            "compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url,
            "compressor_api_key": getattr(_cc, "api_key", "") if _cc else "",
@@ -3579,73 +3558,6 @@ class AIAgent:

        return False, False

-    def _supports_long_lived_anthropic_cache(
-        self,
-        *,
-        provider: Optional[str] = None,
-        base_url: Optional[str] = None,
-        api_mode: Optional[str] = None,
-        model: Optional[str] = None,
-    ) -> bool:
-        """Decide whether the long-lived (1h cross-session) cache layout applies.
-
-        Narrower than ``_anthropic_prompt_cache_policy`` — only enabled
-        for Claude models on the four endpoints whose cross-session
-        cache_control behavior we have explicitly validated:
-
-          * Native Anthropic API (``api_mode == 'anthropic_messages'`` +
-            host ``api.anthropic.com``)
-          * Anthropic OAuth subscription (same transport as native API)
-          * OpenRouter (``base_url`` contains ``openrouter.ai``)
-          * Nous Portal (``base_url`` contains ``nousresearch`` — proxies
-            to OpenRouter, so identical wire-format)
-
-        All four honour ``cache_control`` on both the tools array and the
-        first system content block, and bill cross-session cache reads at
-        the documented 0.1× rate.
-
-        Other endpoints covered by the standard ``system_and_3`` policy
-        (third-party Anthropic gateways, MiniMax, opencode-go Qwen, etc.)
-        keep that layout — they support cache_control but their behavior
-        with mixed-TTL multi-block system content has not been validated
-        against this codebase.
-        """
-        eff_provider = (provider if provider is not None else self.provider) or ""
-        eff_base_url = base_url if base_url is not None else (self.base_url or "")
-        eff_api_mode = api_mode if api_mode is not None else (self.api_mode or "")
-        eff_model = (model if model is not None else self.model) or ""
-
-        model_lower = eff_model.lower()
-        is_claude = "claude" in model_lower
-        is_nous_portal = "nousresearch" in eff_base_url.lower()
-
-        # Nous Portal: Claude AND Qwen both get long-lived caching.
-        # Portal proxies to OpenRouter with identical cache_control
-        # semantics; any model on Portal that accepts envelope-layout
-        # markers via _anthropic_prompt_cache_policy also benefits from
-        # the documented 1h cross-session TTL.
-        if is_nous_portal and (is_claude or "qwen" in model_lower):
-            return True
-
-        if not is_claude:
-            return False
-
-        # Native Anthropic + Anthropic OAuth subscription
-        if eff_api_mode == "anthropic_messages":
-            if eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com":
-                return True
-
-        # OpenRouter
-        if base_url_host_matches(eff_base_url, "openrouter.ai"):
-            return True
-
-        # Nous Portal — front-ends OpenRouter behind the scenes; identical
-        # wire format and cache_control semantics.
-        if is_nous_portal:
-            return True
-
-        return False
-
    @staticmethod
    def _model_requires_responses_api(model: str) -> bool:
        """Return True for models that require the Responses API path.
@@ -5894,26 +5806,19 @@ class AIAgent:
        """Assemble the system prompt as three ordered parts.

        Returns a dict with three keys:
-          * ``stable``  — content that is byte-stable across sessions for a
-            given user config: identity, tool guidance, skills prompt,
+          * ``stable``   — identity, tool guidance, skills prompt,
            environment hints, platform hints, model-family operational
-            guidance.  Eligible for cross-session 1h prompt caching when
-            placed as a separate Anthropic content block (see
-            ``apply_anthropic_cache_control_long_lived``).
-          * ``context`` — context files (AGENTS.md, .cursorrules, etc.) and
-            caller-supplied system_message.  Stable within a session but may
-            change between sessions when files are edited or the cwd
-            differs.  Cached within-session via the rolling messages
-            breakpoint (5m TTL); not promoted to the long-lived tier so
-            edits don't poison the cross-session cache.
-          * ``volatile`` — content that changes on most turns/sessions:
-            memory snapshot, user profile, external memory provider block,
-            timestamp line.  Never marked for caching.
+            guidance.
+          * ``context``  — context files (AGENTS.md, .cursorrules, etc.)
+            and caller-supplied system_message.
+          * ``volatile`` — memory snapshot, user profile, external
+            memory provider block, timestamp line.

-        Joined ``stable\\n\\ncontext\\n\\nvolatile`` produces the same
-        logical content the old single-string builder produced, with the
-        guarantee that volatile content is at the end (cache-friendly
-        ordering for any provider that does prefix caching).
+        Joined into a single string by ``_build_system_prompt`` and
+        cached on ``_cached_system_prompt`` for the lifetime of the
+        AIAgent.  Hermes never re-renders parts of this string
+        mid-session — that's the only way to keep upstream prompt caches
+        warm across turns.
        """
        # ── Stable tier ────────────────────────────────────────────────
        stable_parts: List[str] = []
@@ -6115,9 +6020,10 @@ class AIAgent:

        Layers are ordered cache-friendly: stable identity/guidance first,
        then session-stable context files, then per-call volatile content
-        (memory, USER profile, timestamp). The split is exposed via
-        ``_build_system_prompt_parts`` for the long-lived prompt-caching
-        path (Claude on Anthropic / OpenRouter / Nous Portal).
+        (memory, USER profile, timestamp).  The whole string is treated as
+        one cached block — Hermes never rebuilds or reinjects parts of it
+        mid-session, which is the only way to keep upstream prompt caches
+        warm across turns.
        """
        parts = self._build_system_prompt_parts(system_message=system_message)
        joined = "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p)
@@ -8817,6 +8723,11 @@ class AIAgent:
                fb_api_mode = "bedrock_converse"

            old_model = self.model
+
+            # Clear the per-config context_length override so the fallback
+            # model's actual context window is resolved instead of inheriting
+            # the stale value from the previous model.  See #22387.
+            self._config_context_length = None
            self.model = fb_model
            self.provider = fb_provider
            self.base_url = fb_base_url
@@ -8879,15 +8790,6 @@ class AIAgent:
                    model=fb_model,
                )
            )
-            self._use_long_lived_prefix_cache = bool(
-                self._use_prompt_caching
-                and self._supports_long_lived_anthropic_cache(
-                    provider=fb_provider,
-                    base_url=fb_base_url,
-                    api_mode=fb_api_mode,
-                    model=fb_model,
-                )
-            )

            # LM Studio: preload before probing the fallback's context length.
            self._ensure_lmstudio_runtime_loaded()
@@ -8964,16 +8866,6 @@ class AIAgent:
                "use_native_cache_layout",
                self.api_mode == "anthropic_messages" and self.provider == "anthropic",
            )
-            # Long-lived prefix flag was added later — restore False on
-            # snapshots predating the new field, then re-evaluate against
-            # the restored provider/model in case the user had it enabled.
-            self._use_long_lived_prefix_cache = rt.get(
-                "use_long_lived_prefix_cache",
-                bool(
-                    self._use_prompt_caching
-                    and self._supports_long_lived_anthropic_cache()
-                ),
-            )

            # ── Rebuild client for the primary provider ──
            if self.api_mode == "anthropic_messages":
@@ -9551,19 +9443,7 @@ class AIAgent:

    def _build_api_kwargs(self, api_messages: list) -> dict:
        """Build the keyword arguments dict for the active API mode."""
-        # Resolve the tools array exactly once. When the long-lived
-        # prefix-cache layout is active (Claude on Anthropic / OpenRouter
-        # / Nous Portal), attach a 1h cache_control marker to the last
-        # tool — this caches the entire tools array cross-session via
-        # Anthropic's tools→system→messages prefix order. The function
-        # returns a deep copy, so self.tools is never mutated.
-        if self._use_long_lived_prefix_cache and self.tools:
-            from agent.prompt_caching import mark_tools_for_long_lived_cache
-            tools_for_api = mark_tools_for_long_lived_cache(
-                self.tools, long_lived_ttl=self._long_lived_cache_ttl,
-            )
-        else:
-            tools_for_api = self.tools
+        tools_for_api = self.tools

        if self.api_mode == "anthropic_messages":
            _transport = self._get_transport()
@@ -11662,7 +11542,8 @@ class AIAgent:
                        "effort": "medium"
                    }
            if _is_nous:
-                summary_extra_body["tags"] = ["product=hermes-agent"]
+                from agent.portal_tags import nous_portal_tags as _portal_tags
+                summary_extra_body["tags"] = _portal_tags()

            if self.api_mode == "codex_responses":
                codex_kwargs = self._build_api_kwargs(api_messages)
@@ -12423,36 +12304,21 @@ class AIAgent:
            # External recall context is injected into the user message, not the system
            # prompt, so the stable cache prefix remains unchanged.
            #
-            # When the long-lived prefix-cache layout is active (Claude on
-            # Anthropic / OpenRouter / Nous Portal), we build the system
-            # message as a *list of content blocks*: [stable, context,
-            # volatile, ephemeral?].  Block 0 (stable) gets the 1h
-            # cache_control marker further down via
-            # apply_anthropic_cache_control_long_lived; blocks 1-3 are
-            # cached only via the rolling messages window at 5m.
            # NOTE: Plugin context from pre_llm_call hooks is injected into the
            # user message (see injection block above), NOT the system prompt.
            # This is intentional — system prompt modifications break the prompt
            # cache prefix.  The system prompt is reserved for Hermes internals.
-            if self._use_long_lived_prefix_cache:
-                _sys_parts = self._build_system_prompt_parts(system_message=system_message)
-                _sys_blocks: list = []
-                if _sys_parts.get("stable"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["stable"]})
-                if _sys_parts.get("context"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["context"]})
-                if _sys_parts.get("volatile"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["volatile"]})
-                if self.ephemeral_system_prompt:
-                    _sys_blocks.append({"type": "text", "text": self.ephemeral_system_prompt})
-                if _sys_blocks:
-                    api_messages = [{"role": "system", "content": _sys_blocks}] + api_messages
-            else:
-                effective_system = active_system_prompt or ""
-                if self.ephemeral_system_prompt:
-                    effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
-                if effective_system:
-                    api_messages = [{"role": "system", "content": effective_system}] + api_messages
+            #
+            # Hermes invariant: the system prompt is built ONCE per session
+            # (cached on ``_cached_system_prompt``) and replayed verbatim on
+            # every turn.  We send it as a single content string so the
+            # system message is byte-stable across turns and upstream
+            # prompt caches stay warm.
+            effective_system = active_system_prompt or ""
+            if self.ephemeral_system_prompt:
+                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
+            if effective_system:
+                api_messages = [{"role": "system", "content": effective_system}] + api_messages

            # Inject ephemeral prefill messages right after the system prompt
            # but before conversation history. Same API-call-time-only pattern.
@@ -12466,29 +12332,13 @@ class AIAgent:
            # gateways. Auto-detected: if ``_use_prompt_caching`` is set,
            # inject cache_control breakpoints (system + last 3 messages)
            # to reduce input token costs by ~75% on multi-turn
-            # conversations. Layout is chosen per endpoint by
-            # ``_anthropic_prompt_cache_policy``.
-            #
-            # Long-lived prefix layout (prefix_and_2): stable system block
-            # gets 1h marker + last 2 messages get 5m markers. Tools
-            # array's last entry is marked separately at API-call kwargs
-            # build time (see ``_build_api_kwargs`` and
-            # ``mark_tools_for_long_lived_cache``).
+            # conversations.
            if self._use_prompt_caching:
-                if self._use_long_lived_prefix_cache:
-                    from agent.prompt_caching import apply_anthropic_cache_control_long_lived
-                    api_messages = apply_anthropic_cache_control_long_lived(
-                        api_messages,
-                        long_lived_ttl=self._long_lived_cache_ttl,
-                        rolling_ttl=self._cache_ttl,
-                        native_anthropic=self._use_native_cache_layout,
-                    )
-                else:
-                    api_messages = apply_anthropic_cache_control(
-                        api_messages,
-                        cache_ttl=self._cache_ttl,
-                        native_anthropic=self._use_native_cache_layout,
-                    )
+                api_messages = apply_anthropic_cache_control(
+                    api_messages,
+                    cache_ttl=self._cache_ttl,
+                    native_anthropic=self._use_native_cache_layout,
+                )

            # Safety net: strip orphaned tool results / add stubs for missing
            # results before sending to the API.  Runs unconditionally — not
@@ -14442,7 +14292,7 @@ class AIAgent:
                            _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                            if _ra_raw:
                                try:
-                                    _retry_after = min(int(_ra_raw), 120)  # Cap at 2 minutes
+                                    _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
                                except (TypeError, ValueError):
                                    pass
                    wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
@@ -890,7 +890,7 @@ clone_repo() {
                stash_name="hermes-install-autostash-$(date -u +%Y%m%d-%H%M%S)"
                log_info "Local changes detected, stashing before update..."
                git stash push --include-untracked -m "$stash_name"
-                autostash_ref="$(git rev-parse --verify refs/stash)"
+                autostash_ref="stash@{0}"
            fi

            git fetch origin
@@ -0,0 +1,61 @@
+"""Tests for agent.portal_tags — Nous Portal request tag contract."""
+
+from __future__ import annotations
+
+
+def test_hermes_client_tag_includes_current_version():
+    """The client tag must reflect hermes_cli.__version__ verbatim."""
+    from hermes_cli import __version__
+    from agent.portal_tags import hermes_client_tag
+
+    assert hermes_client_tag() == f"client=hermes-client-v{__version__}"
+
+
+def test_hermes_client_tag_format():
+    """The client tag has the exact shape Nous Portal expects."""
+    from agent.portal_tags import hermes_client_tag
+
+    tag = hermes_client_tag()
+    assert tag.startswith("client=hermes-client-v")
+    # No spaces, no commas — single tag value
+    assert " " not in tag
+    assert "," not in tag
+
+
+def test_nous_portal_tags_contains_product_and_client():
+    """Every Nous Portal request gets BOTH the product tag and the versioned client tag."""
+    from agent.portal_tags import hermes_client_tag, nous_portal_tags
+
+    tags = nous_portal_tags()
+    assert "product=hermes-agent" in tags
+    assert hermes_client_tag() in tags
+    assert len(tags) == 2
+
+
+def test_nous_portal_tags_returns_fresh_list():
+    """Callers mutate the returned list; we must not share state across calls."""
+    from agent.portal_tags import nous_portal_tags
+
+    a = nous_portal_tags()
+    a.append("client=test-mutation")
+    b = nous_portal_tags()
+    assert "client=test-mutation" not in b
+
+
+def test_auxiliary_client_nous_extra_body_uses_helper():
+    """auxiliary_client.NOUS_EXTRA_BODY must match the canonical helper output."""
+    from agent.auxiliary_client import NOUS_EXTRA_BODY
+    from agent.portal_tags import nous_portal_tags
+
+    assert NOUS_EXTRA_BODY == {"tags": nous_portal_tags()}
+
+
+def test_nous_provider_profile_uses_helper():
+    """The Nous provider profile (main agent loop) must use the canonical tags."""
+    from agent.portal_tags import nous_portal_tags
+    from providers import get_provider_profile
+
+    profile = get_provider_profile("nous")
+    assert profile is not None
+    body = profile.build_extra_body()
+    assert body["tags"] == nous_portal_tags()
@@ -6,8 +6,6 @@ import pytest
 from agent.prompt_caching import (
    _apply_cache_marker,
    apply_anthropic_cache_control,
-    apply_anthropic_cache_control_long_lived,
-    mark_tools_for_long_lived_cache,
 )


@@ -143,132 +141,3 @@ class TestApplyAnthropicCacheControl:
            elif "cache_control" in msg:
                count += 1
        assert count <= 4
-
-
-class TestMarkToolsForLongLivedCache:
-    def test_returns_unchanged_for_empty_tools(self):
-        assert mark_tools_for_long_lived_cache(None) is None
-        assert mark_tools_for_long_lived_cache([]) == []
-
-    def test_marks_only_last_tool(self):
-        tools = [
-            {"type": "function", "function": {"name": "a"}},
-            {"type": "function", "function": {"name": "b"}},
-            {"type": "function", "function": {"name": "c"}},
-        ]
-        out = mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in out[0]
-        assert "cache_control" not in out[1]
-        assert out[2]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in tools[0]
-
-    def test_5m_ttl_drops_ttl_field(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        out = mark_tools_for_long_lived_cache(tools, long_lived_ttl="5m")
-        assert out[0]["cache_control"] == {"type": "ephemeral"}
-
-
-class TestApplyAnthropicCacheControlLongLived:
-    def test_empty_messages(self):
-        assert apply_anthropic_cache_control_long_lived([]) == []
-
-    def test_marks_first_block_of_split_system(self):
-        msgs = [
-            {"role": "system", "content": [
-                {"type": "text", "text": "STABLE"},
-                {"type": "text", "text": "CONTEXT"},
-                {"type": "text", "text": "VOLATILE"},
-            ]},
-            {"role": "user", "content": "msg1"},
-            {"role": "assistant", "content": "msg2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_blocks = out[0]["content"]
-        assert sys_blocks[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-        assert "cache_control" not in sys_blocks[1]
-        assert "cache_control" not in sys_blocks[2]
-
-    def test_rolling_marker_on_last_2_messages(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-            {"role": "user", "content": "u2"},
-            {"role": "assistant", "content": "a2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-
-        def has_marker(m):
-            c = m.get("content")
-            if isinstance(c, list) and c and isinstance(c[-1], dict):
-                return "cache_control" in c[-1]
-            return "cache_control" in m
-
-        # u1 and a1 (older messages) should NOT be marked
-        assert not has_marker(out[1])
-        assert not has_marker(out[2])
-        # u2 and a2 (last 2) SHOULD be marked
-        assert has_marker(out[3])
-        assert has_marker(out[4])
-
-    def test_rolling_marker_uses_5m_ttl(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(
-            msgs, long_lived_ttl="1h", rolling_ttl="5m",
-        )
-        # Last user message: cache_control on the wrapped text part should be 5m
-        last = out[-1]
-        c = last["content"]
-        assert isinstance(c, list)
-        assert c[-1]["cache_control"] == {"type": "ephemeral"}  # 5m has no ttl key
-
-    def test_string_system_falls_back_to_envelope_marker(self):
-        """When the caller didn't split the system message, we still place a marker."""
-        msgs = [
-            {"role": "system", "content": "Single string system"},
-            {"role": "user", "content": "u1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_content = out[0]["content"]
-        # Wrapped into a list and the (now sole) block gets the 1h marker
-        assert isinstance(sys_content, list)
-        assert sys_content[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-        ]
-        before = copy.deepcopy(msgs)
-        apply_anthropic_cache_control_long_lived(msgs)
-        assert msgs == before
-
-    def test_max_4_breakpoints_with_split_system(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}, {"type": "text", "text": "V"}]},
-        ] + [
-            {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg{i}"}
-            for i in range(10)
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        count = 0
-        for m in out:
-            c = m.get("content")
-            if isinstance(c, list):
-                for item in c:
-                    if isinstance(item, dict) and "cache_control" in item:
-                        count += 1
-            elif "cache_control" in m:
-                count += 1
-        # 1 system block + last 2 messages = 3 breakpoints from this function.
-        # tools[-1] is marked separately (not via this function), so a 4th
-        # breakpoint can be added at API-call time.
-        assert count == 3
@@ -1,112 +0,0 @@
-"""Live E2E: long-lived prefix caching on Claude via OpenRouter.
-
-Run only when LIVE_OR_KEY env var is set. Skipped under the normal hermetic
-test suite (which unsets credentials).
-"""
-import os, sys, tempfile, time, shutil, pytest
-
-
-# Probe for the key BEFORE conftest unsets it
-_LIVE_KEY = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("LIVE_OR_KEY")
-if not _LIVE_KEY:
-    # Try to read directly from .env
-    env_path = os.path.expanduser("~/.hermes/.env")
-    if os.path.exists(env_path):
-        with open(env_path) as f:
-            for line in f:
-                if line.startswith("OPENROUTER_API_KEY="):
-                    _LIVE_KEY = line.strip().split("=", 1)[1].strip().strip('"').strip("'")
-                    break
-
-
-pytestmark = pytest.mark.skipif(
-    not _LIVE_KEY,
-    reason="set OPENROUTER_API_KEY (or LIVE_OR_KEY) to run live cache test",
-)
-
-
-def test_long_lived_prefix_cache_e2e_openrouter(tmp_path, monkeypatch):
-    """Two AIAgent runs in fresh sessions: call 1 writes cache, call 2 reads it."""
-    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-    # The hermetic conftest unsets OPENROUTER_API_KEY — restore for this test
-    monkeypatch.setenv("OPENROUTER_API_KEY", _LIVE_KEY)
-
-    # Minimal config — but with enough toolset/guidance to exceed Anthropic's
-    # ~1024-token minimum-cacheable-prefix threshold. Anthropic silently
-    # ignores cache_control markers on small blocks.
-    import yaml
-    cfg_path = tmp_path / "config.yaml"
-    cfg_path.write_text(yaml.safe_dump({
-        "model": {"provider": "openrouter", "default": "anthropic/claude-haiku-4.5"},
-        "prompt_caching": {"long_lived_prefix": True, "long_lived_ttl": "1h", "cache_ttl": "5m"},
-        "agent": {"tool_use_enforcement": True},   # adds substantial guidance text
-        "memory": {"provider": ""},
-        "compression": {"enabled": False},
-    }))
-
-    from run_agent import AIAgent
-
-    def make_agent():
-        return AIAgent(
-            api_key=_LIVE_KEY,
-            base_url="https://openrouter.ai/api/v1",
-            provider="openrouter",
-            model="anthropic/claude-haiku-4.5",
-            api_mode="chat_completions",
-            # Use the default toolset roster — the tools array (~13k tokens
-            # for ~35 tools) is what carries the bulk of the cross-session
-            # cache value. With a tiny toolset the cached prefix can fall
-            # below Anthropic Haiku's 2048-token minimum cacheable size and
-            # the marker is silently ignored.
-            enabled_toolsets=None,
-            quiet_mode=True,
-            skip_context_files=True,
-            skip_memory=True,
-            save_trajectories=False,
-        )
-
-    a1 = make_agent()
-    assert a1._use_prompt_caching is True, "policy should enable caching for Claude on OR"
-    assert a1._use_long_lived_prefix_cache is True, "long-lived path should activate"
-    parts = a1._build_system_prompt_parts()
-    print(f"\nstable={len(parts['stable']):,} ctx={len(parts['context']):,} volatile={len(parts['volatile']):,} chars")
-    print(f"tool count: {len(a1.tools or [])}")
-
-    # Use distinct user messages each call so OpenRouter's response cache
-    # doesn't short-circuit the upstream Anthropic call (we need real
-    # Anthropic billing visibility to verify cache_creation/cache_read).
-    USER_1 = "Reply with the single word ALPHA."
-    USER_2 = "Reply with the single word BRAVO."
-
-    print("\n--- Call 1 (cold) ---")
-    r1 = a1.run_conversation(USER_1, conversation_history=[])
-    print(f"final_response[:80]: {(r1.get('final_response') or '')[:80]!r}")
-    cr1 = a1.session_cache_read_tokens
-    cw1 = a1.session_cache_write_tokens
-    print(f"call1: cache_read={cr1} cache_write={cw1}")
-
-    # Wait so cache settles, then fresh agent (NEW SESSION) for cross-session read
-    time.sleep(2)
-    a2 = make_agent()
-    assert a2.session_id != a1.session_id, "second agent must have a new session"
-
-    print("\n--- Call 2 (warm, NEW session, different user msg) ---")
-    r2 = a2.run_conversation(USER_2, conversation_history=[])
-    print(f"final_response[:80]: {(r2.get('final_response') or '')[:80]!r}")
-    cr2 = a2.session_cache_read_tokens
-    cw2 = a2.session_cache_write_tokens
-    print(f"call2: cache_read={cr2} cache_write={cw2}")
-
-    print(f"\n=== VERDICT ===")
-    print(f"  call1 wrote {cw1:,} cache tokens, read {cr1:,}")
-    print(f"  call2 wrote {cw2:,} cache tokens, read {cr2:,}")
-    if cw1:
-        print(f"  cross-session read fraction: cr2/cw1 = {cr2/cw1:.2%}")
-
-    # Assertions
-    assert cw1 > 0, f"call 1 must write cache (got {cw1}); long-lived layout not reaching wire"
-    assert cr2 > 0, (
-        f"call 2 must read cache cross-session (got {cr2}); "
-        f"stable prefix is not byte-stable across sessions"
-    )
-    assert cr2 >= 1000, f"cache_read on call 2 ({cr2}) too small to indicate real reuse"
@@ -147,11 +147,12 @@ class TestChatCompletionsBuildKwargs:
        ]

    def test_nous_tags(self, transport):
+        from agent.portal_tags import nous_portal_tags
        from providers import get_provider_profile
        profile = get_provider_profile("nous")
        msgs = [{"role": "user", "content": "Hi"}]
        kw = transport.build_kwargs(model="gpt-4o", messages=msgs, provider_profile=profile)
-        assert kw["extra_body"]["tags"] == ["product=hermes-agent"]
+        assert kw["extra_body"]["tags"] == nous_portal_tags()

    def test_reasoning_default(self, transport):
        msgs = [{"role": "user", "content": "Hi"}]
@@ -7,6 +7,7 @@ from unittest.mock import AsyncMock, MagicMock
 import pytest

 import gateway.run as gateway_run
+from agent.i18n import t
 from gateway.platforms.base import MessageEvent, MessageType
 from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
 from gateway.session import SessionEntry, build_session_key
@@ -32,7 +33,7 @@ async def test_restart_command_while_busy_requests_drain_without_interrupt(monke

    result = await runner._handle_message(event)

-    assert result == "⏳ Draining 1 active agent(s) before restart..."
+    assert result == t("gateway.draining", count=1)
    running_agent.interrupt.assert_not_called()
    runner.request_restart.assert_called_once_with(detached=True, via_service=False)

@@ -273,12 +273,13 @@ class TestRequestOverridesParity:

    def test_extra_body_override_merges_with_provider_body(self, transport):
        """Override extra_body merges WITH provider extra_body, not replaces."""
+        from agent.portal_tags import nous_portal_tags
        kw = transport.build_kwargs(
            model="hermes-3", messages=_msgs(), tools=None,
            provider_profile=get_provider_profile("nous"),
            request_overrides={"extra_body": {"custom": True}},
        )
-        assert kw["extra_body"]["tags"] == ["product=hermes-agent"]  # from profile
+        assert kw["extra_body"]["tags"] == nous_portal_tags()  # from profile
        assert kw["extra_body"]["custom"] is True  # from override

    def test_top_level_override(self, transport):
@@ -210,9 +210,10 @@ class TestOpenRouterProfile:

 class TestNousProfile:
    def test_tags(self):
+        from agent.portal_tags import nous_portal_tags
        p = get_provider_profile("nous")
        body = p.build_extra_body()
-        assert body["tags"] == ["product=hermes-agent"]
+        assert body["tags"] == nous_portal_tags()

    def test_auth_type(self):
        p = get_provider_profile("nous")
@@ -165,13 +165,14 @@ class TestNousParity:
    """Nous: product tags, reasoning, omit when disabled."""

    def test_tags(self, transport):
+        from agent.portal_tags import nous_portal_tags
        kw = transport.build_kwargs(
            model="hermes-3-llama-3.1-405b",
            messages=_simple_messages(),
            tools=None,
            provider_profile=get_provider_profile("nous"),
        )
-        assert kw["extra_body"]["tags"] == ["product=hermes-agent"]
+        assert kw["extra_body"]["tags"] == nous_portal_tags()

    def test_reasoning_omitted_when_disabled(self, transport):
        """Nous special case: reasoning omitted entirely when disabled."""
@@ -330,127 +330,3 @@ class TestExplicitOverrides:
 # Long-lived prefix cache policy (cross-session 1h tier)
 # ─────────────────────────────────────────────────────────────────────

-class TestSupportsLongLivedAnthropicCache:
-    """Narrower than _anthropic_prompt_cache_policy — only Claude on the 4
-    explicitly-validated endpoints get the long-lived layout."""
-
-    def test_native_anthropic_claude_supported(self):
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_anthropic_oauth_supported(self):
-        # OAuth uses the same transport as native Anthropic
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-opus-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_openrouter_claude_supported(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_claude_supported(self):
-        # Nous Portal proxies to OpenRouter — same wire format
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-opus-4.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_qwen_supported(self):
-        # Portal Qwen rides the same OpenRouter-equivalent transport as
-        # Portal Claude; long-lived (1h cross-session) cache_control
-        # markers apply identically.
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_qwen_vendored_slug_supported(self):
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen/qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_non_claude_non_qwen_rejected(self):
-        # Portal long-lived cache scope mirrors policy: Claude or Qwen only.
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_openrouter_non_claude_rejected(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_third_party_anthropic_gateway_rejected(self):
-        # MiniMax / Kimi / etc. — anthropic-wire but not in our validated list
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_alibaba_dashscope_rejected(self):
-        agent = _make_agent(
-            provider="alibaba",
-            base_url="https://dashscope.aliyuncs.com/api/v1/anthropic",
-            api_mode="anthropic_messages",
-            model="qwen3.5-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_opencode_qwen_rejected(self):
-        agent = _make_agent(
-            provider="opencode-go",
-            base_url="https://api.opencode-go.example/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_fallback_target_evaluated_independently(self):
-        # Starting on a non-supported provider, falling back to OpenRouter Claude
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        ) is True
@@ -343,11 +343,12 @@ class TestBuildApiKwargsAIGateway:

 class TestBuildApiKwargsNousPortal:
    def test_includes_nous_product_tags(self, monkeypatch):
+        from agent.portal_tags import nous_portal_tags
        agent = _make_agent(monkeypatch, "nous", base_url="https://inference-api.nousresearch.com/v1")
        messages = [{"role": "user", "content": "hi"}]
        kwargs = agent._build_api_kwargs(messages)
        extra = kwargs.get("extra_body", {})
-        assert extra.get("tags") == ["product=hermes-agent"]
+        assert extra.get("tags") == nous_portal_tags()

    def test_uses_chat_completions_format(self, monkeypatch):
        agent = _make_agent(monkeypatch, "nous", base_url="https://inference-api.nousresearch.com/v1")
@@ -169,7 +169,6 @@ class TestEphemeralMaxOutputTokens:
        agent.reasoning_config = None
        agent._is_anthropic_oauth = False
        agent._ephemeral_max_output_tokens = None
-        agent._use_long_lived_prefix_cache = False

        compressor = MagicMock()
        compressor.context_length = 200_000
@@ -314,7 +314,9 @@ DANGEROUS_PATTERNS = [
    (r'\bdd\s+.*if=', "disk copy"),
    (r'>\s*/dev/sd', "write to block device"),
    (r'\bDROP\s+(TABLE|DATABASE)\b', "SQL DROP"),
-    (r'\bDELETE\s+FROM\b(?!.*\bWHERE\b)', "SQL DELETE without WHERE"),
+    # Use [^\n]* instead of .* so DOTALL mode does not cause a WHERE clause on the
+    # *next* line to satisfy the negative lookahead, silently allowing DELETE without WHERE.
+    (r'\bDELETE\s+FROM\b(?![^\n]*\bWHERE\b)', "SQL DELETE without WHERE"),
    (r'\bTRUNCATE\s+(TABLE)?\s*\w', "SQL TRUNCATE"),
    (r'>\s*/etc/', "overwrite system config"),
    (r'\bsystemctl\s+(-[^\s]+\s+)*(stop|restart|disable|mask)\b', "stop/restart system service"),
@@ -461,7 +461,8 @@ async def _send_via_adapter(
            adapter = None
        if adapter is not None:
            try:
-                result = await adapter.send(chat_id=chat_id, content=chunk)
+                metadata = {"thread_id": thread_id} if thread_id else None
+                result = await adapter.send(chat_id=chat_id, content=chunk, metadata=metadata)
            except asyncio.CancelledError:
                raise
            except Exception as e:
@@ -130,7 +130,9 @@ def detect_audio_environment() -> dict:
        try:
            devices = sd.query_devices()
            if not devices:
-                if termux_capture:
+                if os.environ.get('PULSE_SERVER'):
+                    notices.append("No PortAudio devices detected but PULSE_SERVER is set -- continuing")
+                elif termux_capture:
                    notices.append("No PortAudio devices detected, but Termux:API microphone capture is available")
                else:
                    warnings.append("No audio input/output devices detected")
@@ -593,7 +593,8 @@ def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optiona
    extra_body: Dict[str, Any] = {}
    if client is not None and _is_nous_auxiliary_client(client):
        from agent.auxiliary_client import get_auxiliary_extra_body
-        extra_body = get_auxiliary_extra_body() or {"tags": ["product=hermes-agent"]}
+        from agent.portal_tags import nous_portal_tags
+        extra_body = get_auxiliary_extra_body() or {"tags": nous_portal_tags()}

    return client, effective_model, extra_body

@@ -92,6 +92,13 @@ manager makes sense for that language (rustup, ghcup, opam, brew,
 …). Hermes auto-detects the binary on PATH or in
 `<HERMES_HOME>/lsp/bin/`.

+A few servers are installed alongside a peer dependency that npm
+won't auto-pull. The current case is `typescript-language-server`,
+which requires the `typescript` SDK importable from the same
+`node_modules` tree — Hermes installs both packages together when you
+run `hermes lsp install typescript` or when auto-install fires on
+first use.
+
 ## CLI

 ```
@@ -207,6 +214,24 @@ The binary isn't on PATH and isn't in `<HERMES_HOME>/lsp/bin/`. Run
 `hermes lsp install <server_id>` to attempt an auto-install, or
 install the binary manually through the language's normal toolchain.

+**`Backend warnings` section in `hermes lsp status`**
+
+Some servers ship as thin wrappers around an external CLI for actual
+diagnostics — they spawn cleanly and accept requests, but silently
+report no diagnostics when the sidecar binary is missing. The most
+common case is `bash-language-server`, which delegates diagnostics to
+`shellcheck`. When `hermes lsp status` shows a `Backend warnings`
+section, install the named tool through your OS package manager:
+
+```
+apt install shellcheck      # Debian / Ubuntu
+brew install shellcheck     # macOS
+scoop install shellcheck    # Windows
+```
+
+The same warning is logged once at server spawn time in
+`~/.hermes/logs/agent.log`.
+
 **Server starts but never returns diagnostics**

 Check `~/.hermes/logs/agent.log` for `[agent.lsp.client]` entries —