feat(session_search): add fast/summary dual-mode with zero-LLM fast path

Add a mode parameter to the session_search tool, supporting two modes:

- fast (default): returns FTS5 snippets + context immediately (~0.02s), with no LLM call — ideal for quick recall lookups
- summary: preserves the original behavior with LLM-generated session summaries (~10-30s) — use when fast mode is insufficient

Changes:

- tools/session_search_tool.py: implement a fast-mode path that returns FTS hits with snippets/context without calling the auxiliary model; add the mode parameter to the schema (enum: fast|summary); normalize loose mode values (summarized/summarise/summarize/deep map to summary; empty, unknown, or non-string values fall back to fast); apply parent session source/metadata resolution in fast mode (same pattern as upstream fix 6b4ccb9b1 in summary mode)
- run_agent.py: pass the mode argument from function_args at two call sites (direct tool call + subagent path)
- tests/tools/test_session_search.py: add test coverage for fast mode output format, summary mode preservation, mode alias/fallback handling, backwards compatibility, and run_agent.py mode forwarding verification

The tool schema description is updated to recommend fast-first usage.
2026-05-21 03:39:54 +00:00 · 2026-05-05 20:42:38 +08:00
parent 88a2ce4ae5
commit 7d628eaa3d
3 changed files with 228 additions and 20 deletions
@@ -10295,6 +10295,7 @@ class AIAgent:
                limit=function_args.get("limit", 3),
                db=session_db,
                current_session_id=self.session_id,
+                mode=function_args.get("mode", "fast"),
            )
        elif function_name == "memory":
            target = function_args.get("target", "memory")
@@ -10921,6 +10922,7 @@ class AIAgent:
                        limit=function_args.get("limit", 3),
                        db=session_db,
                        current_session_id=self.session_id,
+                        mode=function_args.get("mode", "fast"),
                    )
                tool_duration = time.time() - tool_start_time
                if self._should_emit_quiet_tool_messages():
@@ -234,9 +234,10 @@ class TestSessionSearchConcurrency:
            {"role": "assistant", "content": "response"},
        ]

-        result = json.loads(session_search(query="message", db=mock_db, limit=3))
+        result = json.loads(session_search(query="message", db=mock_db, limit=3, mode="summary"))

        assert result["success"] is True
+        assert result["mode"] == "summary"
        assert result["count"] == 3
        assert max_seen["value"] == 1

@@ -400,6 +401,165 @@ class TestSessionSearch:
        assert result["sessions_searched"] == 1
        assert current_sid not in [r.get("session_id") for r in result.get("results", [])]

+    def test_default_search_returns_fast_hits_without_llm_or_full_session_load(self, monkeypatch):
+        """Default keyword search should stay on the DB/snippet path and avoid LLM latency."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fail_summarize(*_args, **_kwargs):
+            raise AssertionError("default session_search must not call the summarizer")
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {
+                "id": 123,
+                "session_id": "other_sid",
+                "role": "user",
+                "snippet": "we discussed >>>session_search<<< latency",
+                "context": [
+                    {"role": "user", "content": "session_search is slow"},
+                    {"role": "assistant", "content": "the LLM summary is the bottleneck"},
+                ],
+                "source": "cli",
+                "session_started": 1709400000,
+                "model": "test-model",
+            },
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None, "title": "Latency debug"}
+
+        result = json.loads(session_search(query="session_search", db=mock_db))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        assert result["count"] == 1
+        entry = result["results"][0]
+        assert entry["summary"] == "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall."
+        assert "we discussed" not in entry["summary"]
+        assert entry["model"] == "test-model"
+        assert entry["snippet"] == "we discussed >>>session_search<<< latency"
+        assert entry["context"][1]["content"] == "the LLM summary is the bottleneck"
+        mock_db.get_messages_as_conversation.assert_not_called()
+
+    @pytest.mark.parametrize("mode", ["summarized", "summarise", "summarize", "deep"])
+    def test_summary_mode_aliases_use_llm_summarization_path(self, monkeypatch, mode):
+        """Common natural-language mode aliases should map to summary mode."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fake_summarize(_text, _query, _meta):
+            return "alias summary"
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
+        monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [{"session_id": "sid", "source": "cli"}]
+        mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli"}
+        mock_db.get_messages_as_conversation.return_value = [
+            {"role": "user", "content": "full transcript"},
+        ]
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
+
+        assert result["success"] is True
+        assert result["mode"] == "summary"
+        assert result["results"][0]["summary"] == "alias summary"
+
+    @pytest.mark.parametrize("mode", ["", "unknown", 42, True, None])
+    def test_invalid_or_empty_mode_falls_back_to_fast_without_llm(self, monkeypatch, mode):
+        """Loose tool-call args should degrade to fast mode rather than crashing."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fail_summarize(*_args, **_kwargs):
+            raise AssertionError("invalid modes should fall back to fast mode")
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "sid", "snippet": "hit", "context": "not-a-list", "source": "cli"},
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None}
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        assert result["results"][0]["context"] == []
+        assert result["results"][0]["model"] == "unknown"
+        mock_db.get_messages_as_conversation.assert_not_called()
+
+    def test_fast_mode_tolerates_session_metadata_lookup_failure(self):
+        """Fast mode should still return the FTS hit when parent metadata is unavailable."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "sid", "snippet": "hit", "source": "cli", "model": None},
+        ]
+        mock_db.get_session.side_effect = RuntimeError("metadata unavailable")
+
+        result = json.loads(session_search(query="session_search", db=mock_db))
+
+        assert result["success"] is True
+        assert result["results"][0]["source"] == "cli"
+        assert result["results"][0]["model"] == "unknown"
+
+    def test_summary_mode_preserves_llm_summarization_path(self, monkeypatch):
+        """Explicit summary mode keeps the previous behavior for deeper recall."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fake_summarize(text, query, meta):
+            assert "full transcript" in text
+            assert query == "session_search"
+            assert meta["source"] == "cli"
+            return "focused session summary"
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
+        monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "other_sid", "source": "cli", "session_started": 1709400000, "model": "test-model"},
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli", "started_at": 1709400000}
+        mock_db.get_messages_as_conversation.return_value = [
+            {"role": "user", "content": "full transcript about session_search"},
+        ]
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode="summary"))
+
+        assert result["success"] is True
+        assert result["mode"] == "summary"
+        assert result["results"][0]["summary"] == "focused session summary"
+        mock_db.get_messages_as_conversation.assert_called_once_with("other_sid")
+
+    def test_positional_db_argument_remains_backwards_compatible(self):
+        """Keep the historical positional order: query, role_filter, limit, db, current_session_id."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = []
+
+        result = json.loads(session_search("session_search", None, 3, mock_db, None))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        mock_db.search_messages.assert_called_once()
+
+    def test_run_agent_special_session_search_paths_forward_mode(self):
+        """run_agent has two direct session_search call sites outside registry dispatch."""
+        from pathlib import Path
+
+        source = (Path(__file__).parent.parent.parent / "run_agent.py").read_text()
+        assert source.count('mode=function_args.get("mode", "fast")') == 2
+
    def test_current_child_session_excludes_parent_lineage(self):
        """Compression/delegation parents should be excluded for the active child session."""
        from unittest.mock import MagicMock
@@ -2,19 +2,16 @@
 """
 Session Search Tool - Long-Term Conversation Recall

-Searches past session transcripts in SQLite via FTS5, then summarizes the top
-matching sessions using the configured auxiliary session_search model (same
-pattern as web_extract). By default, auxiliary "auto" routing uses the main
-chat provider/model unless the user overrides auxiliary.session_search.
-Returns focused summaries of past conversations rather than raw transcripts,
-keeping the main model's context window clean.
+Searches past session transcripts in SQLite via FTS5. Keyword search defaults
+to fast snippet/context hits without any LLM call; callers can opt into focused
+LLM summaries with mode="summary" when deeper recall is worth the latency.

 Flow:
  1. FTS5 search finds matching messages ranked by relevance
  2. Groups by session, takes the top N unique sessions (default 3)
-  3. Loads each session's conversation, truncates to ~100k chars centered on matches
-  4. Sends to the configured auxiliary model with a focused summarization prompt
-  5. Returns per-session summaries with metadata
+  3. Fast mode returns snippets and nearby context immediately
+  4. Summary mode loads each session, truncates around matches, and calls an LLM
+  5. Returns per-session hits/summaries with metadata
 """

 import asyncio
@@ -328,13 +325,11 @@ def session_search(
    limit: int = 3,
    db=None,
    current_session_id: str = None,
+    mode: str = "fast",
 ) -> str:
    """
-    Search past sessions and return focused summaries of matching conversations.
-
-    Uses FTS5 to find matches, then summarizes the top sessions with the
-    configured auxiliary session_search model.
-    The current session is excluded from results since the agent already has that context.
+    Search past sessions. Fast mode returns FTS snippets without LLM calls;
+    summary mode preserves the previous focused summarization behavior.
    """
    if db is None:
        try:
@@ -346,6 +341,12 @@ def session_search(
            from hermes_state import format_session_db_unavailable
            return tool_error(format_session_db_unavailable(), success=False)

+    mode = (mode or "fast").strip().lower() if isinstance(mode, str) else "fast"
+    if mode in ("summarized", "summarise", "summarize", "deep"):
+        mode = "summary"
+    if mode not in ("fast", "summary"):
+        mode = "fast"
+
    # Defensive: models (especially open-source) may send non-int limit values
    # (None when JSON null, string "int", or even a type object).  Coerce to a
    # safe integer before any arithmetic/comparison to prevent TypeError.
@@ -381,6 +382,7 @@ def session_search(
        if not raw_results:
            return json.dumps({
                "success": True,
+                "mode": mode,
                "query": query,
                "results": [],
                "count": 0,
@@ -438,6 +440,41 @@ def session_search(
            if len(seen_sessions) >= limit:
                break

+        if mode == "fast":
+            results = []
+            for session_id, match_info in seen_sessions.items():
+                try:
+                    session_meta = db.get_session(session_id) or {}
+                except Exception:
+                    session_meta = {}
+                snippet = match_info.get("snippet") or ""
+                context = match_info.get("context") or []
+                if not isinstance(context, list):
+                    context = []
+                results.append({
+                    "session_id": session_id,
+                    "when": _format_timestamp(
+                        session_meta.get("started_at") or match_info.get("session_started")
+                    ),
+                    "source": session_meta.get("source") or match_info.get("source", "unknown"),
+                    "model": session_meta.get("model") or match_info.get("model") or "unknown",
+                    "matched_role": match_info.get("role"),
+                    "title": session_meta.get("title") or None,
+                    "snippet": snippet,
+                    "context": context,
+                    "summary": "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall.",
+                })
+
+            return json.dumps({
+                "success": True,
+                "mode": "fast",
+                "query": query,
+                "results": results,
+                "count": len(results),
+                "sessions_searched": len(seen_sessions),
+                "message": "Fast search returned FTS snippets without LLM summarization. Use mode='summary' for focused summaries when needed.",
+            }, ensure_ascii=False)
+
        # Prepare all sessions for parallel summarization
        tasks = []
        for session_id, match_info in seen_sessions.items():
@@ -527,6 +564,7 @@ def session_search(

        return json.dumps({
            "success": True,
+            "mode": "summary",
            "query": query,
            "results": summaries,
            "count": len(summaries),
@@ -539,7 +577,7 @@ def session_search(


 def check_session_search_requirements() -> bool:
-    """Requires SQLite state database and an auxiliary text model."""
+    """Requires SQLite state database; summary mode also needs an auxiliary model."""
    try:
        from hermes_state import DEFAULT_DB_PATH
        return DEFAULT_DB_PATH.parent.exists()
@@ -551,13 +589,14 @@ SESSION_SEARCH_SCHEMA = {
    "name": "session_search",
    "description": (
        "Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
-        "every past session is searchable, and this tool summarizes what happened.\n\n"
+        "every past session is searchable. Keyword search defaults to fast FTS snippets with no LLM call.\n\n"
        "TWO MODES:\n"
        "1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
        "Returns titles, previews, and timestamps. Zero LLM cost, instant. "
        "Start here when the user asks what were we working on or what did we do recently.\n"
        "2. Keyword search (with query): Search for specific topics across all past sessions. "
-        "Returns LLM-generated summaries of matching sessions.\n\n"
+        "Defaults to mode='fast', returning snippets and nearby context instantly without LLM summarization. "
+        "Use mode='summary' only when a focused LLM-generated recap is worth the latency.\n\n"
        "USE THIS PROACTIVELY when:\n"
        "- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
        "- The user asks about a topic you worked on before but don't have in current context\n"
@@ -570,7 +609,7 @@ SESSION_SEARCH_SCHEMA = {
        "phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). "
        "IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses "
        "sessions that only mention some terms. If a broad OR query returns nothing, try individual "
-        "keyword searches in parallel. Returns summaries of the top matching sessions."
+        "keyword searches in parallel. Returns fast search hits by default."
    ),
    "parameters": {
        "type": "object",
@@ -585,9 +624,15 @@ SESSION_SEARCH_SCHEMA = {
            },
            "limit": {
                "type": "integer",
-                "description": "Max sessions to summarize (default: 3, max: 5).",
+                "description": "Max sessions to return (default: 3, max: 5).",
                "default": 3,
            },
+            "mode": {
+                "type": "string",
+                "enum": ["fast", "summary"],
+                "description": "fast (default) returns FTS snippets + surrounding context without LLM calls (~0.02s). Start here for most recall needs. summary loads the full session transcript and runs the LLM summarizer (~10-30s). Use summary only when the fast results do not give enough context to answer the user's question, or when the user explicitly asks for a 'summary' or 'recap' of past conversations. You can call twice: first fast, then summary if more detail is needed.",
+                "default": "fast",
+            },
        },
        "required": [],
    },
@@ -605,6 +650,7 @@ registry.register(
        query=args.get("query") or "",
        role_filter=args.get("role_filter"),
        limit=args.get("limit", 3),
+        mode=args.get("mode", "fast"),
        db=kw.get("db"),
        current_session_id=kw.get("current_session_id")),
    check_fn=check_session_search_requirements,