feat(session_search): add fast/summary dual-mode with zero-LLM fast path

Add mode parameter to session_search tool supporting two modes:
- fast (default): returns FTS5 snippets + context immediately (~0.02s),
  no LLM call — ideal for quick recall lookups
- summary: preserves original behavior with LLM-generated session
  summaries (~10-30s) — use when fast mode is insufficient

Changes:
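- tools/session_search_tool.py: implement fast mode path that returns
  FTS hits with snippets/context without calling auxiliary model;
  add mode parameter to schema (enum: fast|summary); normalize loose
  mode values before dispatch (summarized/summarise/summarize/deep
  map to summary; empty, unknown, or non-string values fall back to
  fast); apply parent session source/metadata resolution in fast
  mode (same pattern as upstream fix 6b4ccb9b1 in summary mode)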
- run_agent.py: pass mode argument from function_args in two call sites
  (direct tool call + subagent path)
- tests/tools/test_session_search.py: add test coverage for fast mode
  output format, summary mode preservation, backwards compatibility,
  and run_agent.py mode forwarding verification

The tool schema description is updated to recommend fast-first usage.
This commit is contained in:
zihao.zhao
2026-05-05 20:42:38 +08:00
committed by yoniebans
parent 88a2ce4ae5
commit 7d628eaa3d
3 changed files with 228 additions and 20 deletions
+2
View File
@@ -10295,6 +10295,7 @@ class AIAgent:
limit=function_args.get("limit", 3),
db=session_db,
current_session_id=self.session_id,
mode=function_args.get("mode", "fast"),
)
elif function_name == "memory":
target = function_args.get("target", "memory")
@@ -10921,6 +10922,7 @@ class AIAgent:
limit=function_args.get("limit", 3),
db=session_db,
current_session_id=self.session_id,
mode=function_args.get("mode", "fast"),
)
tool_duration = time.time() - tool_start_time
if self._should_emit_quiet_tool_messages():
+161 -1
View File
@@ -234,9 +234,10 @@ class TestSessionSearchConcurrency:
{"role": "assistant", "content": "response"},
]
result = json.loads(session_search(query="message", db=mock_db, limit=3))
result = json.loads(session_search(query="message", db=mock_db, limit=3, mode="summary"))
assert result["success"] is True
assert result["mode"] == "summary"
assert result["count"] == 3
assert max_seen["value"] == 1
@@ -400,6 +401,165 @@ class TestSessionSearch:
assert result["sessions_searched"] == 1
assert current_sid not in [r.get("session_id") for r in result.get("results", [])]
def test_default_search_returns_fast_hits_without_llm_or_full_session_load(self, monkeypatch):
    """With no explicit mode, a keyword search must return raw FTS hits.

    The summarizer is swapped for a stub that fails loudly, so any LLM
    call on the default path breaks the test. The full-transcript loader
    on the database must stay untouched as well.
    """
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    async def summarizer_must_not_run(*_args, **_kwargs):
        raise AssertionError("default session_search must not call the summarizer")

    monkeypatch.setattr(
        "tools.session_search_tool._summarize_session",
        summarizer_must_not_run,
    )

    # A single FTS row carrying everything fast mode is expected to echo back.
    fts_hit = {
        "id": 123,
        "session_id": "other_sid",
        "role": "user",
        "snippet": "we discussed >>>session_search<<< latency",
        "context": [
            {"role": "user", "content": "session_search is slow"},
            {"role": "assistant", "content": "the LLM summary is the bottleneck"},
        ],
        "source": "cli",
        "session_started": 1709400000,
        "model": "test-model",
    }
    fake_db = MagicMock()
    fake_db.search_messages.return_value = [fts_hit]
    fake_db.get_session.return_value = {"parent_session_id": None, "title": "Latency debug"}

    raw_json = session_search(query="session_search", db=fake_db)
    payload = json.loads(raw_json)

    assert payload["success"] is True
    assert payload["mode"] == "fast"
    assert payload["count"] == 1

    first_hit = payload["results"][0]
    # The summary field is a fixed placeholder, never derived from the snippet.
    assert first_hit["summary"] == "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall."
    assert "we discussed" not in first_hit["summary"]
    assert first_hit["model"] == "test-model"
    assert first_hit["snippet"] == "we discussed >>>session_search<<< latency"
    assert first_hit["context"][1]["content"] == "the LLM summary is the bottleneck"
    fake_db.get_messages_as_conversation.assert_not_called()
@pytest.mark.parametrize("mode", ["summarized", "summarise", "summarize", "deep"])
def test_summary_mode_aliases_use_llm_summarization_path(self, monkeypatch, mode):
    """Each accepted alias spelling must be treated as mode="summary"."""
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    async def fake_summarize(_text, _query, _meta):
        return "alias summary"

    def run_coroutine_sync(coro):
        # Drive the coroutine to completion on a fresh event loop.
        return asyncio.run(coro)

    monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
    monkeypatch.setattr("model_tools._run_async", run_coroutine_sync)

    fake_db = MagicMock()
    fake_db.search_messages.return_value = [{"session_id": "sid", "source": "cli"}]
    fake_db.get_session.return_value = {"parent_session_id": None, "source": "cli"}
    fake_db.get_messages_as_conversation.return_value = [
        {"role": "user", "content": "full transcript"},
    ]

    raw_json = session_search(query="session_search", db=fake_db, mode=mode)
    payload = json.loads(raw_json)

    assert payload["success"] is True
    # The response reports the canonical mode name, not the alias passed in.
    assert payload["mode"] == "summary"
    assert payload["results"][0]["summary"] == "alias summary"
@pytest.mark.parametrize("mode", ["", "unknown", 42, True, None])
def test_invalid_or_empty_mode_falls_back_to_fast_without_llm(self, monkeypatch, mode):
    """Empty, unrecognised, or non-string modes must behave like fast mode.

    The same case also feeds a malformed (non-list) ``context`` and omits
    ``model`` to check the fast path's defaults for those fields.
    """
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    async def summarizer_must_not_run(*_args, **_kwargs):
        raise AssertionError("invalid modes should fall back to fast mode")

    monkeypatch.setattr(
        "tools.session_search_tool._summarize_session",
        summarizer_must_not_run,
    )

    malformed_hit = {"session_id": "sid", "snippet": "hit", "context": "not-a-list", "source": "cli"}
    fake_db = MagicMock()
    fake_db.search_messages.return_value = [
        malformed_hit,
    ]
    fake_db.get_session.return_value = {"parent_session_id": None}

    raw_json = session_search(query="session_search", db=fake_db, mode=mode)
    payload = json.loads(raw_json)
    first_hit = payload["results"][0]

    assert payload["success"] is True
    assert payload["mode"] == "fast"
    assert first_hit["context"] == []
    assert first_hit["model"] == "unknown"
    fake_db.get_messages_as_conversation.assert_not_called()
def test_fast_mode_tolerates_session_metadata_lookup_failure(self):
    """A failing ``get_session`` lookup must not prevent the hit from being returned.

    When the lookup raises, the result should fall back to the values on
    the FTS row itself (``source``) or to "unknown" (``model`` is None).
    """
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    fake_db = MagicMock()
    fake_db.get_session.side_effect = RuntimeError("metadata unavailable")
    fake_db.search_messages.return_value = [
        {"session_id": "sid", "snippet": "hit", "source": "cli", "model": None},
    ]

    payload = json.loads(session_search(query="session_search", db=fake_db))
    first_hit = payload["results"][0]

    assert payload["success"] is True
    assert first_hit["source"] == "cli"
    assert first_hit["model"] == "unknown"
def test_summary_mode_preserves_llm_summarization_path(self, monkeypatch):
    """mode="summary" must load the transcript and return the summarizer's output.

    The stub summarizer checks the arguments it receives, so the test also
    pins what the summary path passes on: transcript text, the query, and
    session metadata with the expected source.
    """
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    async def fake_summarize(text, query, meta):
        assert "full transcript" in text
        assert query == "session_search"
        assert meta["source"] == "cli"
        return "focused session summary"

    def run_coroutine_sync(coro):
        # Drive the coroutine to completion on a fresh event loop.
        return asyncio.run(coro)

    monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
    monkeypatch.setattr("model_tools._run_async", run_coroutine_sync)

    fake_db = MagicMock()
    fake_db.search_messages.return_value = [
        {"session_id": "other_sid", "source": "cli", "session_started": 1709400000, "model": "test-model"},
    ]
    fake_db.get_session.return_value = {"parent_session_id": None, "source": "cli", "started_at": 1709400000}
    fake_db.get_messages_as_conversation.return_value = [
        {"role": "user", "content": "full transcript about session_search"},
    ]

    raw_json = session_search(query="session_search", db=fake_db, mode="summary")
    payload = json.loads(raw_json)

    assert payload["success"] is True
    assert payload["mode"] == "summary"
    assert payload["results"][0]["summary"] == "focused session summary"
    fake_db.get_messages_as_conversation.assert_called_once_with("other_sid")
def test_positional_db_argument_remains_backwards_compatible(self):
    """Positional callers (query, role_filter, limit, db, current_session_id) must keep working.

    The call below passes all five arguments positionally on purpose;
    ``mode`` is left to its default, which must report "fast".
    """
    from unittest.mock import MagicMock
    from tools.session_search_tool import session_search

    fake_db = MagicMock()
    fake_db.search_messages.return_value = []

    raw_json = session_search("session_search", None, 3, fake_db, None)
    payload = json.loads(raw_json)

    assert payload["success"] is True
    assert payload["mode"] == "fast"
    fake_db.search_messages.assert_called_once()
def test_run_agent_special_session_search_paths_forward_mode(self):
    """Both direct session_search call sites in run_agent.py must forward ``mode``.

    This is a source-text check: it reads run_agent.py from three
    directory levels above this test file and counts the exact forwarding
    expression. It is expected to appear exactly twice.
    """
    from pathlib import Path

    run_agent_path = Path(__file__).parent.parent.parent / "run_agent.py"
    # Decode explicitly as UTF-8: the default is the locale encoding, which
    # can raise UnicodeDecodeError on non-UTF-8 locales if the file holds
    # non-ASCII text.
    source = run_agent_path.read_text(encoding="utf-8")
    forwarding_expr = 'mode=function_args.get("mode", "fast")'
    occurrences = source.count(forwarding_expr)
    assert occurrences == 2, (
        f"expected 2 session_search call sites forwarding mode in "
        f"{run_agent_path}, found {occurrences}"
    )
def test_current_child_session_excludes_parent_lineage(self):
"""Compression/delegation parents should be excluded for the active child session."""
from unittest.mock import MagicMock
+65 -19
View File
@@ -2,19 +2,16 @@
"""
Session Search Tool - Long-Term Conversation Recall
Searches past session transcripts in SQLite via FTS5, then summarizes the top
matching sessions using the configured auxiliary session_search model (same
pattern as web_extract). By default, auxiliary "auto" routing uses the main
chat provider/model unless the user overrides auxiliary.session_search.
Returns focused summaries of past conversations rather than raw transcripts,
keeping the main model's context window clean.
Searches past session transcripts in SQLite via FTS5. Keyword search defaults
to fast snippet/context hits without any LLM call; callers can opt into focused
LLM summaries with mode="summary" when deeper recall is worth the latency.
Flow:
1. FTS5 search finds matching messages ranked by relevance
2. Groups by session, takes the top N unique sessions (default 3)
3. Loads each session's conversation, truncates to ~100k chars centered on matches
4. Sends to the configured auxiliary model with a focused summarization prompt
5. Returns per-session summaries with metadata
3. Fast mode returns snippets and nearby context immediately
4. Summary mode loads each session, truncates around matches, and calls an LLM
5. Returns per-session hits/summaries with metadata
"""
import asyncio
@@ -328,13 +325,11 @@ def session_search(
limit: int = 3,
db=None,
current_session_id: str = None,
mode: str = "fast",
) -> str:
"""
Search past sessions and return focused summaries of matching conversations.
Uses FTS5 to find matches, then summarizes the top sessions with the
configured auxiliary session_search model.
The current session is excluded from results since the agent already has that context.
Search past sessions. Fast mode returns FTS snippets without LLM calls;
summary mode preserves the previous focused summarization behavior.
"""
if db is None:
try:
@@ -346,6 +341,12 @@ def session_search(
from hermes_state import format_session_db_unavailable
return tool_error(format_session_db_unavailable(), success=False)
mode = (mode or "fast").strip().lower() if isinstance(mode, str) else "fast"
if mode in ("summarized", "summarise", "summarize", "deep"):
mode = "summary"
if mode not in ("fast", "summary"):
mode = "fast"
# Defensive: models (especially open-source) may send non-int limit values
# (None when JSON null, string "int", or even a type object). Coerce to a
# safe integer before any arithmetic/comparison to prevent TypeError.
@@ -381,6 +382,7 @@ def session_search(
if not raw_results:
return json.dumps({
"success": True,
"mode": mode,
"query": query,
"results": [],
"count": 0,
@@ -438,6 +440,41 @@ def session_search(
if len(seen_sessions) >= limit:
break
if mode == "fast":
results = []
for session_id, match_info in seen_sessions.items():
try:
session_meta = db.get_session(session_id) or {}
except Exception:
session_meta = {}
snippet = match_info.get("snippet") or ""
context = match_info.get("context") or []
if not isinstance(context, list):
context = []
results.append({
"session_id": session_id,
"when": _format_timestamp(
session_meta.get("started_at") or match_info.get("session_started")
),
"source": session_meta.get("source") or match_info.get("source", "unknown"),
"model": session_meta.get("model") or match_info.get("model") or "unknown",
"matched_role": match_info.get("role"),
"title": session_meta.get("title") or None,
"snippet": snippet,
"context": context,
"summary": "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall.",
})
return json.dumps({
"success": True,
"mode": "fast",
"query": query,
"results": results,
"count": len(results),
"sessions_searched": len(seen_sessions),
"message": "Fast search returned FTS snippets without LLM summarization. Use mode='summary' for focused summaries when needed.",
}, ensure_ascii=False)
# Prepare all sessions for parallel summarization
tasks = []
for session_id, match_info in seen_sessions.items():
@@ -527,6 +564,7 @@ def session_search(
return json.dumps({
"success": True,
"mode": "summary",
"query": query,
"results": summaries,
"count": len(summaries),
@@ -539,7 +577,7 @@ def session_search(
def check_session_search_requirements() -> bool:
"""Requires SQLite state database and an auxiliary text model."""
"""Requires SQLite state database; summary mode also needs an auxiliary model."""
try:
from hermes_state import DEFAULT_DB_PATH
return DEFAULT_DB_PATH.parent.exists()
@@ -551,13 +589,14 @@ SESSION_SEARCH_SCHEMA = {
"name": "session_search",
"description": (
"Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
"every past session is searchable, and this tool summarizes what happened.\n\n"
"every past session is searchable. Keyword search defaults to fast FTS snippets with no LLM call.\n\n"
"TWO MODES:\n"
"1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
"Returns titles, previews, and timestamps. Zero LLM cost, instant. "
"Start here when the user asks what were we working on or what did we do recently.\n"
"2. Keyword search (with query): Search for specific topics across all past sessions. "
"Returns LLM-generated summaries of matching sessions.\n\n"
"Defaults to mode='fast', returning snippets and nearby context instantly without LLM summarization. "
"Use mode='summary' only when a focused LLM-generated recap is worth the latency.\n\n"
"USE THIS PROACTIVELY when:\n"
"- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
"- The user asks about a topic you worked on before but don't have in current context\n"
@@ -570,7 +609,7 @@ SESSION_SEARCH_SCHEMA = {
"phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). "
"IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses "
"sessions that only mention some terms. If a broad OR query returns nothing, try individual "
"keyword searches in parallel. Returns summaries of the top matching sessions."
"keyword searches in parallel. Returns fast search hits by default."
),
"parameters": {
"type": "object",
@@ -585,9 +624,15 @@ SESSION_SEARCH_SCHEMA = {
},
"limit": {
"type": "integer",
"description": "Max sessions to summarize (default: 3, max: 5).",
"description": "Max sessions to return (default: 3, max: 5).",
"default": 3,
},
"mode": {
"type": "string",
"enum": ["fast", "summary"],
"description": "fast (default) returns FTS snippets + surrounding context without LLM calls (~0.02s). Start here for most recall needs. summary loads the full session transcript and runs the LLM summarizer (~10-30s). Use summary only when the fast results do not give enough context to answer the user's question, or when the user explicitly asks for a 'summary' or 'recap' of past conversations. You can call twice: first fast, then summary if more detail is needed.",
"default": "fast",
},
},
"required": [],
},
@@ -605,6 +650,7 @@ registry.register(
query=args.get("query") or "",
role_filter=args.get("role_filter"),
limit=args.get("limit", 3),
mode=args.get("mode", "fast"),
db=kw.get("db"),
current_session_id=kw.get("current_session_id")),
check_fn=check_session_search_requirements,