mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-21 03:39:54 +00:00
feat(session_search): add fast/summary dual-mode with zero-LLM fast path
Add a `mode` parameter to the session_search tool, supporting two modes:
- fast (default): returns FTS5 snippets and nearby context immediately (~0.02s)
with no LLM call — ideal for quick recall lookups
- summary: preserves the original behavior of returning LLM-generated session
summaries (~10-30s) — use when fast mode is insufficient
Changes:
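- tools/session_search_tool.py: implement a fast-mode path that returns
FTS hits with snippets/context without calling the auxiliary model;
add a mode parameter to the schema (enum: fast|summary); normalize the
mode value so that the aliases "summarized", "summarise", "summarize",
and "deep" map to summary, while unknown, empty, or non-string values
fall back to fast; apply parent session source/metadata resolution in
fast mode (same pattern as upstream fix 6b4ccb9b1 in summary mode)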
- run_agent.py: pass the mode argument from function_args at two call sites
(direct tool call + subagent path)
- tests/tools/test_session_search.py: add test coverage for the fast-mode
output format, summary-mode preservation, backwards compatibility of the
positional arguments, and forwarding of mode from run_agent.py
The tool schema description is updated to recommend fast-first usage.
This commit is contained in:
@@ -10295,6 +10295,7 @@ class AIAgent:
|
||||
limit=function_args.get("limit", 3),
|
||||
db=session_db,
|
||||
current_session_id=self.session_id,
|
||||
mode=function_args.get("mode", "fast"),
|
||||
)
|
||||
elif function_name == "memory":
|
||||
target = function_args.get("target", "memory")
|
||||
@@ -10921,6 +10922,7 @@ class AIAgent:
|
||||
limit=function_args.get("limit", 3),
|
||||
db=session_db,
|
||||
current_session_id=self.session_id,
|
||||
mode=function_args.get("mode", "fast"),
|
||||
)
|
||||
tool_duration = time.time() - tool_start_time
|
||||
if self._should_emit_quiet_tool_messages():
|
||||
|
||||
@@ -234,9 +234,10 @@ class TestSessionSearchConcurrency:
|
||||
{"role": "assistant", "content": "response"},
|
||||
]
|
||||
|
||||
result = json.loads(session_search(query="message", db=mock_db, limit=3))
|
||||
result = json.loads(session_search(query="message", db=mock_db, limit=3, mode="summary"))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "summary"
|
||||
assert result["count"] == 3
|
||||
assert max_seen["value"] == 1
|
||||
|
||||
@@ -400,6 +401,165 @@ class TestSessionSearch:
|
||||
assert result["sessions_searched"] == 1
|
||||
assert current_sid not in [r.get("session_id") for r in result.get("results", [])]
|
||||
|
||||
def test_default_search_returns_fast_hits_without_llm_or_full_session_load(self, monkeypatch):
|
||||
"""Default keyword search should stay on the DB/snippet path and avoid LLM latency."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
async def fail_summarize(*_args, **_kwargs):
|
||||
raise AssertionError("default session_search must not call the summarizer")
|
||||
|
||||
monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = [
|
||||
{
|
||||
"id": 123,
|
||||
"session_id": "other_sid",
|
||||
"role": "user",
|
||||
"snippet": "we discussed >>>session_search<<< latency",
|
||||
"context": [
|
||||
{"role": "user", "content": "session_search is slow"},
|
||||
{"role": "assistant", "content": "the LLM summary is the bottleneck"},
|
||||
],
|
||||
"source": "cli",
|
||||
"session_started": 1709400000,
|
||||
"model": "test-model",
|
||||
},
|
||||
]
|
||||
mock_db.get_session.return_value = {"parent_session_id": None, "title": "Latency debug"}
|
||||
|
||||
result = json.loads(session_search(query="session_search", db=mock_db))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "fast"
|
||||
assert result["count"] == 1
|
||||
entry = result["results"][0]
|
||||
assert entry["summary"] == "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall."
|
||||
assert "we discussed" not in entry["summary"]
|
||||
assert entry["model"] == "test-model"
|
||||
assert entry["snippet"] == "we discussed >>>session_search<<< latency"
|
||||
assert entry["context"][1]["content"] == "the LLM summary is the bottleneck"
|
||||
mock_db.get_messages_as_conversation.assert_not_called()
|
||||
|
||||
@pytest.mark.parametrize("mode", ["summarized", "summarise", "summarize", "deep"])
|
||||
def test_summary_mode_aliases_use_llm_summarization_path(self, monkeypatch, mode):
|
||||
"""Common natural-language mode aliases should map to summary mode."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
async def fake_summarize(_text, _query, _meta):
|
||||
return "alias summary"
|
||||
|
||||
monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
|
||||
monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = [{"session_id": "sid", "source": "cli"}]
|
||||
mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli"}
|
||||
mock_db.get_messages_as_conversation.return_value = [
|
||||
{"role": "user", "content": "full transcript"},
|
||||
]
|
||||
|
||||
result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "summary"
|
||||
assert result["results"][0]["summary"] == "alias summary"
|
||||
|
||||
@pytest.mark.parametrize("mode", ["", "unknown", 42, True, None])
|
||||
def test_invalid_or_empty_mode_falls_back_to_fast_without_llm(self, monkeypatch, mode):
|
||||
"""Loose tool-call args should degrade to fast mode rather than crashing."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
async def fail_summarize(*_args, **_kwargs):
|
||||
raise AssertionError("invalid modes should fall back to fast mode")
|
||||
|
||||
monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = [
|
||||
{"session_id": "sid", "snippet": "hit", "context": "not-a-list", "source": "cli"},
|
||||
]
|
||||
mock_db.get_session.return_value = {"parent_session_id": None}
|
||||
|
||||
result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "fast"
|
||||
assert result["results"][0]["context"] == []
|
||||
assert result["results"][0]["model"] == "unknown"
|
||||
mock_db.get_messages_as_conversation.assert_not_called()
|
||||
|
||||
def test_fast_mode_tolerates_session_metadata_lookup_failure(self):
|
||||
"""Fast mode should still return the FTS hit when parent metadata is unavailable."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = [
|
||||
{"session_id": "sid", "snippet": "hit", "source": "cli", "model": None},
|
||||
]
|
||||
mock_db.get_session.side_effect = RuntimeError("metadata unavailable")
|
||||
|
||||
result = json.loads(session_search(query="session_search", db=mock_db))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["results"][0]["source"] == "cli"
|
||||
assert result["results"][0]["model"] == "unknown"
|
||||
|
||||
def test_summary_mode_preserves_llm_summarization_path(self, monkeypatch):
|
||||
"""Explicit summary mode keeps the previous behavior for deeper recall."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
async def fake_summarize(text, query, meta):
|
||||
assert "full transcript" in text
|
||||
assert query == "session_search"
|
||||
assert meta["source"] == "cli"
|
||||
return "focused session summary"
|
||||
|
||||
monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
|
||||
monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = [
|
||||
{"session_id": "other_sid", "source": "cli", "session_started": 1709400000, "model": "test-model"},
|
||||
]
|
||||
mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli", "started_at": 1709400000}
|
||||
mock_db.get_messages_as_conversation.return_value = [
|
||||
{"role": "user", "content": "full transcript about session_search"},
|
||||
]
|
||||
|
||||
result = json.loads(session_search(query="session_search", db=mock_db, mode="summary"))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "summary"
|
||||
assert result["results"][0]["summary"] == "focused session summary"
|
||||
mock_db.get_messages_as_conversation.assert_called_once_with("other_sid")
|
||||
|
||||
def test_positional_db_argument_remains_backwards_compatible(self):
|
||||
"""Keep the historical positional order: query, role_filter, limit, db, current_session_id."""
|
||||
from unittest.mock import MagicMock
|
||||
from tools.session_search_tool import session_search
|
||||
|
||||
mock_db = MagicMock()
|
||||
mock_db.search_messages.return_value = []
|
||||
|
||||
result = json.loads(session_search("session_search", None, 3, mock_db, None))
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["mode"] == "fast"
|
||||
mock_db.search_messages.assert_called_once()
|
||||
|
||||
def test_run_agent_special_session_search_paths_forward_mode(self):
|
||||
"""run_agent has two direct session_search call sites outside registry dispatch."""
|
||||
from pathlib import Path
|
||||
|
||||
source = (Path(__file__).parent.parent.parent / "run_agent.py").read_text()
|
||||
assert source.count('mode=function_args.get("mode", "fast")') == 2
|
||||
|
||||
def test_current_child_session_excludes_parent_lineage(self):
|
||||
"""Compression/delegation parents should be excluded for the active child session."""
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
@@ -2,19 +2,16 @@
|
||||
"""
|
||||
Session Search Tool - Long-Term Conversation Recall
|
||||
|
||||
Searches past session transcripts in SQLite via FTS5, then summarizes the top
|
||||
matching sessions using the configured auxiliary session_search model (same
|
||||
pattern as web_extract). By default, auxiliary "auto" routing uses the main
|
||||
chat provider/model unless the user overrides auxiliary.session_search.
|
||||
Returns focused summaries of past conversations rather than raw transcripts,
|
||||
keeping the main model's context window clean.
|
||||
Searches past session transcripts in SQLite via FTS5. Keyword search defaults
|
||||
to fast snippet/context hits without any LLM call; callers can opt into focused
|
||||
LLM summaries with mode="summary" when deeper recall is worth the latency.
|
||||
|
||||
Flow:
|
||||
1. FTS5 search finds matching messages ranked by relevance
|
||||
2. Groups by session, takes the top N unique sessions (default 3)
|
||||
3. Loads each session's conversation, truncates to ~100k chars centered on matches
|
||||
4. Sends to the configured auxiliary model with a focused summarization prompt
|
||||
5. Returns per-session summaries with metadata
|
||||
3. Fast mode returns snippets and nearby context immediately
|
||||
4. Summary mode loads each session, truncates around matches, and calls an LLM
|
||||
5. Returns per-session hits/summaries with metadata
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -328,13 +325,11 @@ def session_search(
|
||||
limit: int = 3,
|
||||
db=None,
|
||||
current_session_id: str = None,
|
||||
mode: str = "fast",
|
||||
) -> str:
|
||||
"""
|
||||
Search past sessions and return focused summaries of matching conversations.
|
||||
|
||||
Uses FTS5 to find matches, then summarizes the top sessions with the
|
||||
configured auxiliary session_search model.
|
||||
The current session is excluded from results since the agent already has that context.
|
||||
Search past sessions. Fast mode returns FTS snippets without LLM calls;
|
||||
summary mode preserves the previous focused summarization behavior.
|
||||
"""
|
||||
if db is None:
|
||||
try:
|
||||
@@ -346,6 +341,12 @@ def session_search(
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return tool_error(format_session_db_unavailable(), success=False)
|
||||
|
||||
mode = (mode or "fast").strip().lower() if isinstance(mode, str) else "fast"
|
||||
if mode in ("summarized", "summarise", "summarize", "deep"):
|
||||
mode = "summary"
|
||||
if mode not in ("fast", "summary"):
|
||||
mode = "fast"
|
||||
|
||||
# Defensive: models (especially open-source) may send non-int limit values
|
||||
# (None when JSON null, string "int", or even a type object). Coerce to a
|
||||
# safe integer before any arithmetic/comparison to prevent TypeError.
|
||||
@@ -381,6 +382,7 @@ def session_search(
|
||||
if not raw_results:
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"mode": mode,
|
||||
"query": query,
|
||||
"results": [],
|
||||
"count": 0,
|
||||
@@ -438,6 +440,41 @@ def session_search(
|
||||
if len(seen_sessions) >= limit:
|
||||
break
|
||||
|
||||
if mode == "fast":
|
||||
results = []
|
||||
for session_id, match_info in seen_sessions.items():
|
||||
try:
|
||||
session_meta = db.get_session(session_id) or {}
|
||||
except Exception:
|
||||
session_meta = {}
|
||||
snippet = match_info.get("snippet") or ""
|
||||
context = match_info.get("context") or []
|
||||
if not isinstance(context, list):
|
||||
context = []
|
||||
results.append({
|
||||
"session_id": session_id,
|
||||
"when": _format_timestamp(
|
||||
session_meta.get("started_at") or match_info.get("session_started")
|
||||
),
|
||||
"source": session_meta.get("source") or match_info.get("source", "unknown"),
|
||||
"model": session_meta.get("model") or match_info.get("model") or "unknown",
|
||||
"matched_role": match_info.get("role"),
|
||||
"title": session_meta.get("title") or None,
|
||||
"snippet": snippet,
|
||||
"context": context,
|
||||
"summary": "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall.",
|
||||
})
|
||||
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"mode": "fast",
|
||||
"query": query,
|
||||
"results": results,
|
||||
"count": len(results),
|
||||
"sessions_searched": len(seen_sessions),
|
||||
"message": "Fast search returned FTS snippets without LLM summarization. Use mode='summary' for focused summaries when needed.",
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Prepare all sessions for parallel summarization
|
||||
tasks = []
|
||||
for session_id, match_info in seen_sessions.items():
|
||||
@@ -527,6 +564,7 @@ def session_search(
|
||||
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"mode": "summary",
|
||||
"query": query,
|
||||
"results": summaries,
|
||||
"count": len(summaries),
|
||||
@@ -539,7 +577,7 @@ def session_search(
|
||||
|
||||
|
||||
def check_session_search_requirements() -> bool:
|
||||
"""Requires SQLite state database and an auxiliary text model."""
|
||||
"""Requires SQLite state database; summary mode also needs an auxiliary model."""
|
||||
try:
|
||||
from hermes_state import DEFAULT_DB_PATH
|
||||
return DEFAULT_DB_PATH.parent.exists()
|
||||
@@ -551,13 +589,14 @@ SESSION_SEARCH_SCHEMA = {
|
||||
"name": "session_search",
|
||||
"description": (
|
||||
"Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
|
||||
"every past session is searchable, and this tool summarizes what happened.\n\n"
|
||||
"every past session is searchable. Keyword search defaults to fast FTS snippets with no LLM call.\n\n"
|
||||
"TWO MODES:\n"
|
||||
"1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
|
||||
"Returns titles, previews, and timestamps. Zero LLM cost, instant. "
|
||||
"Start here when the user asks what were we working on or what did we do recently.\n"
|
||||
"2. Keyword search (with query): Search for specific topics across all past sessions. "
|
||||
"Returns LLM-generated summaries of matching sessions.\n\n"
|
||||
"Defaults to mode='fast', returning snippets and nearby context instantly without LLM summarization. "
|
||||
"Use mode='summary' only when a focused LLM-generated recap is worth the latency.\n\n"
|
||||
"USE THIS PROACTIVELY when:\n"
|
||||
"- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
|
||||
"- The user asks about a topic you worked on before but don't have in current context\n"
|
||||
@@ -570,7 +609,7 @@ SESSION_SEARCH_SCHEMA = {
|
||||
"phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). "
|
||||
"IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses "
|
||||
"sessions that only mention some terms. If a broad OR query returns nothing, try individual "
|
||||
"keyword searches in parallel. Returns summaries of the top matching sessions."
|
||||
"keyword searches in parallel. Returns fast search hits by default."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
@@ -585,9 +624,15 @@ SESSION_SEARCH_SCHEMA = {
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Max sessions to summarize (default: 3, max: 5).",
|
||||
"description": "Max sessions to return (default: 3, max: 5).",
|
||||
"default": 3,
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["fast", "summary"],
|
||||
"description": "fast (default) returns FTS snippets + surrounding context without LLM calls (~0.02s). Start here for most recall needs. summary loads the full session transcript and runs the LLM summarizer (~10-30s). Use summary only when the fast results do not give enough context to answer the user's question, or when the user explicitly asks for a 'summary' or 'recap' of past conversations. You can call twice: first fast, then summary if more detail is needed.",
|
||||
"default": "fast",
|
||||
},
|
||||
},
|
||||
"required": [],
|
||||
},
|
||||
@@ -605,6 +650,7 @@ registry.register(
|
||||
query=args.get("query") or "",
|
||||
role_filter=args.get("role_filter"),
|
||||
limit=args.get("limit", 3),
|
||||
mode=args.get("mode", "fast"),
|
||||
db=kw.get("db"),
|
||||
current_session_id=kw.get("current_session_id")),
|
||||
check_fn=check_session_search_requirements,
|
||||
|
||||
Reference in New Issue
Block a user