diff --git a/run_agent.py b/run_agent.py
index 8908562b38..ccd851f766 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -10295,6 +10295,7 @@ class AIAgent:
                 limit=function_args.get("limit", 3),
                 db=session_db,
                 current_session_id=self.session_id,
+                mode=function_args.get("mode", "fast"),
             )
         elif function_name == "memory":
             target = function_args.get("target", "memory")
@@ -10921,6 +10922,7 @@ class AIAgent:
                         limit=function_args.get("limit", 3),
                         db=session_db,
                         current_session_id=self.session_id,
+                        mode=function_args.get("mode", "fast"),
                     )
                 tool_duration = time.time() - tool_start_time
                 if self._should_emit_quiet_tool_messages():
diff --git a/tests/tools/test_session_search.py b/tests/tools/test_session_search.py
index 8e67f23034..dec1457933 100644
--- a/tests/tools/test_session_search.py
+++ b/tests/tools/test_session_search.py
@@ -234,9 +234,10 @@ class TestSessionSearchConcurrency:
             {"role": "assistant", "content": "response"},
         ]
 
-        result = json.loads(session_search(query="message", db=mock_db, limit=3))
+        result = json.loads(session_search(query="message", db=mock_db, limit=3, mode="summary"))
 
         assert result["success"] is True
+        assert result["mode"] == "summary"
         assert result["count"] == 3
         assert max_seen["value"] == 1
 
@@ -400,6 +401,165 @@ class TestSessionSearch:
         assert result["sessions_searched"] == 1
         assert current_sid not in [r.get("session_id") for r in result.get("results", [])]
 
+    def test_default_search_returns_fast_hits_without_llm_or_full_session_load(self, monkeypatch):
+        """Default keyword search returns snippet/context hits without calling the summarizer or loading transcripts."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fail_summarize(*_args, **_kwargs):
+            raise AssertionError("default session_search must not call the summarizer")
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {
+                "id": 123,
+                "session_id": "other_sid",
+                "role": "user",
+                "snippet": "we discussed >>>session_search<<< latency",
+                "context": [
+                    {"role": "user", "content": "session_search is slow"},
+                    {"role": "assistant", "content": "the LLM summary is the bottleneck"},
+                ],
+                "source": "cli",
+                "session_started": 1709400000,
+                "model": "test-model",
+            },
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None, "title": "Latency debug"}
+
+        result = json.loads(session_search(query="session_search", db=mock_db))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        assert result["count"] == 1
+        entry = result["results"][0]
+        assert entry["summary"] == "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall."
+        assert "we discussed" not in entry["summary"]
+        assert entry["model"] == "test-model"
+        assert entry["snippet"] == "we discussed >>>session_search<<< latency"
+        assert entry["context"][1]["content"] == "the LLM summary is the bottleneck"
+        mock_db.get_messages_as_conversation.assert_not_called()
+
+    @pytest.mark.parametrize("mode", ["summarized", "summarise", "summarize", "deep", " Summary ", "DEEP"])
+    def test_summary_mode_aliases_use_llm_summarization_path(self, monkeypatch, mode):
+        """Mode aliases, in any letter case and with stray whitespace, should map to summary mode."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fake_summarize(_text, _query, _meta):
+            return "alias summary"
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
+        monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [{"session_id": "sid", "source": "cli"}]
+        mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli"}
+        mock_db.get_messages_as_conversation.return_value = [
+            {"role": "user", "content": "full transcript"},
+        ]
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
+
+        assert result["success"] is True
+        assert result["mode"] == "summary"
+        assert result["results"][0]["summary"] == "alias summary"
+
+    @pytest.mark.parametrize("mode", ["", "unknown", 42, True, None])
+    def test_invalid_or_empty_mode_falls_back_to_fast_without_llm(self, monkeypatch, mode):
+        """Empty, unknown or non-string modes degrade to fast mode; a non-list context is reported as []."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fail_summarize(*_args, **_kwargs):
+            raise AssertionError("invalid modes should fall back to fast mode")
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fail_summarize)
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "sid", "snippet": "hit", "context": "not-a-list", "source": "cli"},
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None}
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode=mode))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        assert result["results"][0]["context"] == []
+        assert result["results"][0]["model"] == "unknown"
+        mock_db.get_messages_as_conversation.assert_not_called()
+
+    def test_fast_mode_tolerates_session_metadata_lookup_failure(self):
+        """Fast mode still returns the FTS hit (model falling back to 'unknown') when get_session raises."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "sid", "snippet": "hit", "source": "cli", "model": None},
+        ]
+        mock_db.get_session.side_effect = RuntimeError("metadata unavailable")
+
+        result = json.loads(session_search(query="session_search", db=mock_db))
+
+        assert result["success"] is True
+        assert result["results"][0]["source"] == "cli"
+        assert result["results"][0]["model"] == "unknown"
+
+    def test_summary_mode_preserves_llm_summarization_path(self, monkeypatch):
+        """Explicit summary mode loads the session transcript and returns the summarizer's output."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        async def fake_summarize(text, query, meta):
+            assert "full transcript" in text
+            assert query == "session_search"
+            assert meta["source"] == "cli"
+            return "focused session summary"
+
+        monkeypatch.setattr("tools.session_search_tool._summarize_session", fake_summarize)
+        monkeypatch.setattr("model_tools._run_async", lambda coro: asyncio.run(coro))
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = [
+            {"session_id": "other_sid", "source": "cli", "session_started": 1709400000, "model": "test-model"},
+        ]
+        mock_db.get_session.return_value = {"parent_session_id": None, "source": "cli", "started_at": 1709400000}
+        mock_db.get_messages_as_conversation.return_value = [
+            {"role": "user", "content": "full transcript about session_search"},
+        ]
+
+        result = json.loads(session_search(query="session_search", db=mock_db, mode="summary"))
+
+        assert result["success"] is True
+        assert result["mode"] == "summary"
+        assert result["results"][0]["summary"] == "focused session summary"
+        mock_db.get_messages_as_conversation.assert_called_once_with("other_sid")
+
+    def test_positional_db_argument_remains_backwards_compatible(self):
+        """Positional order stays query, role_filter, limit, db, current_session_id; mode comes after them."""
+        from unittest.mock import MagicMock
+        from tools.session_search_tool import session_search
+
+        mock_db = MagicMock()
+        mock_db.search_messages.return_value = []
+
+        result = json.loads(session_search("session_search", None, 3, mock_db, None))
+
+        assert result["success"] is True
+        assert result["mode"] == "fast"
+        mock_db.search_messages.assert_called_once()
+
+    def test_run_agent_special_session_search_paths_forward_mode(self):
+        """Both direct session_search call sites in run_agent.py (outside registry dispatch) must forward mode."""
+        from pathlib import Path
+
+        source = (Path(__file__).parent.parent.parent / "run_agent.py").read_text(encoding="utf-8")
+        assert source.count('mode=function_args.get("mode", "fast")') == 2
+
     def test_current_child_session_excludes_parent_lineage(self):
         """Compression/delegation parents should be excluded for the active child session."""
         from unittest.mock import MagicMock
diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py
index e73cce6bbd..35b7191fb4 100644
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@@ -2,19 +2,16 @@
 """
 Session Search Tool - Long-Term Conversation Recall
 
-Searches past session transcripts in SQLite via FTS5, then summarizes the top
-matching sessions using the configured auxiliary session_search model (same
-pattern as web_extract). By default, auxiliary "auto" routing uses the main
-chat provider/model unless the user overrides auxiliary.session_search.
-Returns focused summaries of past conversations rather than raw transcripts,
-keeping the main model's context window clean.
+Searches past session transcripts in SQLite via FTS5. Keyword search defaults
+to fast snippet/context hits without any LLM call; callers can opt into focused
+LLM summaries with mode="summary" when deeper recall is worth the latency.
 
 Flow:
   1. FTS5 search finds matching messages ranked by relevance
   2. Groups by session, takes the top N unique sessions (default 3)
-  3. Loads each session's conversation, truncates to ~100k chars centered on matches
-  4. Sends to the configured auxiliary model with a focused summarization prompt
-  5. Returns per-session summaries with metadata
+  3. Fast mode returns snippets and nearby context immediately
+  4. Summary mode loads each session, truncates around matches, and calls an LLM
+  5. Returns per-session hits/summaries with metadata
 """
 
 import asyncio
@@ -328,13 +325,11 @@ def session_search(
     limit: int = 3,
     db=None,
     current_session_id: str = None,
+    mode: str = "fast",
 ) -> str:
     """
-    Search past sessions and return focused summaries of matching conversations.
-
-    Uses FTS5 to find matches, then summarizes the top sessions with the
-    configured auxiliary session_search model.
-    The current session is excluded from results since the agent already has that context.
+    Search past sessions, excluding the current one. Fast mode returns FTS snippets
+    without LLM calls; summary mode runs a focused LLM summarization of each session.
     """
     if db is None:
         try:
@@ -346,6 +341,12 @@ def session_search(
             from hermes_state import format_session_db_unavailable
             return tool_error(format_session_db_unavailable(), success=False)
 
+    mode = (mode or "fast").strip().lower() if isinstance(mode, str) else "fast"
+    if mode in ("summarized", "summarise", "summarize", "deep"):
+        mode = "summary"
+    if mode not in ("fast", "summary"):
+        mode = "fast"
+
     # Defensive: models (especially open-source) may send non-int limit values
     # (None when JSON null, string "int", or even a type object).  Coerce to a
     # safe integer before any arithmetic/comparison to prevent TypeError.
@@ -381,6 +382,7 @@ def session_search(
         if not raw_results:
             return json.dumps({
                 "success": True,
+                "mode": mode,
                 "query": query,
                 "results": [],
                 "count": 0,
@@ -438,6 +440,41 @@ def session_search(
             if len(seen_sessions) >= limit:
                 break
 
+        if mode == "fast":
+            # Fast path: answer from the FTS hits alone -- no transcript load, no LLM call.
+            results = []
+            for session_id, match_info in seen_sessions.items():
+                try:
+                    session_meta = db.get_session(session_id) or {}
+                except Exception:
+                    # Best effort: metadata only enriches the hit, so fall back to the hit's own fields.
+                    session_meta = {}
+                context = match_info.get("context") or []
+                if not isinstance(context, list):
+                    context = []
+                started = session_meta.get("started_at") or match_info.get("session_started")
+                results.append({
+                    "session_id": session_id,
+                    "when": _format_timestamp(started),
+                    "source": session_meta.get("source") or match_info.get("source") or "unknown",
+                    "model": session_meta.get("model") or match_info.get("model") or "unknown",
+                    "matched_role": match_info.get("role"),
+                    "title": session_meta.get("title") or None,
+                    "snippet": match_info.get("snippet") or "",
+                    "context": context,
+                    "summary": "[Search hit — summary not generated in fast mode] Use snippet/context fields, or set mode='summary' for LLM-generated recall.",
+                })
+
+            return json.dumps({
+                "success": True,
+                "mode": "fast",
+                "query": query,
+                "results": results,
+                "count": len(results),
+                "sessions_searched": len(seen_sessions),
+                "message": "Fast search returned FTS snippets without LLM summarization. Use mode='summary' for focused summaries when needed.",
+            }, ensure_ascii=False)
+
         # Prepare all sessions for parallel summarization
         tasks = []
         for session_id, match_info in seen_sessions.items():
@@ -527,6 +564,7 @@ def session_search(
 
         return json.dumps({
             "success": True,
+            "mode": "summary",
             "query": query,
             "results": summaries,
             "count": len(summaries),
@@ -539,7 +577,7 @@ def session_search(
 
 
 def check_session_search_requirements() -> bool:
-    """Requires SQLite state database and an auxiliary text model."""
+    """Requires SQLite state database; summary mode also needs an auxiliary model."""
     try:
         from hermes_state import DEFAULT_DB_PATH
         return DEFAULT_DB_PATH.parent.exists()
@@ -551,13 +589,14 @@ SESSION_SEARCH_SCHEMA = {
     "name": "session_search",
     "description": (
         "Search your long-term memory of past conversations, or browse recent sessions. This is your recall -- "
-        "every past session is searchable, and this tool summarizes what happened.\n\n"
+        "every past session is searchable. Keyword search defaults to fast FTS snippets with no LLM call.\n\n"
         "TWO MODES:\n"
         "1. Recent sessions (no query): Call with no arguments to see what was worked on recently. "
         "Returns titles, previews, and timestamps. Zero LLM cost, instant. "
         "Start here when the user asks what were we working on or what did we do recently.\n"
         "2. Keyword search (with query): Search for specific topics across all past sessions. "
-        "Returns LLM-generated summaries of matching sessions.\n\n"
+        "Defaults to mode='fast', returning snippets and nearby context instantly without LLM summarization. "
+        "Use mode='summary' only when a focused LLM-generated recap is worth the latency.\n\n"
         "USE THIS PROACTIVELY when:\n"
         "- The user says 'we did this before', 'remember when', 'last time', 'as I mentioned'\n"
         "- The user asks about a topic you worked on before but don't have in current context\n"
@@ -570,7 +609,7 @@ SESSION_SEARCH_SCHEMA = {
         "phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). "
         "IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses "
         "sessions that only mention some terms. If a broad OR query returns nothing, try individual "
-        "keyword searches in parallel. Returns summaries of the top matching sessions."
+        "keyword searches in parallel. Returns fast search hits by default."
     ),
     "parameters": {
         "type": "object",
@@ -585,9 +624,15 @@ SESSION_SEARCH_SCHEMA = {
             },
             "limit": {
                 "type": "integer",
-                "description": "Max sessions to summarize (default: 3, max: 5).",
+                "description": "Max sessions to return (default: 3, max: 5).",
                 "default": 3,
             },
+            "mode": {
+                "type": "string",
+                "enum": ["fast", "summary"],
+                "description": "fast (default) returns FTS snippets + surrounding context without LLM calls (~0.02s). Start here for most recall needs. summary loads the full session transcript and runs the LLM summarizer (~10-30s). Use summary only when the fast results do not give enough context to answer the user's question, or when the user explicitly asks for a 'summary' or 'recap' of past conversations. You can call twice: first fast, then summary if more detail is needed.",
+                "default": "fast",
+            },
         },
         "required": [],
     },
@@ -605,6 +650,7 @@ registry.register(
         query=args.get("query") or "",
         role_filter=args.get("role_filter"),
         limit=args.get("limit", 3),
+        mode=args.get("mode", "fast"),
         db=kw.get("db"),
         current_session_id=kw.get("current_session_id")),
     check_fn=check_session_search_requirements,