From fd1c4eaeafebc1efaab19ab58da14530af640c50 Mon Sep 17 00:00:00 2001 From: hermes-agent Date: Sun, 24 May 2026 18:57:40 +0000 Subject: [PATCH] =?UTF-8?q?Stage=20406:=20PR=20#2827=20=E2=80=94=20fix(sta?= =?UTF-8?q?te-sync):=20pass=20profile=20explicitly=20so=20background-threa?= =?UTF-8?q?d=20DB=20writes=20hit=20the=20right=20state.db=20(#2762)=20by?= =?UTF-8?q?=20@Koraji95-coder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/state_sync.py | 97 +++++- api/streaming.py | 6 + ...test_issue2762_state_sync_profile_kwarg.py | 284 ++++++++++++++++++ 3 files changed, 373 insertions(+), 14 deletions(-) create mode 100644 tests/test_issue2762_state_sync_profile_kwarg.py diff --git a/api/state_sync.py b/api/state_sync.py index fa9c9b33..dbb2873f 100644 --- a/api/state_sync.py +++ b/api/state_sync.py @@ -20,22 +20,78 @@ from pathlib import Path logger = logging.getLogger(__name__) -def _get_state_db(): - """Get a SessionDB instance for the active profile's state.db. - Returns None if hermes_state is not importable or DB is unavailable. - Each caller is responsible for calling db.close() when done. +def _get_state_db(profile: str=None): + """Get a SessionDB instance for a profile's state.db. + + When ``profile`` is provided the function resolves *that* profile's + home directory directly (via ``_resolve_profile_home_for_name``). + If resolution fails (unknown profile name, IO error, etc.) the + function returns ``None`` rather than silently falling back to + ``HERMES_HOME`` — silently routing the write to the wrong DB + would defeat the point of the explicit-profile path (#2762). + + When ``profile`` is None it falls back to the TLS-based + ``get_active_hermes_home()`` lookup for backward compatibility, + with a final ``HERMES_HOME`` fallback only on that path. TLS may be + unset in background/worker threads, in which case the lookup falls + through to the process-global active profile and can write to the + wrong DB. Callers that know the session's profile (e.g. + ``sync_session_usage`` after a stream completes on a background + thread) should pass it explicitly to avoid that race. + + Returns None if hermes_state is not importable, the explicit + profile cannot be resolved, or the DB is unavailable. Each caller + is responsible for calling db.close() when done. """ try: from hermes_state import SessionDB except ImportError: return None - try: - from api.profiles import get_active_hermes_home - hermes_home = Path(get_active_hermes_home()).expanduser().resolve() - except Exception: - logger.debug("Failed to resolve hermes home, using default") - hermes_home = Path(os.getenv('HERMES_HOME', str(Path.home() / '.hermes'))) + if profile is not None: + # Explicit-profile path — a resolution failure here MUST NOT + # silently fall back to HERMES_HOME or the caller's "write to + # the named profile" contract is broken (the original #2762 + # symptom: writes leaking into the wrong profile's state.db). + # + # Defense-in-depth (per #2827 maintainer review): validate the + # name shape BEFORE handing it to ``_resolve_profile_home_for_name``. + # The resolver itself rarely raises — for an invalid-but-non- + # malicious name (e.g. one that fails ``_PROFILE_ID_RE``) it + # quietly returns ``_DEFAULT_HERMES_HOME``, which is the exact + # leak we're trying to prevent on the explicit-profile path. + # Validating up-front turns that quiet leak into an explicit + # "refuse + log + return None" so the contract is "write to + # the EXACT named profile, or write nowhere." + try: + from api.profiles import ( + _resolve_profile_home_for_name, + _PROFILE_ID_RE, + _is_root_profile, + ) + if not (_is_root_profile(profile) or _PROFILE_ID_RE.fullmatch(profile)): + logger.warning( + "state_sync: refusing invalid profile name %r — skipping " + "write rather than leaking to the default state.db (#2762).", + profile, + ) + return None + hermes_home = Path(_resolve_profile_home_for_name(profile)).expanduser().resolve() + except Exception: + logger.warning( + "state_sync: could not resolve profile %r — skipping write rather " + "than leaking to the active profile (#2762).", profile, + ) + return None + else: + # Implicit / TLS-fallback path — preserves pre-#2762 behavior + # for any caller that doesn't pass profile= explicitly. + try: + from api.profiles import get_active_hermes_home + hermes_home = Path(get_active_hermes_home()).expanduser().resolve() + except Exception: + logger.debug("Failed to resolve hermes home, using default") + hermes_home = Path(os.getenv('HERMES_HOME', str(Path.home() / '.hermes'))) db_path = hermes_home / 'state.db' if not db_path.exists(): @@ -48,11 +104,16 @@ def _get_state_db(): return None -def sync_session_start(session_id: str, model=None) -> None: +def sync_session_start(session_id: str, model=None, profile: str=None) -> None: """Register a WebUI session in state.db (idempotent). Called when a session's first message is sent. + + ``profile`` lets the caller name the target state.db explicitly, + avoiding the TLS-vs-background-thread mismatch in #2762. When + omitted, the active profile is resolved from TLS (then process + globals) as before. """ - db = _get_state_db() + db = _get_state_db(profile=profile) if not db: return try: @@ -72,12 +133,20 @@ def sync_session_start(session_id: str, model=None) -> None: def sync_session_usage(session_id: str, input_tokens: int=0, output_tokens: int=0, estimated_cost=None, model=None, title: str=None, - message_count: int=None) -> None: + message_count: int=None, profile: str=None) -> None: """Update token usage and title for a WebUI session in state.db. Called after each turn completes. Uses absolute=True to set totals (the WebUI Session already accumulates across turns). + + ``profile`` lets the caller name the target state.db explicitly, + which is what fixes #2762: this function is invoked from the + agent streaming worker thread, where the request-thread's TLS + profile context has not been propagated. Without an explicit + profile, the TLS lookup falls back to the process-global active + profile and writes the session's usage to the wrong state.db + (e.g. ``hiyuki``'s instead of the cookie-switched ``maiko``'s). """ - db = _get_state_db() + db = _get_state_db(profile=profile) if not db: return try: diff --git a/api/streaming.py b/api/streaming.py index 5718ee1c..cd6cb960 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -5070,6 +5070,12 @@ def _run_agent_streaming( model=model, title=s.title, message_count=len(s.messages), + # #2762: pass the session's profile explicitly so the + # background-thread state.db lookup doesn't fall + # through to the process-global active profile and + # write to the wrong DB (TLS profile is set on the + # HTTP thread but not propagated to this worker). + profile=getattr(s, 'profile', None), ) except Exception: logger.debug("Failed to sync session to insights") diff --git a/tests/test_issue2762_state_sync_profile_kwarg.py b/tests/test_issue2762_state_sync_profile_kwarg.py new file mode 100644 index 00000000..c112fead --- /dev/null +++ b/tests/test_issue2762_state_sync_profile_kwarg.py @@ -0,0 +1,284 @@ +""" +Regression test for #2762 — state_sync writes to wrong profile's state.db +when profile is switched via WebUI cookie. + +Root cause: ``_get_state_db()`` relied on TLS-based +``get_active_hermes_home()`` to pick the DB path. TLS gets set on the HTTP +thread by the cookie middleware, but the agent streaming worker thread that +calls ``sync_session_usage`` does NOT inherit that TLS, so the lookup falls +through to the process-global active profile and writes to the wrong DB. + +Fix: ``_get_state_db(profile=...)`` accepts an explicit profile name and +resolves *that* profile's home directly via +``_resolve_profile_home_for_name``. Callers that know the session's profile +(e.g. ``sync_session_usage`` after streaming completes) pass it explicitly, +avoiding the TLS race. + +These tests pin: + 1. ``_get_state_db(profile='X')`` resolves X's home, not the active profile's. + 2. ``sync_session_usage(..., profile='X')`` writes to X's state.db only, + even when the global active profile is set to Y. + 3. ``sync_session_usage`` with no profile kwarg falls back to the old + TLS behavior (so existing callers don't regress). +""" +from __future__ import annotations + +import sys +import sqlite3 +from pathlib import Path + +import pytest + + +# Make sure we can import the api package the same way the server does. +_REPO_ROOT = Path(__file__).resolve().parent.parent +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + + +@pytest.fixture() +def two_profile_homes(tmp_path, monkeypatch): + """Stand up two minimal profile homes with state.db initialized via + ``hermes_state.SessionDB`` itself (so the schema matches what the + production code expects — `sync_session_usage` does a raw-SQL + UPDATE of `message_count`, which hand-rolled schemas could miss). + Per Copilot review on PR #2827. + """ + # Skip the fixture cleanly if the production package isn't importable + # in this env — same gate the tests below use. + pytest.importorskip("hermes_state") + from hermes_state import SessionDB + + hiyuki_home = tmp_path / 'hiyuki' + maiko_home = tmp_path / 'maiko' + for home in (hiyuki_home, maiko_home): + home.mkdir(parents=True) + # Touch state.db then open via SessionDB so its own constructor + # runs whatever schema init / migration the production code + # would see at runtime. Then close immediately — each test + # opens its own handle through the production code path. + (home / 'state.db').touch() + SessionDB(home / 'state.db').close() + + # Stub api.profiles to return our temp paths + import api.profiles as profiles_mod + + def fake_resolve(name): + if name == 'hiyuki': + return hiyuki_home + if name == 'maiko': + return maiko_home + raise LookupError(name) + + monkeypatch.setattr(profiles_mod, '_resolve_profile_home_for_name', fake_resolve) + # Active profile is hiyuki — the WRONG one for tests that pass profile='maiko' + monkeypatch.setattr(profiles_mod, 'get_active_hermes_home', lambda: hiyuki_home) + + return {'hiyuki': hiyuki_home, 'maiko': maiko_home} + + +def _read_session(db_path: Path, session_id: str): + if not db_path.exists(): + return None + conn = sqlite3.connect(db_path) + try: + cur = conn.execute( + "SELECT session_id, title, input_tokens, output_tokens " + "FROM sessions WHERE session_id = ?", + (session_id,), + ) + row = cur.fetchone() + return row + finally: + conn.close() + + +def test_get_state_db_honors_explicit_profile_kwarg(two_profile_homes): + """_get_state_db(profile='maiko') resolves to maiko's home, NOT + the active profile (hiyuki).""" + from api.state_sync import _get_state_db + + # Some installs ship without the hermes_state package; the function + # returns None gracefully and there's nothing to assert. + try: + import hermes_state # noqa: F401 + except ImportError: + pytest.skip("hermes_state package not available in this test env") + + db = _get_state_db(profile='maiko') + if db is None: + pytest.skip("SessionDB could not open the test db (env issue)") + # We don't have a public accessor for the underlying path on SessionDB, + # but writing through it and reading the raw file should work. + db.ensure_session(session_id='probe-2762', source='webui', model='test') + try: + db.close() + except Exception: + pass + + # maiko's state.db should have the row; hiyuki's should not. + assert _read_session(two_profile_homes['maiko'] / 'state.db', 'probe-2762') is not None, \ + "session was not written to maiko's state.db" + assert _read_session(two_profile_homes['hiyuki'] / 'state.db', 'probe-2762') is None, \ + "session leaked into hiyuki's state.db — TLS-fallback regressed" + + +def test_sync_session_usage_writes_only_to_named_profile(two_profile_homes): + """sync_session_usage(..., profile='maiko') is the actual scenario from + the streaming worker thread post-#2762. The write must land in maiko's + state.db only, regardless of what the global active profile is.""" + try: + import hermes_state # noqa: F401 + except ImportError: + pytest.skip("hermes_state package not available in this test env") + + from api.state_sync import sync_session_usage + + sync_session_usage( + session_id='2762-regression', + input_tokens=42, + output_tokens=17, + estimated_cost=0.001, + model='test-model', + title='2762 regression test', + message_count=3, + profile='maiko', + ) + + maiko_row = _read_session(two_profile_homes['maiko'] / 'state.db', '2762-regression') + hiyuki_row = _read_session(two_profile_homes['hiyuki'] / 'state.db', '2762-regression') + + assert maiko_row is not None, \ + "sync_session_usage(profile='maiko') did not write to maiko's state.db" + assert hiyuki_row is None, \ + "sync_session_usage(profile='maiko') leaked into hiyuki's state.db — #2762 regression" + + +def test_sync_session_usage_without_profile_kwarg_uses_active(two_profile_homes): + """Backward compatibility: when called without a profile kwarg (the + pre-#2762 call shape), the function falls back to the active profile + (here: hiyuki). Existing callers should not regress.""" + try: + import hermes_state # noqa: F401 + except ImportError: + pytest.skip("hermes_state package not available in this test env") + + from api.state_sync import sync_session_usage + + sync_session_usage( + session_id='legacy-call-shape', + input_tokens=1, + output_tokens=2, + model='legacy', + title='legacy', + message_count=1, + ) + + hiyuki_row = _read_session(two_profile_homes['hiyuki'] / 'state.db', 'legacy-call-shape') + assert hiyuki_row is not None, \ + "sync_session_usage() without profile= regressed: did not write to active profile's state.db" + + +def test_unknown_explicit_profile_returns_none_not_falls_back(two_profile_homes): + """Copilot review of PR #2827: when ``profile`` is explicit and + resolution fails (e.g. typoed profile name, IO error), the + function MUST return None rather than silently fall back to + HERMES_HOME and write to the wrong DB. That fallback would + re-introduce the exact #2762 symptom (writes leaking into the + active profile). + + The fixture's `fake_resolve` raises LookupError for any name + that isn't 'hiyuki' or 'maiko', so passing 'does-not-exist' + here exercises the failure path. + """ + try: + import hermes_state # noqa: F401 + except ImportError: + pytest.skip("hermes_state package not available in this test env") + + from api.state_sync import sync_session_usage + + # Passing an unknown profile name MUST NOT cause a write to land in + # hiyuki (the active profile's home). If we leaked there, that's + # the exact bug we're guarding against. + sync_session_usage( + session_id='unknown-profile-probe', + input_tokens=99, + output_tokens=99, + model='probe', + title='probe', + message_count=1, + profile='does-not-exist', + ) + + hiyuki_row = _read_session(two_profile_homes['hiyuki'] / 'state.db', 'unknown-profile-probe') + maiko_row = _read_session(two_profile_homes['maiko'] / 'state.db', 'unknown-profile-probe') + + assert hiyuki_row is None, ( + "unknown explicit profile leaked write into hiyuki's state.db — #2762 regression" + ) + assert maiko_row is None, ( + "unknown explicit profile somehow ended up in maiko's state.db" + ) + + +@pytest.mark.parametrize("bad_name", [ + "../etc", # path traversal attempt + "Foo Bar", # space — invalid chars + "FOO", # uppercase — invalid per _PROFILE_ID_RE + "-leading-dash", # leading dash — invalid per regex (must start [a-z0-9]) + "_underscore", # leading underscore — invalid per regex + "a" * 100, # too long (> 64 chars) + "", # empty string is handled by _is_root_profile, separate case +]) +def test_invalid_profile_name_refused_not_falls_back(two_profile_homes, bad_name): + """Per PR #2827 maintainer review: an invalid-but-non-malicious + profile name on the explicit-profile path must be REFUSED, not + quietly routed to the default state.db. + + Before this defense, ``_resolve_profile_home_for_name`` would return + ``_DEFAULT_HERMES_HOME`` for any name failing ``_PROFILE_ID_RE`` + without raising — which is the exact #2762 leak symptom with a + different trigger. The new regex check up-front turns that quiet + leak into an explicit "refuse + log + return None" so the + explicit-path contract is "write to the EXACT named profile, or + write nowhere." + + The empty string is intentionally in the parametrize set because + we want to confirm it's refused — ``_is_root_profile('')`` returns + False (per ``api/profiles.py:216-217`` it short-circuits on falsy + input), so an empty explicit profile fails both the + ``_is_root_profile`` check and the regex, and the contract refuses + the write. That's the expected behavior — an empty explicit name + is itself a bug at the caller, not "I want the default." + """ + try: + import hermes_state # noqa: F401 + except ImportError: + pytest.skip("hermes_state package not available in this test env") + + from api.state_sync import sync_session_usage + + sync_session_usage( + session_id=f'invalid-name-probe-{abs(hash(bad_name))}', + input_tokens=99, + output_tokens=99, + model='probe', + title='probe', + message_count=1, + profile=bad_name, + ) + + sid = f'invalid-name-probe-{abs(hash(bad_name))}' + hiyuki_row = _read_session(two_profile_homes['hiyuki'] / 'state.db', sid) + maiko_row = _read_session(two_profile_homes['maiko'] / 'state.db', sid) + + # All invalid names (including empty string) MUST be refused — + # no row should appear in either profile's state.db. + assert hiyuki_row is None, ( + f"invalid profile name {bad_name!r} leaked write into hiyuki's " + "state.db — defense missed; #2762 regression" + ) + assert maiko_row is None, ( + f"invalid profile name {bad_name!r} somehow ended up in maiko's state.db" + )