mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-28 12:40:26 +00:00
fix(config): exclude credential-rotation fields from models-cache auth.json fingerprint
auth.json is rewritten by credential-pool/OAuth token refresh roughly every 14 minutes. _models_cache_source_fingerprint() hashed it via mtime/size (#1699 _models_cache_file_fingerprint), so every token refresh churned the fingerprint and the 24h /api/models cache was effectively dead -- the hot GET /api/session?resolve_model=1 path paid a cold ~11.5s rebuild every few minutes (RCA t_d127953d residual #2, t_16551f61). Add _auth_store_semantic_fingerprint(): content-hash auth.json with a DENY-list of known credential-rotation-only keys (access/refresh token, expiry, per-credential status/telemetry, request_count, save updated_at) stripped. Deny-list (not allow-list) is deliberate -- any unknown field, or a real provider/endpoint/model-set change (active_provider, a new credential_pool entry, base_url, source, label, auth_type, the providers{} block, ...) stays in the fingerprint and still correctly busts the cache. Conservative fallbacks: missing file -> marked; unreadable/corrupt -> stat-based fallback (never less safe than pre-fix). config.yaml keeps the cheap stat fingerprint (deliberate edits, no timer churn). Bidirectional invariant regression test (non-tautological -- the end-to-end churn test flips RED when the auth_json axis is reverted to stat-based): token-only churn keeps fingerprint byte-identical AND keeps a valid disk cache loadable; active_provider change / new credential_pool entry / changed base_url each flip the fingerprint AND reject the stale disk cache. Measured: 5/5 cold rebuilds per 5 refresh cycles -> 0/5. Tests: 9 new pass; 28 adjacent (#1699/#1633/display-resolver) pass; 54 models_cache/fingerprint suite pass.
This commit is contained in:
+122
-2
@@ -2437,11 +2437,131 @@ def _models_cache_catalog_fingerprint() -> dict:
|
||||
}
|
||||
|
||||
|
||||
# Credential-rotation fields inside auth.json that churn on a ~14-minute
|
||||
# period (credential-pool / OAuth token refresh rewrites the whole file) but
|
||||
# DO NOT change the set of available providers or models that /api/models
|
||||
# returns. mtime/size-based fingerprinting (#1699's _models_cache_file_
|
||||
# fingerprint) treats every one of these rewrites as a cache-invalidating
|
||||
# change, so the 24h models cache is effectively dead — every few minutes a
|
||||
# tab pays a full cold get_available_models() rebuild (see RCA t_d127953d /
|
||||
# t_16551f61). We strip ONLY these known-inert fields and fingerprint the
|
||||
# rest of auth.json by content, so token rotation no longer busts the cache.
|
||||
#
|
||||
# This is a DENY-list, not an allow-list, on purpose: a field we don't know
|
||||
# about stays IN the fingerprint, so any genuine change to provider
|
||||
# enablement / endpoint / api-base / model-allow (active_provider, a NEW
|
||||
# credential_pool entry id, base_url, source, label, key_source, auth_type,
|
||||
# priority, the providers{} block, …) still correctly invalidates the cache.
|
||||
# The safety invariant is one-directional: excluding these fields can only
|
||||
# ever make the fingerprint MORE stable, never make it miss a real
|
||||
# provider/model-set change — because none of these fields feed
|
||||
# detected_providers / the catalog in _build_available_models_uncached().
|
||||
_AUTH_FINGERPRINT_VOLATILE_KEYS = frozenset({
|
||||
# Secret material — rotates on refresh, never gates the provider/model set.
|
||||
"access_token",
|
||||
"refresh_token",
|
||||
"id_token",
|
||||
"api_key",
|
||||
"secret",
|
||||
"client_secret",
|
||||
# Expiry / liveness — bumped every refresh, derived from the token above.
|
||||
"expires_at",
|
||||
"expires_at_ms",
|
||||
"expires_in",
|
||||
# Per-credential status/telemetry — churns on every request, not config.
|
||||
"last_status",
|
||||
"last_status_at",
|
||||
"last_error_code",
|
||||
"last_error_reason",
|
||||
"last_error_message",
|
||||
"last_error_reset_at",
|
||||
"request_count",
|
||||
# Whole-file save timestamp — rewritten on every _save_auth_store().
|
||||
"updated_at",
|
||||
})
|
||||
|
||||
|
||||
def _strip_volatile_auth_fields(obj):
|
||||
"""Recursively drop credential-rotation-only keys from an auth.json tree.
|
||||
|
||||
Pure structural transform; never mutates the input. Any key NOT in the
|
||||
deny-list is preserved verbatim so real provider/endpoint changes still
|
||||
show through in the fingerprint.
|
||||
"""
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
k: _strip_volatile_auth_fields(v)
|
||||
for k, v in obj.items()
|
||||
if k not in _AUTH_FINGERPRINT_VOLATILE_KEYS
|
||||
}
|
||||
if isinstance(obj, list):
|
||||
return [_strip_volatile_auth_fields(v) for v in obj]
|
||||
return obj
|
||||
|
||||
|
||||
def _auth_store_semantic_fingerprint(path: Path) -> dict:
|
||||
"""Return a content fingerprint of auth.json that ignores token churn.
|
||||
|
||||
Unlike _models_cache_file_fingerprint() (mtime_ns + size), this hashes
|
||||
the JSON content with the credential-rotation fields stripped, so the
|
||||
~14-min token-refresh rewrite of auth.json does NOT invalidate the 24h
|
||||
/api/models cache. A change to anything that actually affects the
|
||||
provider/model set (active_provider, a new credential_pool entry, a
|
||||
changed base_url/source/label/auth_type, the providers{} block, …)
|
||||
still changes the hash and correctly busts the cache.
|
||||
|
||||
Failure modes are deliberately conservative — if the file is missing we
|
||||
record that, and if it can't be read/parsed we fall back to the old
|
||||
mtime/size fingerprint so behaviour is never *less* safe than #1699.
|
||||
"""
|
||||
p = Path(path).expanduser()
|
||||
fp: dict = {"path": str(p)}
|
||||
try:
|
||||
st = p.stat()
|
||||
except OSError:
|
||||
fp["missing"] = True
|
||||
return fp
|
||||
try:
|
||||
raw = json.loads(p.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
# Unreadable / corrupt / mid-write: fall back to the stat-based
|
||||
# fingerprint. Strictly no less safe than the pre-fix behaviour
|
||||
# (every write still invalidates) for this rare path only.
|
||||
fp["mtime_ns"] = st.st_mtime_ns
|
||||
fp["size"] = st.st_size
|
||||
fp["semantic"] = "unparsed-fallback"
|
||||
return fp
|
||||
stripped = _strip_volatile_auth_fields(raw)
|
||||
try:
|
||||
encoded = json.dumps(
|
||||
stripped,
|
||||
sort_keys=True,
|
||||
separators=(",", ":"),
|
||||
ensure_ascii=True,
|
||||
default=str,
|
||||
).encode("utf-8")
|
||||
fp["semantic_sha256"] = hashlib.sha256(encoded).hexdigest()
|
||||
except Exception:
|
||||
fp["mtime_ns"] = st.st_mtime_ns
|
||||
fp["size"] = st.st_size
|
||||
fp["semantic"] = "encode-fallback"
|
||||
return fp
|
||||
|
||||
|
||||
def _models_cache_source_fingerprint() -> dict:
|
||||
"""Return the current config/auth/catalog fingerprint for /api/models cache."""
|
||||
"""Return the current config/auth/catalog fingerprint for /api/models cache.
|
||||
|
||||
The auth.json axis uses a *content* fingerprint that excludes pure
|
||||
credential-rotation fields (see _auth_store_semantic_fingerprint): the
|
||||
auth store is rewritten roughly every 14 minutes by token refresh, and
|
||||
a stat-based (mtime/size) fingerprint made the 24h cache churn on every
|
||||
one of those rewrites (RCA t_16551f61). config.yaml keeps the cheap
|
||||
mtime/size fingerprint because it is only rewritten on deliberate user
|
||||
edits (which can change anything) and does not churn on a timer.
|
||||
"""
|
||||
return {
|
||||
"config_yaml": _models_cache_file_fingerprint(_get_config_path()),
|
||||
"auth_json": _models_cache_file_fingerprint(_get_auth_store_path()),
|
||||
"auth_json": _auth_store_semantic_fingerprint(_get_auth_store_path()),
|
||||
"catalog": _models_cache_catalog_fingerprint(),
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,295 @@
|
||||
"""Regression tests for RCA t_16551f61: auth.json ~14-min token churn must
|
||||
NOT bust the 24h /api/models cache, but a real provider/model-set change
|
||||
still must.
|
||||
|
||||
Bug shape (chained from RCA t_d127953d residual #2): the models cache has a
|
||||
24h TTL, but its _source_fingerprint hashed auth.json via mtime/size
|
||||
(#1699's _models_cache_file_fingerprint). credential-pool / OAuth token
|
||||
refresh rewrites the WHOLE auth.json roughly every 14 minutes (RCA observed
|
||||
17:12:51 -> 17:26:31), bumping mtime+size every time. So the fingerprint
|
||||
mismatched every ~14 min, the 24h cache was effectively dead, and the hot
|
||||
GET /api/session?resolve_model=1 path paid a cold ~11.5s rebuild over and
|
||||
over.
|
||||
|
||||
Fix: fingerprint auth.json by *content* with the credential-rotation fields
|
||||
deny-listed (_auth_store_semantic_fingerprint). Token rotation is invisible
|
||||
to the fingerprint; anything that changes the available provider/model set
|
||||
still changes it.
|
||||
|
||||
These tests are INVARIANTS, not change-detectors:
|
||||
* churn-immune — rewriting only volatile token fields keeps the
|
||||
fingerprint byte-identical AND keeps a valid disk cache
|
||||
loadable.
|
||||
* real-change-invalidates — active_provider change, a NEW credential_pool
|
||||
entry, and a changed base_url each flip the fingerprint
|
||||
AND reject the previously-valid disk cache.
|
||||
|
||||
A test that only asserted "fingerprint stable on churn" would pass even if
|
||||
the function hard-coded `return {}` — hence the paired real-change half and
|
||||
the explicit assertion that the two fingerprints actually differ on a real
|
||||
change. Verified non-tautological by reverting the fix (see report.md): the
|
||||
churn-immune half then fails.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
import api.config as config
|
||||
|
||||
|
||||
# ── auth.json builders ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _auth_store(*, active_provider="anthropic", extra_pool_provider=None,
|
||||
anthropic_base_url="https://api.anthropic.com",
|
||||
access_token="tok-AAAA", request_count=1):
|
||||
"""A realistic auth.json shaped like the production researcher profile
|
||||
(version / providers / active_provider / credential_pool / updated_at)."""
|
||||
pool = {
|
||||
"anthropic": [
|
||||
{
|
||||
"id": "anthropic-oauth-1",
|
||||
"label": "Claude Max",
|
||||
"auth_type": "oauth",
|
||||
"priority": 0,
|
||||
"source": "claude-oauth",
|
||||
"access_token": access_token,
|
||||
"refresh_token": "refresh-" + access_token,
|
||||
"last_status": "ok",
|
||||
"last_status_at": None,
|
||||
"last_error_code": None,
|
||||
"expires_at_ms": 1779152637000,
|
||||
"base_url": anthropic_base_url,
|
||||
"request_count": request_count,
|
||||
}
|
||||
],
|
||||
"openrouter": [
|
||||
{
|
||||
"id": "openrouter-key-1",
|
||||
"label": "OpenRouter",
|
||||
"auth_type": "api_key",
|
||||
"priority": 1,
|
||||
"source": "env",
|
||||
"access_token": "or-" + access_token,
|
||||
"last_status": "ok",
|
||||
"base_url": "https://openrouter.ai/api/v1",
|
||||
"request_count": request_count,
|
||||
}
|
||||
],
|
||||
}
|
||||
if extra_pool_provider:
|
||||
pool[extra_pool_provider] = [
|
||||
{
|
||||
"id": f"{extra_pool_provider}-key-1",
|
||||
"label": extra_pool_provider.title(),
|
||||
"auth_type": "api_key",
|
||||
"priority": 2,
|
||||
"source": "manual",
|
||||
"access_token": "new-provider-token",
|
||||
"base_url": f"https://api.{extra_pool_provider}.test/v1",
|
||||
"request_count": 0,
|
||||
}
|
||||
]
|
||||
return {
|
||||
"version": 2,
|
||||
"providers": {},
|
||||
"active_provider": active_provider,
|
||||
"credential_pool": pool,
|
||||
"updated_at": "2026-05-19T01:00:00+00:00",
|
||||
}
|
||||
|
||||
|
||||
def _write_auth(tmp_path, monkeypatch, store: dict):
|
||||
"""Point _get_auth_store_path() at a tmp auth.json and write `store`.
|
||||
|
||||
Sleeps 10ms and re-stats to guarantee mtime_ns/size would differ between
|
||||
successive writes — proving the OLD stat-based fingerprint WOULD have
|
||||
churned, so a stable result is attributable to the content fix, not to
|
||||
the OS coincidentally reusing mtime/size.
|
||||
"""
|
||||
auth_path = tmp_path / "auth.json"
|
||||
auth_path.write_text(json.dumps(store, indent=2), encoding="utf-8")
|
||||
monkeypatch.setattr(config, "_get_auth_store_path", lambda: auth_path)
|
||||
return auth_path
|
||||
|
||||
|
||||
# ── churn-immune invariant ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_token_only_churn_keeps_auth_fingerprint_identical(tmp_path, monkeypatch):
|
||||
"""Rotating ONLY the volatile credential fields (access/refresh token,
|
||||
expiry, status, request_count, updated_at) must not change the auth.json
|
||||
fingerprint at all — even though mtime_ns and size both change."""
|
||||
p = _write_auth(tmp_path, monkeypatch, _auth_store(access_token="tok-AAAA",
|
||||
request_count=1))
|
||||
st1 = p.stat()
|
||||
fp1 = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
time.sleep(0.01)
|
||||
# Same logical config, fresh tokens + bumped counters + new save ts.
|
||||
bigger = _auth_store(access_token="tok-BBBBBBBBBBBBBBBBBBBB",
|
||||
request_count=999)
|
||||
bigger["updated_at"] = "2026-05-19T01:14:00+00:00"
|
||||
p.write_text(json.dumps(bigger, indent=2), encoding="utf-8")
|
||||
st2 = p.stat()
|
||||
fp2 = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
# Precondition: the OLD stat-based fingerprint WOULD have churned.
|
||||
assert (st1.st_mtime_ns, st1.st_size) != (st2.st_mtime_ns, st2.st_size)
|
||||
# Invariant: the content fingerprint does NOT churn.
|
||||
assert fp1 == fp2
|
||||
assert "semantic_sha256" in fp1
|
||||
|
||||
|
||||
def test_token_churn_does_not_reject_valid_disk_models_cache(tmp_path, monkeypatch):
|
||||
"""End-to-end: a disk models cache saved before a token refresh stays
|
||||
loadable after the refresh (the actual user-visible bug — cold rebuild
|
||||
every ~14 min)."""
|
||||
_write_auth(tmp_path, monkeypatch, _auth_store(access_token="tok-1"))
|
||||
monkeypatch.setattr(config, "_models_cache_path",
|
||||
tmp_path / "models_cache.json")
|
||||
|
||||
shape = {
|
||||
"active_provider": "anthropic",
|
||||
"default_model": "claude-opus-4-7",
|
||||
"configured_model_badges": {"claude-opus-4-7": "Anthropic"},
|
||||
"groups": [{"name": "Anthropic", "models": ["claude-opus-4-7"]}],
|
||||
}
|
||||
config._save_models_cache_to_disk(shape)
|
||||
assert config._load_models_cache_from_disk() is not None
|
||||
|
||||
# ~14 minutes later: credential-pool refresh rewrites auth.json.
|
||||
time.sleep(0.01)
|
||||
churned = _auth_store(access_token="tok-2-rotated", request_count=42)
|
||||
churned["updated_at"] = "2026-05-19T01:14:00+00:00"
|
||||
config._get_auth_store_path().write_text(
|
||||
json.dumps(churned, indent=2), encoding="utf-8")
|
||||
|
||||
assert config._load_models_cache_from_disk() is not None, (
|
||||
"Pure token-refresh churn of auth.json must NOT invalidate the 24h "
|
||||
"models cache (RCA t_16551f61)"
|
||||
)
|
||||
|
||||
|
||||
# ── real-change-invalidates invariant ──────────────────────────────────────
|
||||
|
||||
|
||||
def test_active_provider_change_changes_auth_fingerprint(tmp_path, monkeypatch):
|
||||
p = _write_auth(tmp_path, monkeypatch, _auth_store(active_provider="anthropic"))
|
||||
fp_before = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
p.write_text(json.dumps(_auth_store(active_provider="openrouter"), indent=2),
|
||||
encoding="utf-8")
|
||||
fp_after = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
assert fp_before != fp_after, (
|
||||
"Changing active_provider changes the available-provider set and MUST "
|
||||
"bust the cache"
|
||||
)
|
||||
|
||||
|
||||
def test_new_credential_pool_entry_changes_auth_fingerprint(tmp_path, monkeypatch):
|
||||
"""The explicit edge from the task body: adding a credential_pool entry
|
||||
that enables a NEW provider MUST still invalidate the cache."""
|
||||
p = _write_auth(tmp_path, monkeypatch, _auth_store())
|
||||
fp_before = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
p.write_text(json.dumps(_auth_store(extra_pool_provider="deepseek"),
|
||||
indent=2), encoding="utf-8")
|
||||
fp_after = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
assert fp_before != fp_after, (
|
||||
"A new credential_pool provider entry expands the model set and MUST "
|
||||
"bust the cache (task t_16551f61 boundary case)"
|
||||
)
|
||||
|
||||
|
||||
def test_changed_base_url_changes_auth_fingerprint(tmp_path, monkeypatch):
|
||||
"""Endpoint / api-base is a non-volatile field — it stays in the
|
||||
fingerprint, so pointing a provider at a different endpoint invalidates."""
|
||||
p = _write_auth(tmp_path, monkeypatch, _auth_store())
|
||||
fp_before = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
p.write_text(
|
||||
json.dumps(_auth_store(anthropic_base_url="https://proxy.internal/v1"),
|
||||
indent=2),
|
||||
encoding="utf-8")
|
||||
fp_after = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
assert fp_before != fp_after, "A changed provider base_url MUST bust the cache"
|
||||
|
||||
|
||||
def test_real_change_rejects_previously_valid_disk_cache(tmp_path, monkeypatch):
|
||||
"""End-to-end mirror of the churn test: a disk cache that was valid must
|
||||
be REJECTED once a genuine provider-set change lands in auth.json."""
|
||||
_write_auth(tmp_path, monkeypatch, _auth_store(active_provider="anthropic"))
|
||||
monkeypatch.setattr(config, "_models_cache_path",
|
||||
tmp_path / "models_cache.json")
|
||||
config._save_models_cache_to_disk({
|
||||
"active_provider": "anthropic",
|
||||
"default_model": "claude-opus-4-7",
|
||||
"configured_model_badges": {"claude-opus-4-7": "Anthropic"},
|
||||
"groups": [{"name": "Anthropic", "models": ["claude-opus-4-7"]}],
|
||||
})
|
||||
assert config._load_models_cache_from_disk() is not None
|
||||
|
||||
config._get_auth_store_path().write_text(
|
||||
json.dumps(_auth_store(active_provider="openrouter"), indent=2),
|
||||
encoding="utf-8")
|
||||
|
||||
assert config._load_models_cache_from_disk() is None, (
|
||||
"A real active_provider change MUST reject the stale disk cache — the "
|
||||
"fix must not over-stabilise into serving wrong data"
|
||||
)
|
||||
|
||||
|
||||
# ── deny-list helper unit guards ───────────────────────────────────────────
|
||||
|
||||
|
||||
def test_strip_volatile_is_pure_and_recurses():
|
||||
src = {
|
||||
"active_provider": "anthropic",
|
||||
"credential_pool": {
|
||||
"anthropic": [
|
||||
{"id": "x", "base_url": "u", "access_token": "S",
|
||||
"request_count": 7, "last_status": "ok"}
|
||||
]
|
||||
},
|
||||
"updated_at": "ts",
|
||||
}
|
||||
import copy as _copy
|
||||
snapshot = _copy.deepcopy(src)
|
||||
out = config._strip_volatile_auth_fields(src)
|
||||
|
||||
assert src == snapshot # input untouched (pure)
|
||||
entry = out["credential_pool"]["anthropic"][0]
|
||||
assert entry == {"id": "x", "base_url": "u"} # volatile keys gone, rest kept
|
||||
assert "updated_at" not in out
|
||||
assert out["active_provider"] == "anthropic" # non-volatile preserved
|
||||
|
||||
|
||||
def test_unknown_field_stays_in_fingerprint(tmp_path, monkeypatch):
|
||||
"""Deny-list (not allow-list) safety: a field we don't know about is NOT
|
||||
stripped, so a hypothetical future provider-gating field still busts the
|
||||
cache."""
|
||||
base = _auth_store()
|
||||
base["some_future_provider_gate"] = "v1"
|
||||
p = _write_auth(tmp_path, monkeypatch, base)
|
||||
fp1 = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
base["some_future_provider_gate"] = "v2"
|
||||
p.write_text(json.dumps(base, indent=2), encoding="utf-8")
|
||||
fp2 = config._auth_store_semantic_fingerprint(p)
|
||||
|
||||
assert fp1 != fp2, (
|
||||
"An unknown (non-deny-listed) field must remain in the fingerprint — "
|
||||
"the deny-list must fail safe toward over-invalidation"
|
||||
)
|
||||
|
||||
|
||||
def test_missing_auth_file_fingerprint_is_stable_and_marked(tmp_path, monkeypatch):
|
||||
missing = tmp_path / "nope" / "auth.json"
|
||||
monkeypatch.setattr(config, "_get_auth_store_path", lambda: missing)
|
||||
fp = config._auth_store_semantic_fingerprint(missing)
|
||||
assert fp.get("missing") is True
|
||||
assert config._auth_store_semantic_fingerprint(missing) == fp
|
||||
Reference in New Issue
Block a user