From ea5b4ec2a0a0f3f8b8888d6f8bb1fec6fcc2b9fc Mon Sep 17 00:00:00 2001 From: aqilaziz Date: Sat, 16 May 2026 00:41:23 +0700 Subject: [PATCH] fix(gateway): quiet corrupt kanban dispatcher boards --- gateway/run.py | 48 ++++++++++++ .../test_kanban_core_functionality.py | 76 +++++++++++++++++++ tests/run_agent/test_provider_parity.py | 22 +++++- 3 files changed, 142 insertions(+), 4 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index f9a282a413..5fd0c6ceef 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -37,6 +37,7 @@ import signal import tempfile import threading import time +import sqlite3 from collections import OrderedDict from contextvars import copy_context from pathlib import Path @@ -4678,6 +4679,28 @@ class GatewayRunner: HEALTH_WINDOW = 6 bad_ticks = 0 last_warn_at = 0 + disabled_corrupt_boards: dict[str, tuple[str, int | None, int | None]] = {} + + def _board_db_fingerprint(slug: str) -> tuple[str, int | None, int | None]: + path = _kb.kanban_db_path(slug) + try: + resolved = str(path.expanduser().resolve()) + except Exception: + resolved = str(path) + try: + stat = path.stat() + except OSError: + return (resolved, None, None) + return (resolved, stat.st_mtime_ns, stat.st_size) + + def _is_corrupt_board_db_error(exc: Exception) -> bool: + if not isinstance(exc, sqlite3.DatabaseError): + return False + msg = str(exc).lower() + return ( + "file is not a database" in msg + or "database disk image is malformed" in msg + ) def _tick_once_for_board(slug: str) -> "Optional[object]": """Run one dispatch_once for a specific board. @@ -4689,6 +4712,16 @@ class GatewayRunner: connection handle or accidentally claim across each other. """ conn = None + fingerprint = _board_db_fingerprint(slug) + disabled_fingerprint = disabled_corrupt_boards.get(slug) + if disabled_fingerprint == fingerprint: + return None + if disabled_fingerprint is not None: + logger.info( + "kanban dispatcher: board %s database changed; retrying dispatch", + slug, + ) + disabled_corrupt_boards.pop(slug, None) try: conn = _kb.connect(board=slug) # `connect()` runs the schema + idempotent migration on @@ -4703,6 +4736,21 @@ class GatewayRunner: max_spawn=max_spawn, failure_limit=failure_limit, ) + except sqlite3.DatabaseError as exc: + if _is_corrupt_board_db_error(exc): + disabled_corrupt_boards[slug] = fingerprint + logger.error( + "kanban dispatcher: board %s database %s is not a valid " + "SQLite database; disabling dispatch for this board " + "until the file changes or the gateway restarts. Move " + "or restore the file, then run `hermes kanban init` if " + "you need a fresh board.", + slug, + fingerprint[0], + ) + return None + logger.exception("kanban dispatcher: tick failed on board %s", slug) + return None except Exception: logger.exception("kanban dispatcher: tick failed on board %s", slug) return None diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py index 17252af827..60aa186d00 100644 --- a/tests/hermes_cli/test_kanban_core_functionality.py +++ b/tests/hermes_cli/test_kanban_core_functionality.py @@ -3414,6 +3414,82 @@ def test_gateway_dispatcher_watcher_env_truthy_uses_config(monkeypatch): ) +def test_gateway_dispatcher_disables_corrupt_board_without_traceback( + monkeypatch, tmp_path, caplog +): + """Corrupt board DBs log one actionable error and stop retrying per tick.""" + import asyncio + import logging + import sqlite3 + + from gateway.run import GatewayRunner + import hermes_cli.config as _cfg_mod + import hermes_cli.kanban_db as _kb + + runner = object.__new__(GatewayRunner) + runner._running = True + corrupt_db = tmp_path / "kanban.db" + corrupt_db.write_text("not sqlite", encoding="utf-8") + + monkeypatch.setattr( + _cfg_mod, + "load_config", + lambda: { + "kanban": { + "dispatch_in_gateway": True, + "dispatch_interval_seconds": 1, + } + }, + ) + monkeypatch.setattr( + _kb, + "list_boards", + lambda include_archived=False: [{"slug": _kb.DEFAULT_BOARD}], + ) + monkeypatch.setattr( + _kb, + "read_board_metadata", + lambda slug: {"slug": slug}, + ) + monkeypatch.setattr(_kb, "kanban_db_path", lambda board=None: corrupt_db) + + calls = {"connect": 0, "to_thread": 0} + + def _connect(*args, **kwargs): + calls["connect"] += 1 + raise sqlite3.DatabaseError("file is not a database") + + async def _to_thread(fn, *args, **kwargs): + calls["to_thread"] += 1 + result = fn(*args, **kwargs) + if calls["to_thread"] >= 4: + runner._running = False + return result + + async def _sleep(_delay): + return None + + monkeypatch.setattr(_kb, "connect", _connect) + monkeypatch.setattr("gateway.run.asyncio.to_thread", _to_thread) + monkeypatch.setattr("gateway.run.asyncio.sleep", _sleep) + + with caplog.at_level(logging.ERROR, logger="gateway.run"): + asyncio.run( + asyncio.wait_for( + runner._kanban_dispatcher_watcher(), + timeout=3.0, + ) + ) + + messages = [record.getMessage() for record in caplog.records] + assert sum("not a valid SQLite database" in msg for msg in messages) == 1 + assert not any("tick failed on board" in msg for msg in messages) + assert not any(record.exc_info for record in caplog.records) + # First tick connect + two ready-queue probes. The second dispatch tick + # skips connect because the corrupt board fingerprint is disabled. + assert calls["connect"] == 3 + + # --------------------------------------------------------------------------- # Hallucination gate (created_cards verify + prose scan) # --------------------------------------------------------------------------- diff --git a/tests/run_agent/test_provider_parity.py b/tests/run_agent/test_provider_parity.py index c65c22004a..cf619ea974 100644 --- a/tests/run_agent/test_provider_parity.py +++ b/tests/run_agent/test_provider_parity.py @@ -254,8 +254,12 @@ class TestDeveloperRoleSwap: assert messages[0]["role"] == "system" def test_developer_role_via_nous_portal(self, monkeypatch): - agent = _make_agent(monkeypatch, "nous", base_url="https://inference-api.nousresearch.com/v1") - agent.model = "gpt-5" + agent = _make_agent( + monkeypatch, + "nous", + base_url="https://inference-api.nousresearch.com/v1", + model="gpt-5", + ) messages = [ {"role": "system", "content": "You are helpful."}, {"role": "user", "content": "hi"}, @@ -346,14 +350,24 @@ class TestBuildApiKwargsAIGateway: class TestBuildApiKwargsNousPortal: def test_includes_nous_product_tags(self, monkeypatch): from agent.portal_tags import nous_portal_tags - agent = _make_agent(monkeypatch, "nous", base_url="https://inference-api.nousresearch.com/v1") + agent = _make_agent( + monkeypatch, + "nous", + base_url="https://inference-api.nousresearch.com/v1", + model="gpt-5", + ) messages = [{"role": "user", "content": "hi"}] kwargs = agent._build_api_kwargs(messages) extra = kwargs.get("extra_body", {}) assert extra.get("tags") == nous_portal_tags() def test_uses_chat_completions_format(self, monkeypatch): - agent = _make_agent(monkeypatch, "nous", base_url="https://inference-api.nousresearch.com/v1") + agent = _make_agent( + monkeypatch, + "nous", + base_url="https://inference-api.nousresearch.com/v1", + model="gpt-5", + ) messages = [{"role": "user", "content": "hi"}] kwargs = agent._build_api_kwargs(messages) assert "messages" in kwargs