From d0c20708cea5e467eda220240bd1b69355a3bfbb Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Fri, 15 May 2026 15:38:50 -0500 Subject: [PATCH] Add dedicated GUI log stream for dashboard debugging. Capture dashboard and PTY websocket lifecycle failures in gui.log and expose it via hermes logs. --- hermes_cli/logs.py | 4 +- hermes_cli/main.py | 15 +++- hermes_cli/web_server.py | 144 ++++++++++++++++++++++++++++++++-- hermes_logging.py | 26 +++++- tests/hermes_cli/test_logs.py | 1 + tests/test_hermes_logging.py | 52 ++++++++++++ 6 files changed, 229 insertions(+), 13 deletions(-) diff --git a/hermes_cli/logs.py b/hermes_cli/logs.py index 9a829a4bdc..d580751b4d 100644 --- a/hermes_cli/logs.py +++ b/hermes_cli/logs.py @@ -10,6 +10,7 @@ Usage examples:: hermes logs -f # follow agent.log in real time hermes logs errors # last 50 lines of errors.log hermes logs gateway -n 100 # last 100 lines of gateway.log + hermes logs gui -f # follow gui.log (dashboard/pty/ws) hermes logs --level WARNING # only WARNING+ lines hermes logs --session abc123 # filter by session ID substring hermes logs --component tools # only tool-related lines @@ -31,6 +32,7 @@ LOG_FILES = { "agent": "agent.log", "errors": "errors.log", "gateway": "gateway.log", + "gui": "gui.log", } # Log line timestamp regex — matches "2026-04-05 22:35:00,123" or @@ -150,7 +152,7 @@ def tail_log( Parameters ---------- log_name - Which log to read: ``"agent"``, ``"errors"``, ``"gateway"``. + Which log to read: ``"agent"``, ``"errors"``, ``"gateway"``, ``"gui"``. num_lines Number of recent lines to show (before follow starts). follow diff --git a/hermes_cli/main.py b/hermes_cli/main.py index e4d846cd58..0cda09db57 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -9421,6 +9421,14 @@ def cmd_dashboard(args): remaining = _find_stale_dashboard_pids() sys.exit(1 if remaining else 0) + # Attach gui.log early so dashboard startup/build failures are captured in + # the same logs directory as every other Hermes surface. + try: + from hermes_logging import setup_logging as _setup_logging_gui + _setup_logging_gui(mode="gui") + except Exception: + pass + try: import fastapi # noqa: F401 import uvicorn # noqa: F401 @@ -12097,7 +12105,7 @@ Examples: logs_parser = subparsers.add_parser( "logs", help="View and filter Hermes log files", - description="View, tail, and filter agent.log / errors.log / gateway.log", + description="View, tail, and filter agent.log / errors.log / gateway.log / gui.log", formatter_class=argparse.RawDescriptionHelpFormatter, epilog="""\ Examples: @@ -12105,6 +12113,7 @@ Examples: hermes logs -f Follow agent.log in real time hermes logs errors Show last 50 lines of errors.log hermes logs gateway -n 100 Show last 100 lines of gateway.log + hermes logs gui -f Follow gui.log in real time hermes logs --level WARNING Only show WARNING and above hermes logs --session abc123 Filter by session ID hermes logs --component tools Only show tool-related lines @@ -12117,7 +12126,7 @@ Examples: "log_name", nargs="?", default="agent", - help="Log to view: agent (default), errors, gateway, or 'list' to show available files", + help="Log to view: agent (default), errors, gateway, gui, or 'list' to show available files", ) logs_parser.add_argument( "-n", @@ -12150,7 +12159,7 @@ Examples: logs_parser.add_argument( "--component", metavar="NAME", - help="Filter by component: gateway, agent, tools, cli, cron", + help="Filter by component: gateway, agent, tools, cli, cron, gui", ) logs_parser.set_defaults(func=cmd_logs) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index ec86af6ed5..93dc42fbe6 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -253,6 +253,41 @@ async def auth_middleware(request: Request, call_next): return await call_next(request) +@app.middleware("http") +async def api_error_logging_middleware(request: Request, call_next): + """Emit compact diagnostics for API failures and crashes.""" + path = request.url.path + is_api = path.startswith("/api/") + start = time.monotonic() + try: + response = await call_next(request) + except Exception: + if is_api: + _log.exception("api-crash method=%s path=%s", request.method, path) + raise + + if is_api: + elapsed_ms = (time.monotonic() - start) * 1000 + status = int(response.status_code) + if status >= 500: + _log.error( + "api-failure method=%s path=%s status=%d elapsed_ms=%.1f", + request.method, + path, + status, + elapsed_ms, + ) + elif status >= 400 and status not in {401, 403}: + _log.warning( + "api-failure method=%s path=%s status=%d elapsed_ms=%.1f", + request.method, + path, + status, + elapsed_ms, + ) + return response + + # --------------------------------------------------------------------------- # Config schema — auto-generated from DEFAULT_CONFIG # --------------------------------------------------------------------------- @@ -3950,6 +3985,15 @@ def _ws_client_is_allowed(ws: "WebSocket") -> bool: return True return client_host in _LOOPBACK_HOSTS + +def _ws_client_label(ws: "WebSocket") -> str: + """Best-effort peer label for websocket diagnostics.""" + if ws.client is None: + return "unknown" + host = ws.client.host or "unknown" + port = ws.client.port + return f"{host}:{port}" if port is not None else host + # Per-channel subscriber registry used by /api/pub (PTY-side gateway → dashboard) # and /api/events (dashboard → browser sidebar). Keyed by an opaque channel id # the chat tab generates on mount; entries auto-evict when the last subscriber @@ -4045,10 +4089,10 @@ async def _broadcast_event(channel: str, payload: str) -> None: for sub in subs: try: await sub.send_text(payload) - except Exception: + except Exception as exc: # Subscriber went away mid-send; the /api/events finally clause # will remove it from the registry on its next iteration. - pass + _log.debug("event broadcast drop channel=%s error=%s", channel, exc) def _channel_or_close_code(ws: WebSocket) -> Optional[str]: @@ -4060,7 +4104,9 @@ def _channel_or_close_code(ws: WebSocket) -> Optional[str]: @app.websocket("/api/pty") async def pty_ws(ws: WebSocket) -> None: + peer = _ws_client_label(ws) if not _DASHBOARD_EMBEDDED_CHAT_ENABLED: + _log.debug("pty-ws reject peer=%s reason=embedded_chat_disabled", peer) await ws.close(code=4403) return @@ -4068,10 +4114,12 @@ async def pty_ws(ws: WebSocket) -> None: token = ws.query_params.get("token", "") expected = _SESSION_TOKEN if not hmac.compare_digest(token.encode(), expected.encode()): + _log.warning("pty-ws reject peer=%s reason=bad_token", peer) await ws.close(code=4401) return if not _ws_client_is_allowed(ws): + _log.warning("pty-ws reject peer=%s reason=non_loopback_client", peer) await ws.close(code=4403) return @@ -4080,6 +4128,7 @@ async def pty_ws(ws: WebSocket) -> None: # On native Windows, the POSIX PTY bridge can't be imported. Tell the # client and close cleanly rather than pretending the feature works. if not _PTY_BRIDGE_AVAILABLE: + _log.warning("pty-ws unavailable peer=%s reason=pty_bridge_missing", peer) await ws.send_text( "\r\n\x1b[31mChat unavailable: the embedded terminal requires a " "POSIX PTY, which native Windows Python doesn't provide.\x1b[0m\r\n" @@ -4093,43 +4142,68 @@ async def pty_ws(ws: WebSocket) -> None: resume = ws.query_params.get("resume") or None channel = _channel_or_close_code(ws) sidecar_url = _build_sidecar_url(channel) if channel else None + _log.info( + "pty-ws connect peer=%s resume=%s channel=%s", + peer, + bool(resume), + channel or "-", + ) try: argv, cwd, env = _resolve_chat_argv(resume=resume, sidecar_url=sidecar_url) except SystemExit as exc: # _make_tui_argv calls sys.exit(1) when node/npm is missing. + _log.warning("pty-ws argv resolution failed peer=%s error=%s", peer, exc) await ws.send_text(f"\r\n\x1b[31mChat unavailable: {exc}\x1b[0m\r\n") await ws.close(code=1011) return + _log.debug( + "pty-ws spawn peer=%s argv0=%s cwd=%s sidecar=%s", + peer, + argv[0] if argv else "", + cwd, + bool(sidecar_url), + ) try: bridge = PtyBridge.spawn(argv, cwd=cwd, env=env) except PtyUnavailableError as exc: + _log.warning("pty-ws spawn unavailable peer=%s error=%s", peer, exc) await ws.send_text(f"\r\n\x1b[31mChat unavailable: {exc}\x1b[0m\r\n") await ws.close(code=1011) return except (FileNotFoundError, OSError) as exc: + _log.exception("pty-ws spawn failed peer=%s", peer) await ws.send_text(f"\r\n\x1b[31mChat failed to start: {exc}\x1b[0m\r\n") await ws.close(code=1011) return + _log.info("pty-ws spawned peer=%s pid=%s", peer, bridge.pid) loop = asyncio.get_running_loop() + bytes_to_client = 0 + bytes_to_pty = 0 + resize_events = 0 + disconnect_reason = "closed" # --- reader task: PTY master → WebSocket ---------------------------- async def pump_pty_to_ws() -> None: + nonlocal bytes_to_client, disconnect_reason while True: chunk = await loop.run_in_executor( None, bridge.read, _PTY_READ_CHUNK_TIMEOUT ) if chunk is None: # EOF + disconnect_reason = "pty_eof" return if not chunk: # no data this tick; yield control and retry await asyncio.sleep(0) continue + bytes_to_client += len(chunk) try: await ws.send_bytes(chunk) except Exception: + disconnect_reason = "ws_send_failed" return reader_task = asyncio.create_task(pump_pty_to_ws()) @@ -4140,6 +4214,7 @@ async def pty_ws(ws: WebSocket) -> None: msg = await ws.receive() msg_type = msg.get("type") if msg_type == "websocket.disconnect": + disconnect_reason = "ws_disconnect" break raw = msg.get("bytes") if raw is None: @@ -4154,11 +4229,13 @@ async def pty_ws(ws: WebSocket) -> None: cols = int(match.group(1)) rows = int(match.group(2)) bridge.resize(cols=cols, rows=rows) + resize_events += 1 continue bridge.write(raw) + bytes_to_pty += len(raw) except WebSocketDisconnect: - pass + disconnect_reason = "ws_disconnect" finally: reader_task.cancel() try: @@ -4166,6 +4243,15 @@ async def pty_ws(ws: WebSocket) -> None: except (asyncio.CancelledError, Exception): pass bridge.close() + _log.info( + "pty-ws closed peer=%s pid=%s reason=%s bytes_out=%d bytes_in=%d resizes=%d", + peer, + bridge.pid, + disconnect_reason, + bytes_to_client, + bytes_to_pty, + resize_events, + ) # --------------------------------------------------------------------------- @@ -4181,22 +4267,34 @@ async def pty_ws(ws: WebSocket) -> None: @app.websocket("/api/ws") async def gateway_ws(ws: WebSocket) -> None: + peer = _ws_client_label(ws) if not _DASHBOARD_EMBEDDED_CHAT_ENABLED: + _log.debug("gateway-ws reject peer=%s reason=embedded_chat_disabled", peer) await ws.close(code=4403) return token = ws.query_params.get("token", "") if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()): + _log.warning("gateway-ws reject peer=%s reason=bad_token", peer) await ws.close(code=4401) return if not _ws_client_is_allowed(ws): + _log.warning("gateway-ws reject peer=%s reason=non_loopback_client", peer) await ws.close(code=4403) return from tui_gateway.ws import handle_ws - - await handle_ws(ws) + _log.info("gateway-ws connect peer=%s", peer) + try: + await handle_ws(ws) + except WebSocketDisconnect: + _log.info("gateway-ws disconnect peer=%s", peer) + except Exception: + _log.exception("gateway-ws error peer=%s", peer) + raise + else: + _log.info("gateway-ws closed peer=%s", peer) # --------------------------------------------------------------------------- @@ -4213,54 +4311,69 @@ async def gateway_ws(ws: WebSocket) -> None: @app.websocket("/api/pub") async def pub_ws(ws: WebSocket) -> None: + peer = _ws_client_label(ws) if not _DASHBOARD_EMBEDDED_CHAT_ENABLED: + _log.debug("pub-ws reject peer=%s reason=embedded_chat_disabled", peer) await ws.close(code=4403) return token = ws.query_params.get("token", "") if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()): + _log.warning("pub-ws reject peer=%s reason=bad_token", peer) await ws.close(code=4401) return if not _ws_client_is_allowed(ws): + _log.warning("pub-ws reject peer=%s reason=non_loopback_client", peer) await ws.close(code=4403) return channel = _channel_or_close_code(ws) if not channel: + _log.warning("pub-ws reject peer=%s reason=invalid_channel", peer) await ws.close(code=4400) return await ws.accept() + _log.info("pub-ws connect peer=%s channel=%s", peer, channel) + messages = 0 try: while True: - await _broadcast_event(channel, await ws.receive_text()) + payload = await ws.receive_text() + messages += 1 + await _broadcast_event(channel, payload) except WebSocketDisconnect: - pass + _log.info("pub-ws disconnect peer=%s channel=%s messages=%d", peer, channel, messages) @app.websocket("/api/events") async def events_ws(ws: WebSocket) -> None: + peer = _ws_client_label(ws) if not _DASHBOARD_EMBEDDED_CHAT_ENABLED: + _log.debug("events-ws reject peer=%s reason=embedded_chat_disabled", peer) await ws.close(code=4403) return token = ws.query_params.get("token", "") if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()): + _log.warning("events-ws reject peer=%s reason=bad_token", peer) await ws.close(code=4401) return if not _ws_client_is_allowed(ws): + _log.warning("events-ws reject peer=%s reason=non_loopback_client", peer) await ws.close(code=4403) return channel = _channel_or_close_code(ws) if not channel: + _log.warning("events-ws reject peer=%s reason=invalid_channel", peer) await ws.close(code=4400) return await ws.accept() + _log.info("events-ws connect peer=%s channel=%s", peer, channel) async with _event_lock: _event_channels.setdefault(channel, set()).add(ws) @@ -4272,7 +4385,7 @@ async def events_ws(ws: WebSocket) -> None: # browser holds it. await ws.receive_text() except WebSocketDisconnect: - pass + _log.info("events-ws disconnect peer=%s channel=%s", peer, channel) finally: async with _event_lock: subs = _event_channels.get(channel) @@ -5175,6 +5288,14 @@ def start_server( embedded_chat: bool = False, ): """Start the web UI server.""" + try: + from hermes_logging import setup_logging as _setup_logging + + log_dir = _setup_logging(mode="gui") + _log.info("GUI logging enabled: %s", log_dir / "gui.log") + except Exception: + pass + import uvicorn global _DASHBOARD_EMBEDDED_CHAT_ENABLED @@ -5231,5 +5352,12 @@ def start_server( "(headless Linux). Pass --no-open to suppress this detection." ) + _log.info( + "dashboard starting host=%s port=%s embedded_chat=%s open_browser=%s", + host, + port, + embedded_chat, + open_browser, + ) print(f" Hermes Web UI → http://{host}:{port}") uvicorn.run(app, host=host, port=port, log_level="warning") diff --git a/hermes_logging.py b/hermes_logging.py index 8d16e653c7..1dc6be1570 100644 --- a/hermes_logging.py +++ b/hermes_logging.py @@ -8,6 +8,8 @@ Log files produced: agent.log — INFO+, all agent/tool/session activity (the main log) errors.log — WARNING+, errors and warnings only (quick triage) gateway.log — INFO+, gateway-only events (created when mode="gateway") + gui.log — INFO+, dashboard/websocket/TUI-gateway events + (created when mode="gui") All files use ``RotatingFileHandler`` with ``RedactingFormatter`` so secrets are never written to disk. @@ -15,6 +17,8 @@ secrets are never written to disk. Component separation: gateway.log only receives records from ``gateway.*`` loggers — platform adapters, session management, slash commands, delivery. + gui.log receives dashboard-side records from ``hermes_cli.web_server``, + ``hermes_cli.pty_bridge``, ``tui_gateway.*``, and ``uvicorn.*``. agent.log remains the catch-all (everything goes there). Session context: @@ -146,6 +150,12 @@ COMPONENT_PREFIXES = { "tools": ("tools",), "cli": ("hermes_cli", "cli"), "cron": ("cron",), + "gui": ( + "hermes_cli.web_server", + "hermes_cli.pty_bridge", + "tui_gateway", + "uvicorn", + ), } @@ -183,9 +193,11 @@ def setup_logging( Number of rotated backup files to keep. Defaults to 3 or the value from config.yaml ``logging.backup_count``. mode - Caller context: ``"cli"``, ``"gateway"``, ``"cron"``. + Caller context: ``"cli"``, ``"gateway"``, ``"gui"``, ``"cron"``. When ``"gateway"``, an additional ``gateway.log`` file is created that receives only gateway-component records. + When ``"gui"``, an additional ``gui.log`` file is created that + receives dashboard and TUI-gateway component records. force Re-run setup even if it has already been called. @@ -244,6 +256,18 @@ def setup_logging( log_filter=_ComponentFilter(COMPONENT_PREFIXES["gateway"]), ) + # --- gui.log (INFO+, dashboard/tui-gateway components) ----------------- + if mode == "gui": + _add_rotating_handler( + root, + log_dir / "gui.log", + level=logging.INFO, + max_bytes=10 * 1024 * 1024, + backup_count=5, + formatter=RedactingFormatter(_LOG_FORMAT), + log_filter=_ComponentFilter(COMPONENT_PREFIXES["gui"]), + ) + if _logging_initialized and not force: return log_dir diff --git a/tests/hermes_cli/test_logs.py b/tests/hermes_cli/test_logs.py index 0827143fc6..398c7faf50 100644 --- a/tests/hermes_cli/test_logs.py +++ b/tests/hermes_cli/test_logs.py @@ -253,3 +253,4 @@ class TestLogFiles: assert "agent" in LOG_FILES assert "errors" in LOG_FILES assert "gateway" in LOG_FILES + assert "gui" in LOG_FILES diff --git a/tests/test_hermes_logging.py b/tests/test_hermes_logging.py index c4168f79b9..127d4027a7 100644 --- a/tests/test_hermes_logging.py +++ b/tests/test_hermes_logging.py @@ -25,6 +25,8 @@ def _reset_logging_state(): """ hermes_logging._logging_initialized = False root = logging.getLogger() + prev_root_level = root.level + root.setLevel(logging.NOTSET) # Strip ALL RotatingFileHandlers — not just the ones we added — so that # handlers leaked from other test modules in the same xdist worker don't # pollute our counts. @@ -43,6 +45,7 @@ def _reset_logging_state(): if h not in pre_existing: root.removeHandler(h) h.close() + root.setLevel(prev_root_level) hermes_logging._logging_initialized = False hermes_logging.clear_session_context() @@ -355,6 +358,50 @@ class TestGatewayMode: assert "file msg" in content +class TestGuiMode: + """setup_logging(mode='gui') creates a filtered gui.log.""" + + def test_gui_log_created(self, hermes_home): + hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui") + root = logging.getLogger() + + gui_handlers = [ + h for h in root.handlers + if isinstance(h, RotatingFileHandler) + and "gui.log" in getattr(h, "baseFilename", "") + ] + assert len(gui_handlers) == 1 + + def test_gui_log_created_after_cli_init(self, hermes_home): + hermes_logging.setup_logging(hermes_home=hermes_home, mode="cli") + hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui") + + root = logging.getLogger() + gui_handlers = [ + h for h in root.handlers + if isinstance(h, RotatingFileHandler) + and "gui.log" in getattr(h, "baseFilename", "") + ] + assert len(gui_handlers) == 1 + + def test_gui_log_receives_only_gui_components(self, hermes_home): + hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui") + + logging.getLogger("hermes_cli.web_server").info("dashboard online") + logging.getLogger("tui_gateway.ws").info("ws connected") + logging.getLogger("gateway.run").info("gateway event") + + for h in logging.getLogger().handlers: + h.flush() + + gui_log = hermes_home / "logs" / "gui.log" + assert gui_log.exists() + content = gui_log.read_text() + assert "dashboard online" in content + assert "ws connected" in content + assert "gateway event" not in content + + class TestSessionContext: """set_session_context / clear_session_context + _SessionFilter.""" @@ -557,6 +604,11 @@ class TestComponentPrefixes: def test_cron_prefix(self): assert ("cron",) == hermes_logging.COMPONENT_PREFIXES["cron"] + def test_gui_prefix(self): + prefixes = hermes_logging.COMPONENT_PREFIXES["gui"] + assert "hermes_cli.web_server" in prefixes + assert "tui_gateway" in prefixes + class TestSetupVerboseLogging: """setup_verbose_logging() adds a DEBUG-level console handler."""