Add dedicated GUI log stream for dashboard debugging.

Capture dashboard and PTY websocket lifecycle failures in gui.log and expose it via hermes logs.
This commit is contained in:
Brooklyn Nicholson
2026-05-15 15:38:50 -05:00
parent 6640a9d3ab
commit d0c20708ce
6 changed files with 229 additions and 13 deletions
+3 -1
View File
@@ -10,6 +10,7 @@ Usage examples::
hermes logs -f # follow agent.log in real time
hermes logs errors # last 50 lines of errors.log
hermes logs gateway -n 100 # last 100 lines of gateway.log
hermes logs gui -f # follow gui.log (dashboard/pty/ws)
hermes logs --level WARNING # only WARNING+ lines
hermes logs --session abc123 # filter by session ID substring
hermes logs --component tools # only tool-related lines
@@ -31,6 +32,7 @@ LOG_FILES = {
"agent": "agent.log",
"errors": "errors.log",
"gateway": "gateway.log",
"gui": "gui.log",
}
# Log line timestamp regex — matches "2026-04-05 22:35:00,123" or
@@ -150,7 +152,7 @@ def tail_log(
Parameters
----------
log_name
Which log to read: ``"agent"``, ``"errors"``, ``"gateway"``.
Which log to read: ``"agent"``, ``"errors"``, ``"gateway"``, ``"gui"``.
num_lines
Number of recent lines to show (before follow starts).
follow
+12 -3
View File
@@ -9421,6 +9421,14 @@ def cmd_dashboard(args):
remaining = _find_stale_dashboard_pids()
sys.exit(1 if remaining else 0)
# Attach gui.log early so dashboard startup/build failures are captured in
# the same logs directory as every other Hermes surface.
try:
from hermes_logging import setup_logging as _setup_logging_gui
_setup_logging_gui(mode="gui")
except Exception:
pass
try:
import fastapi # noqa: F401
import uvicorn # noqa: F401
@@ -12097,7 +12105,7 @@ Examples:
logs_parser = subparsers.add_parser(
"logs",
help="View and filter Hermes log files",
description="View, tail, and filter agent.log / errors.log / gateway.log",
description="View, tail, and filter agent.log / errors.log / gateway.log / gui.log",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""\
Examples:
@@ -12105,6 +12113,7 @@ Examples:
hermes logs -f Follow agent.log in real time
hermes logs errors Show last 50 lines of errors.log
hermes logs gateway -n 100 Show last 100 lines of gateway.log
hermes logs gui -f Follow gui.log in real time
hermes logs --level WARNING Only show WARNING and above
hermes logs --session abc123 Filter by session ID
hermes logs --component tools Only show tool-related lines
@@ -12117,7 +12126,7 @@ Examples:
"log_name",
nargs="?",
default="agent",
help="Log to view: agent (default), errors, gateway, or 'list' to show available files",
help="Log to view: agent (default), errors, gateway, gui, or 'list' to show available files",
)
logs_parser.add_argument(
"-n",
@@ -12150,7 +12159,7 @@ Examples:
logs_parser.add_argument(
"--component",
metavar="NAME",
help="Filter by component: gateway, agent, tools, cli, cron",
help="Filter by component: gateway, agent, tools, cli, cron, gui",
)
logs_parser.set_defaults(func=cmd_logs)
+136 -8
View File
@@ -253,6 +253,41 @@ async def auth_middleware(request: Request, call_next):
return await call_next(request)
@app.middleware("http")
async def api_error_logging_middleware(request: Request, call_next):
"""Emit compact diagnostics for API failures and crashes."""
path = request.url.path
is_api = path.startswith("/api/")
start = time.monotonic()
try:
response = await call_next(request)
except Exception:
if is_api:
_log.exception("api-crash method=%s path=%s", request.method, path)
raise
if is_api:
elapsed_ms = (time.monotonic() - start) * 1000
status = int(response.status_code)
if status >= 500:
_log.error(
"api-failure method=%s path=%s status=%d elapsed_ms=%.1f",
request.method,
path,
status,
elapsed_ms,
)
elif status >= 400 and status not in {401, 403}:
_log.warning(
"api-failure method=%s path=%s status=%d elapsed_ms=%.1f",
request.method,
path,
status,
elapsed_ms,
)
return response
# ---------------------------------------------------------------------------
# Config schema — auto-generated from DEFAULT_CONFIG
# ---------------------------------------------------------------------------
@@ -3950,6 +3985,15 @@ def _ws_client_is_allowed(ws: "WebSocket") -> bool:
return True
return client_host in _LOOPBACK_HOSTS
def _ws_client_label(ws: "WebSocket") -> str:
"""Best-effort peer label for websocket diagnostics."""
if ws.client is None:
return "unknown"
host = ws.client.host or "unknown"
port = ws.client.port
return f"{host}:{port}" if port is not None else host
# Per-channel subscriber registry used by /api/pub (PTY-side gateway → dashboard)
# and /api/events (dashboard → browser sidebar). Keyed by an opaque channel id
# the chat tab generates on mount; entries auto-evict when the last subscriber
@@ -4045,10 +4089,10 @@ async def _broadcast_event(channel: str, payload: str) -> None:
for sub in subs:
try:
await sub.send_text(payload)
except Exception:
except Exception as exc:
# Subscriber went away mid-send; the /api/events finally clause
# will remove it from the registry on its next iteration.
pass
_log.debug("event broadcast drop channel=%s error=%s", channel, exc)
def _channel_or_close_code(ws: WebSocket) -> Optional[str]:
@@ -4060,7 +4104,9 @@ def _channel_or_close_code(ws: WebSocket) -> Optional[str]:
@app.websocket("/api/pty")
async def pty_ws(ws: WebSocket) -> None:
peer = _ws_client_label(ws)
if not _DASHBOARD_EMBEDDED_CHAT_ENABLED:
_log.debug("pty-ws reject peer=%s reason=embedded_chat_disabled", peer)
await ws.close(code=4403)
return
@@ -4068,10 +4114,12 @@ async def pty_ws(ws: WebSocket) -> None:
token = ws.query_params.get("token", "")
expected = _SESSION_TOKEN
if not hmac.compare_digest(token.encode(), expected.encode()):
_log.warning("pty-ws reject peer=%s reason=bad_token", peer)
await ws.close(code=4401)
return
if not _ws_client_is_allowed(ws):
_log.warning("pty-ws reject peer=%s reason=non_loopback_client", peer)
await ws.close(code=4403)
return
@@ -4080,6 +4128,7 @@ async def pty_ws(ws: WebSocket) -> None:
# On native Windows, the POSIX PTY bridge can't be imported. Tell the
# client and close cleanly rather than pretending the feature works.
if not _PTY_BRIDGE_AVAILABLE:
_log.warning("pty-ws unavailable peer=%s reason=pty_bridge_missing", peer)
await ws.send_text(
"\r\n\x1b[31mChat unavailable: the embedded terminal requires a "
"POSIX PTY, which native Windows Python doesn't provide.\x1b[0m\r\n"
@@ -4093,43 +4142,68 @@ async def pty_ws(ws: WebSocket) -> None:
resume = ws.query_params.get("resume") or None
channel = _channel_or_close_code(ws)
sidecar_url = _build_sidecar_url(channel) if channel else None
_log.info(
"pty-ws connect peer=%s resume=%s channel=%s",
peer,
bool(resume),
channel or "-",
)
try:
argv, cwd, env = _resolve_chat_argv(resume=resume, sidecar_url=sidecar_url)
except SystemExit as exc:
# _make_tui_argv calls sys.exit(1) when node/npm is missing.
_log.warning("pty-ws argv resolution failed peer=%s error=%s", peer, exc)
await ws.send_text(f"\r\n\x1b[31mChat unavailable: {exc}\x1b[0m\r\n")
await ws.close(code=1011)
return
_log.debug(
"pty-ws spawn peer=%s argv0=%s cwd=%s sidecar=%s",
peer,
argv[0] if argv else "",
cwd,
bool(sidecar_url),
)
try:
bridge = PtyBridge.spawn(argv, cwd=cwd, env=env)
except PtyUnavailableError as exc:
_log.warning("pty-ws spawn unavailable peer=%s error=%s", peer, exc)
await ws.send_text(f"\r\n\x1b[31mChat unavailable: {exc}\x1b[0m\r\n")
await ws.close(code=1011)
return
except (FileNotFoundError, OSError) as exc:
_log.exception("pty-ws spawn failed peer=%s", peer)
await ws.send_text(f"\r\n\x1b[31mChat failed to start: {exc}\x1b[0m\r\n")
await ws.close(code=1011)
return
_log.info("pty-ws spawned peer=%s pid=%s", peer, bridge.pid)
loop = asyncio.get_running_loop()
bytes_to_client = 0
bytes_to_pty = 0
resize_events = 0
disconnect_reason = "closed"
# --- reader task: PTY master → WebSocket ----------------------------
async def pump_pty_to_ws() -> None:
nonlocal bytes_to_client, disconnect_reason
while True:
chunk = await loop.run_in_executor(
None, bridge.read, _PTY_READ_CHUNK_TIMEOUT
)
if chunk is None: # EOF
disconnect_reason = "pty_eof"
return
if not chunk: # no data this tick; yield control and retry
await asyncio.sleep(0)
continue
bytes_to_client += len(chunk)
try:
await ws.send_bytes(chunk)
except Exception:
disconnect_reason = "ws_send_failed"
return
reader_task = asyncio.create_task(pump_pty_to_ws())
@@ -4140,6 +4214,7 @@ async def pty_ws(ws: WebSocket) -> None:
msg = await ws.receive()
msg_type = msg.get("type")
if msg_type == "websocket.disconnect":
disconnect_reason = "ws_disconnect"
break
raw = msg.get("bytes")
if raw is None:
@@ -4154,11 +4229,13 @@ async def pty_ws(ws: WebSocket) -> None:
cols = int(match.group(1))
rows = int(match.group(2))
bridge.resize(cols=cols, rows=rows)
resize_events += 1
continue
bridge.write(raw)
bytes_to_pty += len(raw)
except WebSocketDisconnect:
pass
disconnect_reason = "ws_disconnect"
finally:
reader_task.cancel()
try:
@@ -4166,6 +4243,15 @@ async def pty_ws(ws: WebSocket) -> None:
except (asyncio.CancelledError, Exception):
pass
bridge.close()
_log.info(
"pty-ws closed peer=%s pid=%s reason=%s bytes_out=%d bytes_in=%d resizes=%d",
peer,
bridge.pid,
disconnect_reason,
bytes_to_client,
bytes_to_pty,
resize_events,
)
# ---------------------------------------------------------------------------
@@ -4181,22 +4267,34 @@ async def pty_ws(ws: WebSocket) -> None:
@app.websocket("/api/ws")
async def gateway_ws(ws: WebSocket) -> None:
peer = _ws_client_label(ws)
if not _DASHBOARD_EMBEDDED_CHAT_ENABLED:
_log.debug("gateway-ws reject peer=%s reason=embedded_chat_disabled", peer)
await ws.close(code=4403)
return
token = ws.query_params.get("token", "")
if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()):
_log.warning("gateway-ws reject peer=%s reason=bad_token", peer)
await ws.close(code=4401)
return
if not _ws_client_is_allowed(ws):
_log.warning("gateway-ws reject peer=%s reason=non_loopback_client", peer)
await ws.close(code=4403)
return
from tui_gateway.ws import handle_ws
await handle_ws(ws)
_log.info("gateway-ws connect peer=%s", peer)
try:
await handle_ws(ws)
except WebSocketDisconnect:
_log.info("gateway-ws disconnect peer=%s", peer)
except Exception:
_log.exception("gateway-ws error peer=%s", peer)
raise
else:
_log.info("gateway-ws closed peer=%s", peer)
# ---------------------------------------------------------------------------
@@ -4213,54 +4311,69 @@ async def gateway_ws(ws: WebSocket) -> None:
@app.websocket("/api/pub")
async def pub_ws(ws: WebSocket) -> None:
peer = _ws_client_label(ws)
if not _DASHBOARD_EMBEDDED_CHAT_ENABLED:
_log.debug("pub-ws reject peer=%s reason=embedded_chat_disabled", peer)
await ws.close(code=4403)
return
token = ws.query_params.get("token", "")
if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()):
_log.warning("pub-ws reject peer=%s reason=bad_token", peer)
await ws.close(code=4401)
return
if not _ws_client_is_allowed(ws):
_log.warning("pub-ws reject peer=%s reason=non_loopback_client", peer)
await ws.close(code=4403)
return
channel = _channel_or_close_code(ws)
if not channel:
_log.warning("pub-ws reject peer=%s reason=invalid_channel", peer)
await ws.close(code=4400)
return
await ws.accept()
_log.info("pub-ws connect peer=%s channel=%s", peer, channel)
messages = 0
try:
while True:
await _broadcast_event(channel, await ws.receive_text())
payload = await ws.receive_text()
messages += 1
await _broadcast_event(channel, payload)
except WebSocketDisconnect:
pass
_log.info("pub-ws disconnect peer=%s channel=%s messages=%d", peer, channel, messages)
@app.websocket("/api/events")
async def events_ws(ws: WebSocket) -> None:
peer = _ws_client_label(ws)
if not _DASHBOARD_EMBEDDED_CHAT_ENABLED:
_log.debug("events-ws reject peer=%s reason=embedded_chat_disabled", peer)
await ws.close(code=4403)
return
token = ws.query_params.get("token", "")
if not hmac.compare_digest(token.encode(), _SESSION_TOKEN.encode()):
_log.warning("events-ws reject peer=%s reason=bad_token", peer)
await ws.close(code=4401)
return
if not _ws_client_is_allowed(ws):
_log.warning("events-ws reject peer=%s reason=non_loopback_client", peer)
await ws.close(code=4403)
return
channel = _channel_or_close_code(ws)
if not channel:
_log.warning("events-ws reject peer=%s reason=invalid_channel", peer)
await ws.close(code=4400)
return
await ws.accept()
_log.info("events-ws connect peer=%s channel=%s", peer, channel)
async with _event_lock:
_event_channels.setdefault(channel, set()).add(ws)
@@ -4272,7 +4385,7 @@ async def events_ws(ws: WebSocket) -> None:
# browser holds it.
await ws.receive_text()
except WebSocketDisconnect:
pass
_log.info("events-ws disconnect peer=%s channel=%s", peer, channel)
finally:
async with _event_lock:
subs = _event_channels.get(channel)
@@ -5175,6 +5288,14 @@ def start_server(
embedded_chat: bool = False,
):
"""Start the web UI server."""
try:
from hermes_logging import setup_logging as _setup_logging
log_dir = _setup_logging(mode="gui")
_log.info("GUI logging enabled: %s", log_dir / "gui.log")
except Exception:
pass
import uvicorn
global _DASHBOARD_EMBEDDED_CHAT_ENABLED
@@ -5231,5 +5352,12 @@ def start_server(
"(headless Linux). Pass --no-open to suppress this detection."
)
_log.info(
"dashboard starting host=%s port=%s embedded_chat=%s open_browser=%s",
host,
port,
embedded_chat,
open_browser,
)
print(f" Hermes Web UI → http://{host}:{port}")
uvicorn.run(app, host=host, port=port, log_level="warning")
+25 -1
View File
@@ -8,6 +8,8 @@ Log files produced:
agent.log INFO+, all agent/tool/session activity (the main log)
errors.log WARNING+, errors and warnings only (quick triage)
gateway.log INFO+, gateway-only events (created when mode="gateway")
gui.log INFO+, dashboard/websocket/TUI-gateway events
(created when mode="gui")
All files use ``RotatingFileHandler`` with ``RedactingFormatter`` so
secrets are never written to disk.
@@ -15,6 +17,8 @@ secrets are never written to disk.
Component separation:
gateway.log only receives records from ``gateway.*`` loggers
platform adapters, session management, slash commands, delivery.
gui.log receives dashboard-side records from ``hermes_cli.web_server``,
``hermes_cli.pty_bridge``, ``tui_gateway.*``, and ``uvicorn.*``.
agent.log remains the catch-all (everything goes there).
Session context:
@@ -146,6 +150,12 @@ COMPONENT_PREFIXES = {
"tools": ("tools",),
"cli": ("hermes_cli", "cli"),
"cron": ("cron",),
"gui": (
"hermes_cli.web_server",
"hermes_cli.pty_bridge",
"tui_gateway",
"uvicorn",
),
}
@@ -183,9 +193,11 @@ def setup_logging(
Number of rotated backup files to keep.
Defaults to 3 or the value from config.yaml ``logging.backup_count``.
mode
Caller context: ``"cli"``, ``"gateway"``, ``"cron"``.
Caller context: ``"cli"``, ``"gateway"``, ``"gui"``, ``"cron"``.
When ``"gateway"``, an additional ``gateway.log`` file is created
that receives only gateway-component records.
When ``"gui"``, an additional ``gui.log`` file is created that
receives dashboard and TUI-gateway component records.
force
Re-run setup even if it has already been called.
@@ -244,6 +256,18 @@ def setup_logging(
log_filter=_ComponentFilter(COMPONENT_PREFIXES["gateway"]),
)
# --- gui.log (INFO+, dashboard/tui-gateway components) -----------------
if mode == "gui":
_add_rotating_handler(
root,
log_dir / "gui.log",
level=logging.INFO,
max_bytes=10 * 1024 * 1024,
backup_count=5,
formatter=RedactingFormatter(_LOG_FORMAT),
log_filter=_ComponentFilter(COMPONENT_PREFIXES["gui"]),
)
if _logging_initialized and not force:
return log_dir
+1
View File
@@ -253,3 +253,4 @@ class TestLogFiles:
assert "agent" in LOG_FILES
assert "errors" in LOG_FILES
assert "gateway" in LOG_FILES
assert "gui" in LOG_FILES
+52
View File
@@ -25,6 +25,8 @@ def _reset_logging_state():
"""
hermes_logging._logging_initialized = False
root = logging.getLogger()
prev_root_level = root.level
root.setLevel(logging.NOTSET)
# Strip ALL RotatingFileHandlers — not just the ones we added — so that
# handlers leaked from other test modules in the same xdist worker don't
# pollute our counts.
@@ -43,6 +45,7 @@ def _reset_logging_state():
if h not in pre_existing:
root.removeHandler(h)
h.close()
root.setLevel(prev_root_level)
hermes_logging._logging_initialized = False
hermes_logging.clear_session_context()
@@ -355,6 +358,50 @@ class TestGatewayMode:
assert "file msg" in content
class TestGuiMode:
"""setup_logging(mode='gui') creates a filtered gui.log."""
def test_gui_log_created(self, hermes_home):
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui")
root = logging.getLogger()
gui_handlers = [
h for h in root.handlers
if isinstance(h, RotatingFileHandler)
and "gui.log" in getattr(h, "baseFilename", "")
]
assert len(gui_handlers) == 1
def test_gui_log_created_after_cli_init(self, hermes_home):
hermes_logging.setup_logging(hermes_home=hermes_home, mode="cli")
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui")
root = logging.getLogger()
gui_handlers = [
h for h in root.handlers
if isinstance(h, RotatingFileHandler)
and "gui.log" in getattr(h, "baseFilename", "")
]
assert len(gui_handlers) == 1
def test_gui_log_receives_only_gui_components(self, hermes_home):
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gui")
logging.getLogger("hermes_cli.web_server").info("dashboard online")
logging.getLogger("tui_gateway.ws").info("ws connected")
logging.getLogger("gateway.run").info("gateway event")
for h in logging.getLogger().handlers:
h.flush()
gui_log = hermes_home / "logs" / "gui.log"
assert gui_log.exists()
content = gui_log.read_text()
assert "dashboard online" in content
assert "ws connected" in content
assert "gateway event" not in content
class TestSessionContext:
"""set_session_context / clear_session_context + _SessionFilter."""
@@ -557,6 +604,11 @@ class TestComponentPrefixes:
def test_cron_prefix(self):
assert ("cron",) == hermes_logging.COMPONENT_PREFIXES["cron"]
def test_gui_prefix(self):
prefixes = hermes_logging.COMPONENT_PREFIXES["gui"]
assert "hermes_cli.web_server" in prefixes
assert "tui_gateway" in prefixes
class TestSetupVerboseLogging:
"""setup_verbose_logging() adds a DEBUG-level console handler."""