feat: add agent heartbeat alert

This commit is contained in:
Michael Lam
2026-05-04 17:19:21 -07:00
committed by test
parent fcc83284e3
commit 960e45f77f
7 changed files with 382 additions and 0 deletions
+132
View File
@@ -0,0 +1,132 @@
"""Hermes agent/gateway heartbeat payload helpers (#716).
The WebUI process is not always paired with a long-running Hermes gateway. Some
setups use WebUI only, while self-hosted messaging deployments run a separate
Hermes gateway daemon that records runtime metadata in the Hermes Agent home.
This module turns those existing safe runtime signals into a small UI-facing
heartbeat without shelling out or adding psutil as a hard dependency.
"""
from __future__ import annotations
import importlib
from datetime import datetime, timezone
from typing import Any
def _checked_at() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
def _gateway_status_module():
    """Import and return the ``gateway.status`` module on demand.

    The import happens at call time rather than module import time so that
    tests and WebUI-only installs stay isolated from the gateway package.
    Whatever ``importlib.import_module`` raises propagates to the caller.
    """
    status_module = importlib.import_module("gateway.status")
    return status_module
def _runtime_detail_subset(runtime_status: dict[str, Any] | None) -> dict[str, Any]:
    """Return only non-sensitive runtime fields for the browser.

    gateway.status records argv/PID metadata so the CLI can validate process
    identity. The WebUI alert only needs health semantics, never raw command
    lines, paths, environment, or tokens, so this function copies an explicit
    allow-list of fields and ignores everything else.

    Args:
        runtime_status: The runtime status mapping, or ``None`` / any non-dict
            value when no status is available.

    Returns:
        A new dict containing, when present and well-formed:
        ``gateway_state`` and ``updated_at`` (non-empty strings),
        ``active_agents`` (non-negative int; 0 when missing or falsy),
        ``platform_count`` (number of platform entries) and
        ``platform_states`` (count of platforms per state string).
        An empty dict is returned when ``runtime_status`` is not a dict.
    """
    if not isinstance(runtime_status, dict):
        return {}

    details: dict[str, Any] = {}

    # Plain string fields are copied only when they are non-empty strings.
    for key in ("gateway_state", "updated_at"):
        value = runtime_status.get(key)
        if isinstance(value, str) and value:
            details[key] = value

    try:
        details["active_agents"] = max(0, int(runtime_status.get("active_agents") or 0))
    except (TypeError, ValueError, OverflowError):
        # Malformed counts are dropped rather than failing the whole payload.
        # OverflowError covers int(float("inf")), which json.loads can produce
        # from an ``Infinity`` literal in the status file.
        pass

    platforms = runtime_status.get("platforms")
    if isinstance(platforms, dict):
        details["platform_count"] = len(platforms)
        # Only the per-state tallies are exposed, never the platform payloads.
        states: dict[str, int] = {}
        for payload in platforms.values():
            if not isinstance(payload, dict):
                continue
            state = payload.get("state")
            if isinstance(state, str) and state:
                states[state] = states.get(state, 0) + 1
        if states:
            details["platform_states"] = states

    return details
def build_agent_health_payload() -> dict[str, Any]:
    """Return `{alive, checked_at, details}` for the Hermes gateway/agent.

    `alive` is intentionally tri-state:

    * True: a gateway runtime signal says the process is alive.
    * False: gateway metadata exists, but no live gateway process owns it.
    * None: no gateway metadata/status is available, so this WebUI setup is
      probably not configured with a separate gateway process.

    The function is best-effort: any failure while loading or querying
    ``gateway.status`` degrades to an "unknown"/"down" payload instead of
    raising, so the health endpoint itself never fails because of the gateway.

    Returns:
        A dict with ``alive`` (bool or None), ``checked_at`` (UTC ISO-8601
        string) and ``details`` (a ``state`` string, an optional ``reason``,
        and the browser-safe runtime fields).
    """
    checked_at = _checked_at()

    try:
        gateway_status = _gateway_status_module()
    except Exception as exc:
        # Only the exception class name is exposed; the message could contain
        # filesystem paths.
        return {
            "alive": None,
            "checked_at": checked_at,
            "details": {
                "state": "unknown",
                "reason": "gateway_status_unavailable",
                "error": type(exc).__name__,
            },
        }

    try:
        runtime_status = gateway_status.read_runtime_status()
    except Exception:
        runtime_status = None

    try:
        running_pid = gateway_status.get_running_pid(cleanup_stale=False)
    except TypeError:
        # Older agent versions may not expose cleanup_stale. Keep compatibility.
        # The retry runs inside an except handler, where the sibling
        # ``except Exception`` below does not apply, so it needs its own guard.
        # NOTE(review): a TypeError raised *inside* a newer get_running_pid
        # also lands here and triggers the no-argument retry — confirm that is
        # acceptable.
        try:
            running_pid = gateway_status.get_running_pid()
        except Exception:
            running_pid = None
    except Exception:
        running_pid = None

    safe_details = _runtime_detail_subset(runtime_status)

    if running_pid is not None:
        return {
            "alive": True,
            "checked_at": checked_at,
            "details": {
                "state": "alive",
                **safe_details,
            },
        }

    if isinstance(runtime_status, dict):
        # Metadata exists but no process owns it: the gateway is down.
        return {
            "alive": False,
            "checked_at": checked_at,
            "details": {
                "state": "down",
                "reason": "gateway_not_running",
                **safe_details,
            },
        }

    return {
        "alive": None,
        "checked_at": checked_at,
        "details": {
            "state": "unknown",
            "reason": "gateway_not_configured",
        },
    }
+4
View File
@@ -479,6 +479,7 @@ from api.helpers import (
redact_session_data,
_redact_text,
)
from api.agent_health import build_agent_health_payload
def _clear_stale_stream_state(session) -> bool:
@@ -2487,6 +2488,9 @@ def handle_get(handler, parsed) -> bool:
if parsed.path == "/health":
return _handle_health(handler, parsed)
if parsed.path == "/api/health/agent":
return j(handler, build_agent_health_payload())
if parsed.path == "/api/models":
return j(handler, get_available_models())
Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

+7
View File
@@ -342,6 +342,13 @@
<button class="reconnect-btn" onclick="refreshSession()"><svg width="13" height="13" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true" style="vertical-align:-1px"><polyline points="23 4 23 10 17 10"/><polyline points="1 20 1 14 7 14"/><path d="M3.51 9a9 9 0 0 1 14.85-3.36L23 10M1 14l4.64 4.36A9 9 0 0 0 20.49 15"/></svg> Reload</button>
</div>
</div>
<!-- Agent/gateway heartbeat alert: starts hidden; the Dismiss button calls dismissAgentHealthAlert(). -->
<div class="agent-health-banner" id="agentHealthBanner" role="alert" aria-live="assertive" hidden>
  <div class="agent-health-copy">
    <strong id="agentHealthTitle">Hermes agent is not responding</strong>
    <span id="agentHealthDetails">The gateway heartbeat failed. Messages may not be delivered until it comes back.</span>
  </div>
  <button class="agent-health-dismiss" id="agentHealthDismiss" type="button" onclick="dismissAgentHealthAlert()" aria-label="Dismiss Hermes agent heartbeat alert">Dismiss</button>
</div>
<div class="composer-wrap" id="composerWrap">
<div class="composer-flyout">
<!-- Queue flyout: slides up from behind composer, same pattern as approval-card -->
+7
View File
@@ -519,6 +519,13 @@
.reconnect-banner.visible{display:flex;}
.reconnect-btn{padding:6px 12px;border-radius:8px;font-size:12px;font-weight:600;background:var(--accent-bg-strong);border:1px solid var(--accent-bg-strong);color:var(--accent-text);cursor:pointer;}
.reconnect-btn:hover{background:var(--accent-bg-strong);}
/* ── Agent health banner (display:none until the .visible class is added) ── */
.agent-health-banner{position:sticky;bottom:0;z-index:4;display:none;align-items:center;justify-content:space-between;gap:12px;margin:10px auto 0;max-width:var(--msg-max);width:calc(100% - 40px);padding:12px 16px;border:1px solid color-mix(in srgb,var(--error) 55%,var(--surface));border-radius:12px;background:color-mix(in srgb,var(--error) 14%,var(--surface));color:var(--text);box-shadow:0 10px 32px rgba(0,0,0,.16);}
.agent-health-banner.visible{display:flex;}
.agent-health-copy{display:flex;flex-direction:column;gap:3px;min-width:0;font-size:13px;line-height:1.35;}
.agent-health-copy strong{color:var(--error);font-size:13px;}
.agent-health-copy span{color:var(--muted);}
.agent-health-dismiss{flex-shrink:0;padding:6px 12px;border-radius:8px;border:1px solid color-mix(in srgb,var(--error) 45%,var(--surface));background:color-mix(in srgb,var(--error) 10%,var(--surface));color:var(--error);font-size:12px;font-weight:600;cursor:pointer;}
.agent-health-dismiss:hover{background:color-mix(in srgb,var(--error) 18%,var(--surface));}
/* ── Update banner ── */
.update-banner{display:none;background:var(--surface);border:1px solid var(--accent);border-radius:10px;padding:10px 16px;margin:10px auto;max-width:780px;font-size:13px;color:var(--accent-text);align-items:center;justify-content:space-between;gap:12px;}
.update-banner.visible{display:flex;}
+76
View File
@@ -3021,6 +3021,82 @@ function dismissReconnect() {
$('reconnectBanner').classList.remove('visible');
clearInflight();
}
// ── Hermes agent/gateway heartbeat alert (#716) ──
// Delay between health polls, in milliseconds (30 s).
const AGENT_HEALTH_INTERVAL_MS=30000;
// localStorage key holding '1' while the user has dismissed the alert.
const AGENT_HEALTH_DISMISSED_KEY='agent-health-dismissed';
// setInterval handle while polling is active; null when polling is stopped.
let _agentHealthTimer=null;
// Last state recorded by pollAgentHealth: 'alive', 'down' or 'unknown'.
let _agentHealthLastState='unknown';
// Report whether the user has dismissed the agent health alert.
// Returns false when localStorage cannot be read.
function _agentHealthDismissed(){
  let stored;
  try{
    stored=localStorage.getItem(AGENT_HEALTH_DISMISSED_KEY);
  }catch(_){
    return false;
  }
  return stored==='1';
}
// Persist or clear the "dismissed" flag in localStorage.
// A truthy value stores '1' under the key; a falsy value removes the key.
function _setAgentHealthDismissed(value){
  try{
    if(value)localStorage.setItem(AGENT_HEALTH_DISMISSED_KEY,'1');
    else localStorage.removeItem(AGENT_HEALTH_DISMISSED_KEY);
  }catch(_){ } // storage errors are swallowed: persistence is best effort
}
// Hide the agent health banner: drop the `visible` class and set `hidden`.
// Does nothing when the banner element is not found.
function _hideAgentHealthAlert(){
  const el=$('agentHealthBanner');
  if(!el) return;
  el.classList.remove('visible');
  el.hidden=true;
}
// Reveal the "agent not responding" banner and fill in its text.
// `payload` is the health response; only payload.details.gateway_state is read.
// Does nothing while the dismissed flag is set or when the banner is missing.
function _showAgentHealthAlert(payload){
  // NOTE(review): this returns without hiding, so a banner already visible in
  // this tab stays up if the flag was set elsewhere — confirm that is intended.
  if(_agentHealthDismissed()) return;
  const banner=$('agentHealthBanner');
  const title=$('agentHealthTitle');
  const details=$('agentHealthDetails');
  if(!banner) return;
  if(title) title.textContent='Hermes agent is not responding';
  // Append the reported gateway state only when the payload carries one.
  const state=payload&&payload.details&&payload.details.gateway_state?` State: ${payload.details.gateway_state}.`:'';
  // Assigned through textContent, so the server-provided state is shown as plain text.
  if(details) details.textContent=`Gateway heartbeat failed.${state} Messages may not be delivered until it comes back.`;
  banner.hidden=false;
  banner.classList.add('visible');
}
// Click handler for the banner's Dismiss button: record the dismissal and
// hide the banner. pollAgentHealth clears the flag again once `alive` is true.
function dismissAgentHealthAlert(){
  _setAgentHealthDismissed(true);
  _hideAgentHealthAlert();
}
// Fetch /api/health/agent once and update the banner from the tri-state
// `alive` field of the response:
//   true           → clear the dismissed flag and hide the banner
//   false          → show the banner (unless it has been dismissed)
//   null/undefined → hide the banner
// Skipped entirely while the tab is not visible. Any error thrown while
// fetching or reading the payload is treated as "unknown" and hides the banner.
async function pollAgentHealth(){
  if(document.visibilityState !== 'visible') return;
  try{
    const payload=await api('/api/health/agent');
    if(payload.alive === true){
      _agentHealthLastState='alive';
      // Recovery re-arms the alert so the next outage is shown again.
      _setAgentHealthDismissed(false);
      _hideAgentHealthAlert();
      return;
    }
    if(payload.alive === false){
      _agentHealthLastState='down';
      _showAgentHealthAlert(payload);
      return;
    }
    // Loose equality matches both null and undefined.
    if(payload.alive == null){
      _agentHealthLastState='unknown';
      _hideAgentHealthAlert();
    }
  }catch(_){
    _agentHealthLastState='unknown';
    _hideAgentHealthAlert();
  }
}
// Begin polling the agent health endpoint: one immediate poll, then one every
// AGENT_HEALTH_INTERVAL_MS. No-op when the tab is not visible or a timer is
// already running.
function startAgentHealthMonitor(){
  const canStart=document.visibilityState === 'visible' && !_agentHealthTimer;
  if(!canStart) return;
  void pollAgentHealth();
  _agentHealthTimer=setInterval(pollAgentHealth, AGENT_HEALTH_INTERVAL_MS);
}
// Stop the polling interval, if one is running, and forget its handle.
function stopAgentHealthMonitor(){
  if(!_agentHealthTimer) return;
  clearInterval(_agentHealthTimer);
  _agentHealthTimer=null;
}
// Keep polling in step with tab visibility: run while visible, stop otherwise.
function _syncAgentHealthMonitorVisibility(){
  if(document.visibilityState !== 'visible'){
    stopAgentHealthMonitor();
    return;
  }
  startAgentHealthMonitor();
}
// Start or stop polling whenever the tab's visibility changes.
document.addEventListener('visibilitychange',_syncAgentHealthMonitorVisibility);
// Start once the DOM is ready, or immediately if the document has already finished loading.
if(document.readyState==='loading') document.addEventListener('DOMContentLoaded',startAgentHealthMonitor);
else startAgentHealthMonitor();
async function refreshSession() {
// When the banner is in post-update restart mode, the "Reload" button
// should do a full page reload — a session refresh would just 502 while
+156
View File
@@ -0,0 +1,156 @@
"""Regression coverage for #716 Hermes agent/gateway heartbeat monitor."""
from __future__ import annotations
import pathlib
# Repository root: two directory levels above this test file.
REPO_ROOT = pathlib.Path(__file__).parent.parent
_STATIC_DIR = REPO_ROOT / "static"

# Source text of the files the static assertions below inspect.
UI_JS = _STATIC_DIR.joinpath("ui.js").read_text(encoding="utf-8")
INDEX_HTML = _STATIC_DIR.joinpath("index.html").read_text(encoding="utf-8")
STYLE_CSS = _STATIC_DIR.joinpath("style.css").read_text(encoding="utf-8")
ROUTES_PY = REPO_ROOT.joinpath("api", "routes.py").read_text(encoding="utf-8")
class _FakeGatewayStatus:
    """Stand-in for the ``gateway.status`` module that returns canned values."""

    def __init__(self, runtime_status, running_pid):
        self._canned_status = runtime_status
        self._canned_pid = running_pid

    def read_runtime_status(self):
        """Return the runtime status given at construction."""
        return self._canned_status

    def get_running_pid(self, cleanup_stale=False):
        """Return the canned PID; fail if the caller requests stale cleanup."""
        assert cleanup_stale is False
        return self._canned_pid
def _runtime_status(**overrides):
    """Build a sample gateway runtime status, with ``overrides`` applied on top.

    The sample mixes fields the health payload may expose with raw process
    fields that must never reach the browser.
    """
    defaults = {
        "gateway_state": "running",
        "updated_at": "2026-05-04T12:00:00+00:00",
        "active_agents": 2,
        "platforms": {
            "discord": {"state": "connected"},
            "telegram": {"state": "starting"},
        },
        # Sensitive/raw process fields that must never reach the browser.
        "pid": 12345,
        "argv": ["hermes", "gateway", "--token", "secret-token"],
        "command": "hermes gateway --token secret-token",
        "executable": "/home/user/.hermes/hermes-agent/venv/bin/python",
        "env": {"API_KEY": "secret"},
    }
    return {**defaults, **overrides}
def test_agent_health_payload_alive_uses_safe_runtime_details(monkeypatch):
    """A running gateway yields alive=True with only allow-listed details."""
    from api import agent_health

    fake_gateway = _FakeGatewayStatus(_runtime_status(), running_pid=12345)
    monkeypatch.setattr(agent_health, "_gateway_status_module", lambda: fake_gateway)

    payload = agent_health.build_agent_health_payload()

    assert payload["alive"] is True
    assert payload["checked_at"]
    expected_details = {
        "state": "alive",
        "gateway_state": "running",
        "updated_at": "2026-05-04T12:00:00+00:00",
        "active_agents": 2,
        "platform_count": 2,
        "platform_states": {"connected": 1, "starting": 1},
    }
    assert payload["details"] == expected_details

    # None of the raw process fields may appear anywhere in the payload.
    rendered = repr(payload)
    for leaked in ("secret-token", "API_KEY", "argv", "command", "executable"):
        assert leaked not in rendered
    assert "pid" not in payload["details"]
def test_agent_health_payload_down_when_gateway_metadata_exists_but_no_process(monkeypatch):
    """Runtime metadata without a running PID is reported as down."""
    from api import agent_health

    fake_gateway = _FakeGatewayStatus(_runtime_status(gateway_state="stale"), running_pid=None)
    monkeypatch.setattr(agent_health, "_gateway_status_module", lambda: fake_gateway)

    payload = agent_health.build_agent_health_payload()
    details = payload["details"]

    assert payload["alive"] is False
    assert details["state"] == "down"
    assert details["reason"] == "gateway_not_running"
    assert details["gateway_state"] == "stale"
def test_agent_health_payload_unknown_when_gateway_is_not_configured(monkeypatch):
    """No runtime metadata and no PID means the gateway state is unknown."""
    from api import agent_health

    fake_gateway = _FakeGatewayStatus(runtime_status=None, running_pid=None)
    monkeypatch.setattr(agent_health, "_gateway_status_module", lambda: fake_gateway)

    payload = agent_health.build_agent_health_payload()

    assert payload["alive"] is None
    expected_details = {"state": "unknown", "reason": "gateway_not_configured"}
    assert payload["details"] == expected_details
def test_agent_health_route_is_registered_with_tri_state_payload_shape():
    """The route is wired in routes.py and the payload keys exist in the module."""
    for route_snippet in ('parsed.path == "/api/health/agent"', "build_agent_health_payload()"):
        assert route_snippet in ROUTES_PY

    module_source = (REPO_ROOT / "api" / "agent_health.py").read_text(encoding="utf-8")
    for payload_key in ('"alive"', '"checked_at"', '"details"'):
        assert payload_key in module_source
def test_agent_health_banner_markup_and_styles_exist():
    """The banner markup and its CSS rules are present in the static assets."""
    required_markup = (
        'id="agentHealthBanner"',
        'role="alert"',
        'aria-live="assertive"',
        'onclick="dismissAgentHealthAlert()"',
    )
    for attribute in required_markup:
        assert attribute in INDEX_HTML

    required_selectors = (
        ".agent-health-banner",
        ".agent-health-banner.visible",
        ".agent-health-dismiss",
    )
    for selector in required_selectors:
        assert selector in STYLE_CSS
def test_agent_health_frontend_polls_only_visible_and_distinguishes_states():
    """ui.js polls the endpoint only while visible and branches on all three states."""
    required_snippets = (
        "const AGENT_HEALTH_INTERVAL_MS=30000",
        "api('/api/health/agent')",
        "document.visibilityState !== 'visible'",
        "document.addEventListener('visibilitychange',_syncAgentHealthMonitorVisibility)",
        "if(payload.alive === true)",
        "if(payload.alive === false)",
        "if(payload.alive == null)",
        "_showAgentHealthAlert(payload)",
        "_hideAgentHealthAlert()",
    )
    for snippet in required_snippets:
        assert snippet in UI_JS
def test_agent_health_dismiss_persists_until_recovery():
    """ui.js stores the dismissal in localStorage and clears it again."""
    required_snippets = (
        "const AGENT_HEALTH_DISMISSED_KEY='agent-health-dismissed'",
        "localStorage.setItem(AGENT_HEALTH_DISMISSED_KEY,'1')",
        "localStorage.removeItem(AGENT_HEALTH_DISMISSED_KEY)",
        "function dismissAgentHealthAlert()",
        "if(_agentHealthDismissed()) return;",
        "_setAgentHealthDismissed(false)",
    )
    for snippet in required_snippets:
        assert snippet in UI_JS
def test_agent_health_backend_does_not_use_shell_or_expose_raw_process_fields():
    """agent_health.py imports no process tooling and never copies raw process fields."""
    module_source = (REPO_ROOT / "api" / "agent_health.py").read_text(encoding="utf-8")

    for banned_import in ("import subprocess", "import psutil"):
        assert banned_import not in module_source

    # Check both quoting styles of a direct ``details[...]`` assignment.
    for private_field in ("argv", "command", "executable", "env"):
        for quote in ('"', "'"):
            assert f"details[{quote}{private_field}{quote}]" not in module_source