Files
hermes-webui/api/agent_health.py
T
carryzuo00 2881fcec89 fix(agent_health): detect profile-scoped gateway.pid for correct status
_gateway_root_pid_path() unconditionally returned <hermes_root>/gateway.pid.
Profile-scoped gateways (started with --profile <name> or via active_profile)
write their runtime files under <hermes_root>/profiles/<name>/ instead of the
root, so the root-level path never existed.

build_agent_health_payload() therefore always received a non-existent pid_path,
fell through to the stale root-level gateway_state.json, and returned alive=None.
This caused the cron/scheduled-jobs page to display "Gateway not configured" even
when a gateway was actively running.

Fix: after failing to find a root-level gateway.pid, fall back to the active
profile directory via get_active_hermes_home(). Root-level wins when it exists,
so deployments that do write there are unaffected. Errors from profile lookup are
swallowed and the root path is returned, preserving the previous safe default.

Adds five focused unit tests covering the new fallback, the priority rule, and
the error-handling path.
2026-05-25 10:35:37 +00:00

393 lines
14 KiB
Python

"""Hermes agent/gateway heartbeat payload helpers (#716, #1879).
The WebUI process is not always paired with a long-running Hermes gateway. Some
setups use WebUI only, while self-hosted messaging deployments run a separate
Hermes gateway daemon that records runtime metadata in the Hermes Agent home.
This module turns those existing safe runtime signals into a small UI-facing
heartbeat without shelling out or adding psutil as a hard dependency.
Cross-container note (#1879): ``gateway.status.get_running_pid()`` uses
``fcntl.flock`` and ``os.kill(pid, 0)``, both of which require the caller to
share a PID namespace with the gateway process. In multi-container deployments
where the WebUI runs separately from ``hermes-agent`` and only a Hermes data
volume is shared, those checks always return ``None`` and the dashboard
incorrectly shows "Gateway not running". To stay accurate without forcing a
``pid: "service:hermes-agent"`` compose workaround, we accept a recent
``updated_at`` timestamp on ``gateway_state.json`` (combined with
``gateway_state == "running"``) as an equivalent live-process signal. Older
gateway builds do not refresh that file periodically, so a stale
``gateway_state == "running"`` record is treated as inconclusive rather than a
confirmed outage.
"""
from __future__ import annotations
import importlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Basename of the gateway PID file. It is joined onto the Hermes root (or the
# active profile directory) when resolving which gateway instance to probe.
_GATEWAY_PID_FILE = "gateway.pid"
# Default basename of the gateway runtime-status JSON file. Used as the
# fallback when the loaded gateway status module does not expose its own
# ``_RUNTIME_STATUS_FILE`` attribute.
_GATEWAY_RUNTIME_STATUS_FILE = "gateway_state.json"
# Two cron ticks (~60s each). Chosen to avoid false negatives during brief
# gateway restarts while still surfacing a true outage within a couple of
# minutes. Override is intentionally not exposed: keep the check deterministic
# and identical across deployments so support diagnostics are reproducible.
GATEWAY_FRESHNESS_THRESHOLD_S: float = 120.0
def _checked_at() -> str:
return datetime.now(timezone.utc).isoformat()
def _runtime_status_is_fresh(
    runtime_status: dict[str, Any] | None,
    *,
    now: datetime | None = None,
    threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
) -> bool:
    """Report whether ``gateway_state.json`` was recently written by a running gateway.

    The status counts as fresh only when it self-reports
    ``gateway_state == "running"`` and its ``updated_at`` ISO-8601 timestamp
    lies within ``threshold_s`` seconds of the reference time (in either
    direction). This is the cross-container liveness signal used when
    ``get_running_pid()`` returns ``None`` purely because of PID-namespace
    isolation (#1879).

    Args:
        runtime_status: Parsed runtime status payload, or ``None``.
        now: Reference time; defaults to the current UTC time.
        threshold_s: Maximum accepted distance, in seconds, between the
            reference time and ``updated_at``.

    Returns:
        ``True`` when the status is a fresh ``running`` record. Anything
        missing, malformed, or timezone-naive yields ``False`` so that bad
        input can never report the gateway as alive.
    """
    self_reported_running = (
        isinstance(runtime_status, dict)
        and runtime_status.get("gateway_state") == "running"
    )
    if not self_reported_running:
        return False

    stamp_text = runtime_status.get("updated_at")
    if not (isinstance(stamp_text, str) and stamp_text):
        return False

    # Stdlib parsing is enough: the gateway writes
    # ``datetime.now(timezone.utc).isoformat()``, which ``fromisoformat``
    # round-trips, so no dateutil dependency is pulled in.
    try:
        stamp = datetime.fromisoformat(stamp_text)
    except (TypeError, ValueError):
        return False

    # A naive timestamp is ambiguous across containers/hosts; refuse to guess
    # its zone rather than assume UTC.
    if stamp.tzinfo is None:
        return False

    if now is None:
        now = datetime.now(timezone.utc)
    drift_s = (now - stamp).total_seconds()

    # Clock skew between containers may put the timestamp slightly in the
    # future; that still means "written very recently". A timestamp further
    # than the threshold in either direction is rejected, which also guards
    # against trusting a badly broken clock.
    return abs(drift_s) <= threshold_s
def _runtime_status_is_stale_stopped(
    runtime_status: dict[str, Any] | None,
    *,
    now: datetime | None = None,
    threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
) -> bool:
    """Report whether the status is an old, cleanly stopped gateway record.

    A user may run only profile-scoped gateways while a root
    ``gateway_state.json`` left behind by an older, intentionally stopped
    gateway is still on disk (#1944). Such a stale ``stopped`` record is
    treated like "no root gateway configured" so the heartbeat banner does not
    keep warning about a service nobody is running. A recently written
    ``stopped`` record is not stale and therefore still reports down.

    Args:
        runtime_status: Parsed runtime status payload, or ``None``.
        now: Reference time; defaults to the current UTC time.
        threshold_s: Age in seconds beyond which the record counts as stale.

    Returns:
        ``True`` only when the state is ``"stopped"`` and a valid,
        timezone-aware ``updated_at`` is strictly older than ``threshold_s``.
    """
    if (
        not isinstance(runtime_status, dict)
        or runtime_status.get("gateway_state") != "stopped"
    ):
        return False

    stamp_text = runtime_status.get("updated_at")
    if isinstance(stamp_text, str) and stamp_text:
        try:
            stopped_at = datetime.fromisoformat(stamp_text)
        except (TypeError, ValueError):
            return False
        # Naive timestamps cannot be compared reliably; treat as not stale.
        if stopped_at.tzinfo is not None:
            current = datetime.now(timezone.utc) if now is None else now
            return (current - stopped_at).total_seconds() > threshold_s
    return False
def _runtime_status_is_stale_running(
    runtime_status: dict[str, Any] | None,
    *,
    now: datetime | None = None,
    threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
) -> bool:
    """Report whether the gateway last self-reported ``running`` too long ago.

    WebUI often runs in a separate container from the gateway. In that shape
    PID checks can be impossible, and older gateway versions only update
    ``gateway_state.json`` on lifecycle/platform changes. A stale ``running``
    record therefore means "WebUI does not have enough information", not
    "the gateway is down".

    Args:
        runtime_status: Parsed runtime status payload, or ``None``.
        now: Reference time; defaults to the current UTC time.
        threshold_s: Age in seconds beyond which the record counts as stale.

    Returns:
        ``True`` only when the state is ``"running"`` and a valid,
        timezone-aware ``updated_at`` is strictly older than ``threshold_s``.
    """
    looks_running = (
        isinstance(runtime_status, dict)
        and runtime_status.get("gateway_state") == "running"
    )
    stamp_text = runtime_status.get("updated_at") if looks_running else None
    if not isinstance(stamp_text, str) or stamp_text == "":
        return False

    try:
        last_write = datetime.fromisoformat(stamp_text)
    except (TypeError, ValueError):
        return False

    # Naive timestamps cannot be compared reliably; treat as not stale.
    if last_write.tzinfo is None:
        return False

    reference_time = datetime.now(timezone.utc) if now is None else now
    elapsed_s = (reference_time - last_write).total_seconds()
    return elapsed_s > threshold_s
def _gateway_status_module():
    """Import and return the ``gateway.status`` module on demand.

    The import happens at call time rather than at module import so that
    tests and WebUI-only installs (which may not ship the gateway package)
    stay isolated. Any import failure propagates to the caller.
    """
    module_name = "gateway.status"
    return importlib.import_module(module_name)
def _gateway_root_pid_path() -> Path | None:
    """Resolve which ``gateway.pid`` file should be probed.

    Lookup order:

    1. ``<hermes_root>/gateway.pid`` when that file exists. The root-level
       file always wins, so deployments that write it are unaffected by the
       fallback below.
    2. ``<active profile home>/gateway.pid`` when the root-level file is
       absent and the profile-level one exists. Profile-scoped gateways keep
       their runtime files under the profile directory rather than the root.
    3. The (non-existent) root-level path when neither file exists.

    Failures while resolving the active profile are deliberately swallowed so
    the root path remains the safe default.

    Returns:
        The selected PID path, or ``None`` when the Hermes root itself cannot
        be resolved.
    """
    try:
        from hermes_constants import get_default_hermes_root

        root_candidate = get_default_hermes_root() / _GATEWAY_PID_FILE
        if not root_candidate.exists():
            # Best-effort profile fallback; any error keeps the root default.
            try:
                from api.profiles import get_active_hermes_home

                profile_candidate = Path(get_active_hermes_home()) / _GATEWAY_PID_FILE
                if profile_candidate.exists():
                    return profile_candidate
            except Exception:
                pass
        return root_candidate
    except Exception:
        return None
def _read_runtime_status_path(path: Path) -> dict[str, Any] | None:
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except (OSError, UnicodeDecodeError, json.JSONDecodeError):
return None
if isinstance(payload, dict):
return payload
return None
def _read_gateway_runtime_status(gateway_status: Any, pid_path: Path | None) -> dict[str, Any] | None:
read_runtime_status = gateway_status.read_runtime_status
if pid_path is not None:
try:
return read_runtime_status(pid_path=pid_path)
except TypeError:
try:
return read_runtime_status(pid_path)
except TypeError:
if getattr(gateway_status, "__name__", "") == "gateway.status" or hasattr(
gateway_status,
"_read_json_file",
):
runtime_status_file = str(
getattr(gateway_status, "_RUNTIME_STATUS_FILE", _GATEWAY_RUNTIME_STATUS_FILE)
)
runtime_status = _read_runtime_status_path(pid_path.with_name(runtime_status_file))
if runtime_status is not None:
return runtime_status
return read_runtime_status()
def _gateway_running_pid(gateway_status: Any, pid_path: Path | None) -> int | None:
get_running_pid = gateway_status.get_running_pid
if pid_path is not None:
try:
return get_running_pid(pid_path=pid_path, cleanup_stale=False)
except TypeError:
try:
return get_running_pid(pid_path, cleanup_stale=False)
except TypeError:
pass
try:
return get_running_pid(cleanup_stale=False)
except TypeError:
# Older agent versions may not expose cleanup_stale. Keep compatibility.
return get_running_pid()
def _runtime_detail_subset(runtime_status: dict[str, Any] | None) -> dict[str, Any]:
"""Return only non-sensitive runtime fields for the browser.
gateway.status records argv/PID metadata so the CLI can validate process
identity. The WebUI alert only needs health semantics, never raw command
lines, paths, environment, or tokens.
"""
if not isinstance(runtime_status, dict):
return {}
details: dict[str, Any] = {}
gateway_state = runtime_status.get("gateway_state")
if isinstance(gateway_state, str) and gateway_state:
details["gateway_state"] = gateway_state
updated_at = runtime_status.get("updated_at")
if isinstance(updated_at, str) and updated_at:
details["updated_at"] = updated_at
try:
details["active_agents"] = max(0, int(runtime_status.get("active_agents") or 0))
except (TypeError, ValueError):
pass
platforms = runtime_status.get("platforms")
if isinstance(platforms, dict):
details["platform_count"] = len(platforms)
states: dict[str, int] = {}
for payload in platforms.values():
if not isinstance(payload, dict):
continue
state = payload.get("state")
if isinstance(state, str) and state:
states[state] = states.get(state, 0) + 1
if states:
details["platform_states"] = states
return details
def build_agent_health_payload() -> dict[str, Any]:
    """Return ``{alive, checked_at, details}`` for the Hermes gateway/agent.

    ``alive`` is intentionally tri-state:

    * ``True``: a gateway runtime signal says the process is alive (a running
      PID was found, or the runtime status is a fresh ``running`` record).
    * ``False``: gateway metadata exists, but no live gateway process owns it.
    * ``None``: liveness cannot be determined. Either the gateway status
      module could not be loaded, no gateway metadata is available (this
      WebUI setup is probably not paired with a separate gateway), or the
      only metadata is a stale ``stopped`` / stale ``running`` record.

    ``details`` always carries ``state`` and, except for the PID-confirmed
    case, a ``reason``; browser-safe runtime fields are merged in whenever
    runtime status was read.
    """
    stamp = _checked_at()

    def respond(alive: bool | None, details: dict[str, Any]) -> dict[str, Any]:
        # Single place that fixes the envelope shape of every response.
        return {"alive": alive, "checked_at": stamp, "details": details}

    try:
        gateway_status = _gateway_status_module()
    except Exception as exc:
        return respond(
            None,
            {
                "state": "unknown",
                "reason": "gateway_status_unavailable",
                "error": type(exc).__name__,
            },
        )

    pid_path = _gateway_root_pid_path()

    # Both probes are best-effort: a failure degrades to "no signal" rather
    # than failing the whole health check.
    try:
        status = _read_gateway_runtime_status(gateway_status, pid_path)
    except Exception:
        status = None

    try:
        live_pid = _gateway_running_pid(gateway_status, pid_path)
    except Exception:
        live_pid = None

    safe = _runtime_detail_subset(status)

    if live_pid is not None:
        return respond(True, {"state": "alive", **safe})

    # Cross-container fallback (#1879): when ``get_running_pid()`` cannot see
    # the gateway because we're in a different PID namespace, a recent
    # ``updated_at`` on ``gateway_state.json`` is accepted as an equivalent
    # signal. It is only trusted when the gateway also self-reports
    # ``gateway_state == "running"`` so crash-without-cleanup scenarios still
    # surface as "down".
    if _runtime_status_is_fresh(status):
        return respond(
            True,
            {"state": "alive", "reason": "cross_container_freshness", **safe},
        )

    # Stale records are inconclusive, not outages.
    if _runtime_status_is_stale_stopped(status):
        return respond(
            None,
            {"state": "unknown", "reason": "gateway_stale_stopped_state", **safe},
        )

    if _runtime_status_is_stale_running(status):
        return respond(
            None,
            {"state": "unknown", "reason": "gateway_stale_running_state", **safe},
        )

    # Metadata exists but none of the liveness signals matched.
    if isinstance(status, dict):
        return respond(
            False,
            {"state": "down", "reason": "gateway_not_running", **safe},
        )

    return respond(None, {"state": "unknown", "reason": "gateway_not_configured"})