mirror of
https://github.com/nesquena/hermes-webui.git
synced 2026-05-28 12:40:26 +00:00
2881fcec89
_gateway_root_pid_path() unconditionally returned <hermes_root>/gateway.pid. Profile-scoped gateways (started with --profile <name> or via active_profile) write their runtime files under <hermes_root>/profiles/<name>/ instead of the root, so the root-level path never existed. build_agent_health_payload() therefore always received a non-existent pid_path, fell through to the stale root-level gateway_state.json, and returned alive=None. This caused the cron/scheduled-jobs page to display "Gateway not configured" even when a gateway was actively running. Fix: after failing to find a root-level gateway.pid, fall back to the active profile directory via get_active_hermes_home(). Root-level wins when it exists, so deployments that do write there are unaffected. Errors from profile lookup are swallowed and the root path is returned, preserving the previous safe default. Adds five focused unit tests covering the new fallback, the priority rule, and the error-handling path.
393 lines
14 KiB
Python
393 lines
14 KiB
Python
"""Hermes agent/gateway heartbeat payload helpers (#716, #1879).
|
|
|
|
The WebUI process is not always paired with a long-running Hermes gateway. Some
|
|
setups use WebUI only, while self-hosted messaging deployments run a separate
|
|
Hermes gateway daemon that records runtime metadata in the Hermes Agent home.
|
|
This module turns those existing safe runtime signals into a small UI-facing
|
|
heartbeat without shelling out or adding psutil as a hard dependency.
|
|
|
|
Cross-container note (#1879): ``gateway.status.get_running_pid()`` uses
|
|
``fcntl.flock`` and ``os.kill(pid, 0)``, both of which require the caller to
|
|
share a PID namespace with the gateway process. In multi-container deployments
|
|
where the WebUI runs separately from ``hermes-agent`` and only a Hermes data
|
|
volume is shared, those checks always return ``None`` and the dashboard
|
|
incorrectly shows "Gateway not running". To stay accurate without forcing a
|
|
``pid: "service:hermes-agent"`` compose workaround, we accept a recent
|
|
``updated_at`` timestamp on ``gateway_state.json`` (combined with
|
|
``gateway_state == "running"``) as an equivalent live-process signal. Older
|
|
gateway builds do not refresh that file periodically, so a stale
|
|
``gateway_state == "running"`` record is treated as inconclusive rather than a
|
|
confirmed outage.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
_GATEWAY_PID_FILE = "gateway.pid"
|
|
_GATEWAY_RUNTIME_STATUS_FILE = "gateway_state.json"
|
|
|
|
|
|
# Two cron ticks (~60s each). Chosen to avoid false negatives during brief
|
|
# gateway restarts while still surfacing a true outage within a couple of
|
|
# minutes. Override is intentionally not exposed: keep the check deterministic
|
|
# and identical across deployments so support diagnostics are reproducible.
|
|
GATEWAY_FRESHNESS_THRESHOLD_S: float = 120.0
|
|
|
|
|
|
def _checked_at() -> str:
|
|
return datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
def _runtime_status_is_fresh(
|
|
runtime_status: dict[str, Any] | None,
|
|
*,
|
|
now: datetime | None = None,
|
|
threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
|
|
) -> bool:
|
|
"""Return ``True`` when ``gateway_state.json`` looks freshly written.
|
|
|
|
"Fresh" means the gateway self-reported ``running`` and the ``updated_at``
|
|
ISO-8601 timestamp is no older than ``threshold_s`` seconds. This is the
|
|
cross-container liveness signal used when ``get_running_pid()`` returns
|
|
``None`` purely because of PID-namespace isolation (#1879).
|
|
|
|
Any unparseable input is treated as "not fresh" — a stale or missing
|
|
timestamp must never report alive.
|
|
"""
|
|
if not isinstance(runtime_status, dict):
|
|
return False
|
|
if runtime_status.get("gateway_state") != "running":
|
|
return False
|
|
|
|
raw_updated_at = runtime_status.get("updated_at")
|
|
if not isinstance(raw_updated_at, str) or not raw_updated_at:
|
|
return False
|
|
|
|
# ``datetime.fromisoformat`` accepts the exact format gateway/status.py
|
|
# writes (``datetime.now(timezone.utc).isoformat()``). We deliberately
|
|
# don't pull in dateutil — keeping this stdlib-only matches the rest of
|
|
# this module.
|
|
try:
|
|
updated_at = datetime.fromisoformat(raw_updated_at)
|
|
except (TypeError, ValueError):
|
|
return False
|
|
|
|
if updated_at.tzinfo is None:
|
|
# A naive timestamp could mean anything across containers / hosts.
|
|
# Refuse to interpret it rather than assume UTC.
|
|
return False
|
|
|
|
reference = now if now is not None else datetime.now(timezone.utc)
|
|
age_s = (reference - updated_at).total_seconds()
|
|
if age_s < 0:
|
|
# Clock skew between containers can produce small negatives. A future
|
|
# timestamp is still a "fresh" signal — the gateway clearly wrote it
|
|
# very recently — so accept it. A wildly-future timestamp (> threshold
|
|
# in the future) is rejected to avoid trusting a broken clock.
|
|
return -age_s <= threshold_s
|
|
return age_s <= threshold_s
|
|
|
|
|
|
def _runtime_status_is_stale_stopped(
|
|
runtime_status: dict[str, Any] | None,
|
|
*,
|
|
now: datetime | None = None,
|
|
threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
|
|
) -> bool:
|
|
"""Return ``True`` for an old clean-stop root gateway state.
|
|
|
|
A user may run only profile-scoped gateways while a root
|
|
``gateway_state.json`` from an older, intentionally stopped gateway remains
|
|
on disk (#1944). Treat that stale stopped file like "no root gateway
|
|
configured" so the heartbeat banner does not keep warning about a service
|
|
the user is not running. Fresh stopped state still reports down.
|
|
"""
|
|
if not isinstance(runtime_status, dict):
|
|
return False
|
|
if runtime_status.get("gateway_state") != "stopped":
|
|
return False
|
|
|
|
raw_updated_at = runtime_status.get("updated_at")
|
|
if not isinstance(raw_updated_at, str) or not raw_updated_at:
|
|
return False
|
|
|
|
try:
|
|
updated_at = datetime.fromisoformat(raw_updated_at)
|
|
except (TypeError, ValueError):
|
|
return False
|
|
if updated_at.tzinfo is None:
|
|
return False
|
|
|
|
reference = now if now is not None else datetime.now(timezone.utc)
|
|
age_s = (reference - updated_at).total_seconds()
|
|
return age_s > threshold_s
|
|
|
|
|
|
def _runtime_status_is_stale_running(
|
|
runtime_status: dict[str, Any] | None,
|
|
*,
|
|
now: datetime | None = None,
|
|
threshold_s: float = GATEWAY_FRESHNESS_THRESHOLD_S,
|
|
) -> bool:
|
|
"""Return ``True`` when the gateway last self-reported running, but stale.
|
|
|
|
WebUI often runs in a separate container from the gateway. In that shape PID
|
|
checks can be impossible, and older gateway versions only update
|
|
``gateway_state.json`` on lifecycle/platform changes. A stale ``running``
|
|
file therefore means "not enough information from WebUI" rather than
|
|
"gateway is down".
|
|
"""
|
|
if not isinstance(runtime_status, dict):
|
|
return False
|
|
if runtime_status.get("gateway_state") != "running":
|
|
return False
|
|
|
|
raw_updated_at = runtime_status.get("updated_at")
|
|
if not isinstance(raw_updated_at, str) or not raw_updated_at:
|
|
return False
|
|
|
|
try:
|
|
updated_at = datetime.fromisoformat(raw_updated_at)
|
|
except (TypeError, ValueError):
|
|
return False
|
|
if updated_at.tzinfo is None:
|
|
return False
|
|
|
|
reference = now if now is not None else datetime.now(timezone.utc)
|
|
age_s = (reference - updated_at).total_seconds()
|
|
return age_s > threshold_s
|
|
|
|
|
|
def _gateway_status_module():
|
|
"""Load gateway.status lazily so tests and WebUI-only installs stay isolated."""
|
|
return importlib.import_module("gateway.status")
|
|
|
|
|
|
def _gateway_root_pid_path() -> Path | None:
|
|
"""Return the root Hermes gateway PID path.
|
|
|
|
Gateway runtime files are root-level singletons. A profile-scoped WebUI
|
|
process may have HERMES_HOME=<root>/profiles/<name>, but gateway.pid,
|
|
gateway.lock, and gateway_state.json still live under <root>.
|
|
|
|
When the root-level gateway.pid is absent (profile-scoped gateway
|
|
deployments write it under <root>/profiles/<name>/), fall back to the
|
|
active profile's directory so the gateway is detected correctly.
|
|
"""
|
|
try:
|
|
from hermes_constants import get_default_hermes_root
|
|
root_pid = get_default_hermes_root() / _GATEWAY_PID_FILE
|
|
if root_pid.exists():
|
|
return root_pid
|
|
try:
|
|
from api.profiles import get_active_hermes_home
|
|
profile_pid = Path(get_active_hermes_home()) / _GATEWAY_PID_FILE
|
|
if profile_pid.exists():
|
|
return profile_pid
|
|
except Exception:
|
|
pass
|
|
return root_pid
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _read_runtime_status_path(path: Path) -> dict[str, Any] | None:
|
|
try:
|
|
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
except (OSError, UnicodeDecodeError, json.JSONDecodeError):
|
|
return None
|
|
if isinstance(payload, dict):
|
|
return payload
|
|
return None
|
|
|
|
|
|
def _read_gateway_runtime_status(gateway_status: Any, pid_path: Path | None) -> dict[str, Any] | None:
|
|
read_runtime_status = gateway_status.read_runtime_status
|
|
if pid_path is not None:
|
|
try:
|
|
return read_runtime_status(pid_path=pid_path)
|
|
except TypeError:
|
|
try:
|
|
return read_runtime_status(pid_path)
|
|
except TypeError:
|
|
if getattr(gateway_status, "__name__", "") == "gateway.status" or hasattr(
|
|
gateway_status,
|
|
"_read_json_file",
|
|
):
|
|
runtime_status_file = str(
|
|
getattr(gateway_status, "_RUNTIME_STATUS_FILE", _GATEWAY_RUNTIME_STATUS_FILE)
|
|
)
|
|
runtime_status = _read_runtime_status_path(pid_path.with_name(runtime_status_file))
|
|
if runtime_status is not None:
|
|
return runtime_status
|
|
return read_runtime_status()
|
|
|
|
|
|
def _gateway_running_pid(gateway_status: Any, pid_path: Path | None) -> int | None:
|
|
get_running_pid = gateway_status.get_running_pid
|
|
if pid_path is not None:
|
|
try:
|
|
return get_running_pid(pid_path=pid_path, cleanup_stale=False)
|
|
except TypeError:
|
|
try:
|
|
return get_running_pid(pid_path, cleanup_stale=False)
|
|
except TypeError:
|
|
pass
|
|
try:
|
|
return get_running_pid(cleanup_stale=False)
|
|
except TypeError:
|
|
# Older agent versions may not expose cleanup_stale. Keep compatibility.
|
|
return get_running_pid()
|
|
|
|
|
|
def _runtime_detail_subset(runtime_status: dict[str, Any] | None) -> dict[str, Any]:
|
|
"""Return only non-sensitive runtime fields for the browser.
|
|
|
|
gateway.status records argv/PID metadata so the CLI can validate process
|
|
identity. The WebUI alert only needs health semantics, never raw command
|
|
lines, paths, environment, or tokens.
|
|
"""
|
|
if not isinstance(runtime_status, dict):
|
|
return {}
|
|
|
|
details: dict[str, Any] = {}
|
|
gateway_state = runtime_status.get("gateway_state")
|
|
if isinstance(gateway_state, str) and gateway_state:
|
|
details["gateway_state"] = gateway_state
|
|
|
|
updated_at = runtime_status.get("updated_at")
|
|
if isinstance(updated_at, str) and updated_at:
|
|
details["updated_at"] = updated_at
|
|
|
|
try:
|
|
details["active_agents"] = max(0, int(runtime_status.get("active_agents") or 0))
|
|
except (TypeError, ValueError):
|
|
pass
|
|
|
|
platforms = runtime_status.get("platforms")
|
|
if isinstance(platforms, dict):
|
|
details["platform_count"] = len(platforms)
|
|
states: dict[str, int] = {}
|
|
for payload in platforms.values():
|
|
if not isinstance(payload, dict):
|
|
continue
|
|
state = payload.get("state")
|
|
if isinstance(state, str) and state:
|
|
states[state] = states.get(state, 0) + 1
|
|
if states:
|
|
details["platform_states"] = states
|
|
|
|
return details
|
|
|
|
|
|
def build_agent_health_payload() -> dict[str, Any]:
|
|
"""Return `{alive, checked_at, details}` for the Hermes gateway/agent.
|
|
|
|
`alive` is intentionally tri-state:
|
|
* True: a gateway runtime signal says the process is alive.
|
|
* False: gateway metadata exists, but no live gateway process owns it.
|
|
* None: no gateway metadata/status is available, so this WebUI setup is
|
|
probably not configured with a separate gateway process.
|
|
"""
|
|
checked_at = _checked_at()
|
|
try:
|
|
gateway_status = _gateway_status_module()
|
|
except Exception as exc:
|
|
return {
|
|
"alive": None,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "unknown",
|
|
"reason": "gateway_status_unavailable",
|
|
"error": type(exc).__name__,
|
|
},
|
|
}
|
|
|
|
gateway_pid_path = _gateway_root_pid_path()
|
|
|
|
runtime_status = None
|
|
try:
|
|
runtime_status = _read_gateway_runtime_status(gateway_status, gateway_pid_path)
|
|
except Exception:
|
|
runtime_status = None
|
|
|
|
try:
|
|
running_pid = _gateway_running_pid(gateway_status, gateway_pid_path)
|
|
except Exception:
|
|
running_pid = None
|
|
|
|
safe_details = _runtime_detail_subset(runtime_status)
|
|
if running_pid is not None:
|
|
return {
|
|
"alive": True,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "alive",
|
|
**safe_details,
|
|
},
|
|
}
|
|
|
|
# Cross-container fallback (#1879): when ``get_running_pid()`` cannot see
|
|
# the gateway because we're in a different PID namespace, a recent
|
|
# ``updated_at`` on ``gateway_state.json`` is a reliable equivalent signal
|
|
# since the gateway writes it on every tick. We only trust this fallback
|
|
# when the gateway also self-reports ``gateway_state == "running"`` so
|
|
# crash-without-cleanup scenarios still surface as "down".
|
|
if _runtime_status_is_fresh(runtime_status):
|
|
return {
|
|
"alive": True,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "alive",
|
|
"reason": "cross_container_freshness",
|
|
**safe_details,
|
|
},
|
|
}
|
|
|
|
if _runtime_status_is_stale_stopped(runtime_status):
|
|
return {
|
|
"alive": None,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "unknown",
|
|
"reason": "gateway_stale_stopped_state",
|
|
**safe_details,
|
|
},
|
|
}
|
|
|
|
if _runtime_status_is_stale_running(runtime_status):
|
|
return {
|
|
"alive": None,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "unknown",
|
|
"reason": "gateway_stale_running_state",
|
|
**safe_details,
|
|
},
|
|
}
|
|
|
|
if isinstance(runtime_status, dict):
|
|
return {
|
|
"alive": False,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "down",
|
|
"reason": "gateway_not_running",
|
|
**safe_details,
|
|
},
|
|
}
|
|
|
|
return {
|
|
"alive": None,
|
|
"checked_at": checked_at,
|
|
"details": {
|
|
"state": "unknown",
|
|
"reason": "gateway_not_configured",
|
|
},
|
|
}
|