From 3f23431bb77d92e9d24947cb9440a072684ca445 Mon Sep 17 00:00:00 2001 From: happy5318 Date: Mon, 4 May 2026 07:17:25 +0800 Subject: [PATCH 1/2] Fix: add TCP keepalive to prevent CLOSE-WAIT zombie connections (v2) - Add server_bind() to QuietHTTPServer with SO_REUSEADDR and TCP keepalive - Add setup() to Handler for per-connection aggressive keepalive - Server level: 60s idle, 10s interval, 3 probes = 90s detection - Connection level: 10s idle, 5s interval, 3 probes = 25s detection - Prevents zombie connections from blocking API on long-running servers - Cross-platform safe with try/except for platforms without TCP_KEEP* constants Fixes #1580 --- server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/server.py b/server.py index 2aee95cb..7f3ad495 100644 --- a/server.py +++ b/server.py @@ -27,6 +27,19 @@ class QuietHTTPServer(ThreadingHTTPServer): daemon_threads = True request_queue_size = 64 + def server_bind(self): + """Set socket options to prevent TIME_WAIT and CLOSE-WAIT accumulation.""" + # Enable address reuse to avoid "Address already in use" errors + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + # Enable TCP keepalive to detect dead connections (Linux) + try: + self.socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 60) # Start probing after 60s idle + self.socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 10) # Probe every 10s + self.socket.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3) # Drop after 3 failed probes + except (OSError, AttributeError): + pass # TCP_KEEP* may not be available on all platforms + super().server_bind() + def handle_error(self, request, client_address): """Override to suppress logging for common client disconnect errors.""" exc_type, exc_value, _ = sys.exc_info() @@ -48,6 +61,21 @@ class QuietHTTPServer(ThreadingHTTPServer): class Handler(BaseHTTPRequestHandler): timeout = 30 # seconds — kills idle/incomplete connections to prevent thread exhaustion 
+ + def setup(self): + """Set additional socket options for each connection.""" + super().setup() + # Enable TCP keepalive on the connection socket (not just server socket) + try: + import socket + self.connection.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) # Disable Nagle's algorithm + # Aggressive keepalive: start after 10s idle, probe every 5s, drop after 3 failures + self.connection.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 10) + self.connection.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 5) + self.connection.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3) + self.connection.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) + except (OSError, AttributeError): + pass # May not be available on all platforms _ver_suffix = WEBUI_VERSION.removeprefix('v') server_version = ('HermesWebUI/' + _ver_suffix) if _ver_suffix != 'unknown' else 'HermesWebUI' def log_message(self, fmt, *args): pass # suppress default Apache-style log From 59a6c6bc15329e5af99c4ae55d0022d49e968902 Mon Sep 17 00:00:00 2001 From: Hermes Release Agent Date: Sun, 3 May 2026 23:50:09 +0000 Subject: [PATCH 2/2] =?UTF-8?q?release:=20stamp=20v0.50.289=20=E2=80=94=20?= =?UTF-8?q?TCP=20keepalive=20on=20accepted=20connections=20(#1581)=20?= =?UTF-8?q?=E2=80=94=204094=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 21 +++++++++++++++++++++ ROADMAP.md | 2 +- TESTING.md | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f77419b..ac948853 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # Hermes Web UI -- Changelog +## [v0.50.289] — 2026-05-03 + +### Fixed (1 PR — TCP keepalive on accepted connections — closes #1580) + +- **TCP keepalive on accepted connections to clean up dead `CLOSE-WAIT` sockets** (#1581 by @happy5318; closes #1580) — reporter (also @happy5318) observed `CLOSE-WAIT` zombie connections accumulating on long-running 
Linux WebUI servers (`ss -tn | grep 8787 | grep CLOSE-WAIT` showing nonzero counts after extended uptime). Without TCP keepalive enabled, a thread blocked in `recv()` waiting for the next request on an HTTP/1.0-or-1.1 keep-alive socket has no way to detect a peer that crashed, lost its network, or otherwise disappeared without sending FIN — the socket sits in `ESTABLISHED` indefinitely until the kernel reclaims it on idle thresholds far higher than necessary. **Fix (load-bearing):** new `Handler.setup()` override in `server.py` that, on every accepted connection, sets `SO_KEEPALIVE=1` (the master switch that enables TCP keepalive on this socket), `TCP_NODELAY=1` (disables Nagle for HTTP small-burst latency), and the keepalive timing parameters `TCP_KEEPIDLE=10` / `TCP_KEEPINTVL=5` / `TCP_KEEPCNT=3` → kernel starts probing a connection idle for 10s, probes every 5s, drops after 3 failed probes (~25s detection). All setsockopts wrapped in a single `try/except (OSError, AttributeError)` for graceful no-op on platforms where `TCP_KEEP*` constants aren't available (macOS, Windows). Healthy SSE streams only send their existing app-level `: keepalive\\n\\n` heartbeat every 30s, which is longer than the 10s idle threshold, so the kernel does probe them between heartbeats — but a live peer ACKs each probe, which resets the failure count, so healthy long-lived connections are never dropped; only sockets whose peer has stopped responding get cleaned up. The PR additionally adds a `QuietHTTPServer.server_bind()` block that sets `SO_REUSEADDR` (already the default via `allow_reuse_address=True`, so redundant) and listening-socket `TCP_KEEP*` (no-op without `SO_KEEPALIVE` on the listening socket — child sockets don't inherit keepalive parameters from the listener on Linux). Reviewer flagged that block as harmless dead code; deferred cleanup to follow-up issue along with macOS-doesn't-get-SO_KEEPALIVE behavior (the entire `try` block aborts on the first `AttributeError` from `TCP_KEEPIDLE`, so macOS dev servers get TCP_NODELAY but not the keepalive master switch). 
Linux is the production target and gets the full benefit. + +### Tests + +4094 → **4094 passing** (no new tests; kernel-level networking change is impractical to test in unit suite without a multi-process integration fixture). 0 regressions. Full suite in 110s. + +### Pre-release verification + +- Independent reviewer (nesquena, APPROVED) traced end-to-end: per-connection `Handler.setup()` is the load-bearing change; `SO_KEEPALIVE=1` is the master switch; 10/5/3 timing produces ~25s detection; healthy SSE streams' 30s app keepalive resets the kernel idle timer so probes never escalate on healthy connections; security audit clean (no XSS, SSRF, auth, path traversal, eval, shell — pure socket-options change); race-free (`server_bind` once at startup, `setup` per-connection on the request thread). +- Pre-release Opus advisor: **SHIP AS-IS** — no MUST-FIX. All 5 verification questions check out (race-free per-thread `Handler` lifecycle, kernel-keepalive death raises `OSError(ETIMEDOUT)` which is in both `_CLIENT_DISCONNECT_ERRORS` AND `QuietHTTPServer.handle_error`'s errno-110 suppress list, HTTP/1.0 churn impact negligible at 5 setsockopts per accept, swallow of `OSError`/`AttributeError` defensible for hotfix scope, dead-code cleanup in `server_bind()` correctly deferred to follow-up). +- Full suite: **4094 passed, 2 skipped, 3 xpassed, 0 failed** in 110s. +- Syntax: `py_compile server.py` → OK. + +### Maintainer in-stage actions + +- **PR rebase** (REBASE-DEFAULT rule): PR base was 111 commits behind `origin/master` (forked at `6c3ff3ff`, pre-v0.50.275). Rebased onto current master. Clean, no conflicts. Re-tested on rebased branch → 4094 passed, no regressions. + ## [v0.50.288] — 2026-05-03 ### Fixed (3 PRs — picker symmetry + cron profile isolation — closes #1567, #1568, #1573) diff --git a/ROADMAP.md b/ROADMAP.md index 141344da..d078905b 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > Web companion to the Hermes Agent CLI. 
Same workflows, browser-native. > -> Last updated: v0.50.288 (May 03, 2026) — 4094 tests collected +> Last updated: v0.50.289 (May 03, 2026) — 4094 tests collected > Test source: `pytest tests/ --collect-only -q` > Per-version detail: see [CHANGELOG.md](./CHANGELOG.md) diff --git a/TESTING.md b/TESTING.md index 43263518..77dc10d8 100644 --- a/TESTING.md +++ b/TESTING.md @@ -1835,7 +1835,7 @@ Bridged CLI sessions: --- -*Last updated: v0.50.288, May 03, 2026* +*Last updated: v0.50.289, May 03, 2026* *Total automated tests collected: 4094* *Regression gate: tests/test_regressions.py* *Run: pytest tests/ -v --timeout=60*