hermes-agent/scripts/benchmark_browser_eval.py

"""Quick benchmark: subprocess eval vs supervisor-WS eval.

Runs both paths against the same live Chrome and prints a comparison table.
Not a pytest — a script you run manually for the PR description.

Usage:
    .venv/bin/python scripts/benchmark_browser_eval.py [--iterations N]
"""
from __future__ import annotations

import argparse
import shutil
import statistics
import subprocess
import sys
import tempfile
import time
import urllib.request
import json


def _find_chrome() -> str:
    for c in ("google-chrome", "chromium", "chromium-browser"):
        p = shutil.which(c)
        if p:
            return p
    print("No Chrome binary found.", file=sys.stderr)
    sys.exit(1)


def _start_chrome(port: int):
    profile = tempfile.mkdtemp(prefix="hermes-bench-eval-")
    proc = subprocess.Popen(
        [
            _find_chrome(),
            f"--remote-debugging-port={port}",
            f"--user-data-dir={profile}",
            "--no-first-run",
            "--no-default-browser-check",
            "--headless=new",
            "--disable-gpu",
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    deadline = time.monotonic() + 15
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"http://127.0.0.1:{port}/json/version", timeout=1) as r:
                info = json.loads(r.read().decode())
                return proc, profile, info["webSocketDebuggerUrl"]
        except Exception:
            time.sleep(0.25)
    proc.terminate()
    raise RuntimeError("Chrome didn't expose CDP")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--iterations", type=int, default=50)
    parser.add_argument("--port", type=int, default=9333)
    args = parser.parse_args()

    proc, profile, cdp_url = _start_chrome(args.port)
    try:
        from tools.browser_supervisor import SUPERVISOR_REGISTRY

        # Warm up: start the supervisor, navigate to a page.
        supervisor = SUPERVISOR_REGISTRY.get_or_start(
            task_id="bench-eval", cdp_url=cdp_url
        )
        # Give it a moment to attach.
        time.sleep(1.0)

        # Sanity check: one eval over WS should succeed.
        sanity = supervisor.evaluate_runtime("1 + 1")
        if not sanity.get("ok") or sanity.get("result") != 2:
            print(f"sanity check failed: {sanity}", file=sys.stderr)
            sys.exit(2)

        # ── Bench 1: supervisor WS path ──────────────────────────────────
        ws_times: list[float] = []
        for _ in range(args.iterations):
            t0 = time.monotonic()
            out = supervisor.evaluate_runtime("1 + 1")
            t1 = time.monotonic()
            assert out.get("ok"), out
            ws_times.append((t1 - t0) * 1000)

        # ── Bench 2: agent-browser subprocess path ────────────────────────
        # Skip if agent-browser isn't installed — the WS bench still tells
        # us what we need.
        if shutil.which("agent-browser") is None and shutil.which("npx") is None:
            print("agent-browser CLI not found — skipping subprocess bench.")
            sub_times = []
        else:
            from tools.browser_tool import _run_browser_command, _last_session_key
            task_id = _last_session_key("bench-eval")
            sub_times = []
            for _ in range(args.iterations):
                t0 = time.monotonic()
                _run_browser_command(task_id, "eval", ["1 + 1"])
                t1 = time.monotonic()
                sub_times.append((t1 - t0) * 1000)

        def fmt(name: str, ts: list[float]) -> str:
            if not ts:
                return f"  {name:<40} (skipped)"
            mean = statistics.mean(ts)
            median = statistics.median(ts)
            mn, mx = min(ts), max(ts)
            return (
                f"  {name:<40} mean={mean:>7.2f}ms  median={median:>7.2f}ms  "
                f"min={mn:>7.2f}ms  max={mx:>7.2f}ms"
            )

        print()
        print(f"browser_eval benchmark — {args.iterations} iterations of `1 + 1`")
        print("-" * 90)
        print(fmt("supervisor WS (Runtime.evaluate)", ws_times))
        print(fmt("agent-browser subprocess (eval)", sub_times))
        if ws_times and sub_times:
            speedup = statistics.mean(sub_times) / statistics.mean(ws_times)
            print()
            print(f"Speedup: {speedup:.1f}x (mean)")

    finally:
        SUPERVISOR_REGISTRY.stop_all()
        proc.terminate()
        try:
            proc.wait(timeout=3)
        except Exception:
            proc.kill()
        shutil.rmtree(profile, ignore_errors=True)


if __name__ == "__main__":
    main()