feat(openrouter): add response caching support (#19132)

Enable OpenRouter's response caching feature (beta) via X-OpenRouter-Cache
headers. When enabled, identical API requests return cached responses for
free (zero billing), reducing both latency and cost.

Configuration via config.yaml:
  openrouter:
    response_cache: true       # default: on
    response_cache_ttl: 300    # 1-86400 seconds

Changes:
- Add openrouter config section to DEFAULT_CONFIG (response_cache + TTL)
- Add build_or_headers() in auxiliary_client.py that builds attribution
  headers plus optional cache headers based on config
- Replace inline _OR_HEADERS dicts with build_or_headers() at all 5 call sites:
  run_agent.py __init__ and _apply_client_headers_for_base_url(), plus
  auxiliary_client.py _try_openrouter() (two call sites) and _to_async_client()
- Add _check_openrouter_cache_status() method to AIAgent that reads
  X-OpenRouter-Cache-Status from streaming response headers and logs
  HIT/MISS status
- Document in cli-config.yaml.example
- Add 28 tests (22 unit + 6 integration)

Ref: https://openrouter.ai/docs/guides/features/response-caching
This commit is contained in:
kshitij
2026-05-03 01:54:24 -07:00
committed by GitHub
parent 9b5b88b5e0
commit 457c7b76cd
7 changed files with 451 additions and 12 deletions
+60 -5
View File
@@ -259,13 +259,68 @@ _PROVIDERS_WITHOUT_VISION: frozenset = frozenset({
"kimi-coding-cn",
})
# OpenRouter app attribution headers
_OR_HEADERS = {
# OpenRouter app attribution headers (base — always sent)
_OR_HEADERS_BASE = {
"HTTP-Referer": "https://hermes-agent.nousresearch.com",
"X-OpenRouter-Title": "Hermes Agent",
"X-OpenRouter-Categories": "productivity,cli-agent",
}
# Truthy values for boolean env-var parsing.
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})
def build_or_headers(or_config: dict | None = None) -> dict:
"""Build OpenRouter headers, optionally including response-cache headers.
Precedence for response cache: env var > config.yaml > default (enabled).
Environment variables:
``HERMES_OPENROUTER_CACHE`` — truthy (``1``/``true``/``yes``/``on``)
enables caching; ``0``/``false``/``no``/``off`` disables.
Overrides ``openrouter.response_cache`` in config.yaml.
``HERMES_OPENROUTER_CACHE_TTL`` — integer seconds (1-86400).
Overrides ``openrouter.response_cache_ttl`` in config.yaml.
*or_config* is the ``openrouter`` section from config.yaml. When *None*,
falls back to reading config from disk via ``load_config()``.
"""
headers = dict(_OR_HEADERS_BASE)
# Resolve config from disk if not provided.
if or_config is None:
try:
from hermes_cli.config import load_config
or_config = load_config().get("openrouter", {})
except Exception:
or_config = {}
# Determine cache enabled: env var overrides config.
env_cache = os.environ.get("HERMES_OPENROUTER_CACHE", "").strip().lower()
if env_cache:
cache_enabled = env_cache in _TRUTHY_ENV_VALUES
else:
cache_enabled = or_config.get("response_cache", False)
if not cache_enabled:
return headers
headers["X-OpenRouter-Cache"] = "true"
# Determine TTL: env var overrides config.
env_ttl = os.environ.get("HERMES_OPENROUTER_CACHE_TTL", "").strip()
if env_ttl:
if env_ttl.isdigit():
ttl = int(env_ttl)
if 1 <= ttl <= 86400:
headers["X-OpenRouter-Cache-TTL"] = str(ttl)
else:
ttl = or_config.get("response_cache_ttl", 300)
if isinstance(ttl, (int, float)) and 1 <= ttl <= 86400:
headers["X-OpenRouter-Cache-TTL"] = str(int(ttl))
return headers
# Vercel AI Gateway app attribution headers. HTTP-Referer maps to
# referrerUrl and X-Title maps to appName in the gateway's analytics.
from hermes_cli import __version__ as _HERMES_VERSION
@@ -1158,14 +1213,14 @@ def _try_openrouter(explicit_api_key: str = None) -> Tuple[Optional[OpenAI], Opt
base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
logger.debug("Auxiliary client: OpenRouter via pool")
return OpenAI(api_key=or_key, base_url=base_url,
default_headers=_OR_HEADERS), _OPENROUTER_MODEL
default_headers=build_or_headers()), _OPENROUTER_MODEL
or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY")
if not or_key:
return None, None
logger.debug("Auxiliary client: OpenRouter")
return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
default_headers=_OR_HEADERS), _OPENROUTER_MODEL
default_headers=build_or_headers()), _OPENROUTER_MODEL
def _describe_openrouter_unavailable() -> str:
@@ -1911,7 +1966,7 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
}
sync_base_url = str(sync_client.base_url)
if base_url_host_matches(sync_base_url, "openrouter.ai"):
async_kwargs["default_headers"] = dict(_OR_HEADERS)
async_kwargs["default_headers"] = build_or_headers()
elif base_url_host_matches(sync_base_url, "api.githubcopilot.com"):
from hermes_cli.copilot_auth import copilot_request_headers
+12
View File
@@ -121,6 +121,18 @@ model:
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
# # data_collection: "deny"
# =============================================================================
# OpenRouter Response Caching (only applies when using OpenRouter)
# =============================================================================
# Cache identical API responses at the OpenRouter edge for free instant replays.
# When enabled, identical requests (same model, messages, parameters) return
# cached responses with zero billing. Separate from Anthropic prompt caching.
# See: https://openrouter.ai/docs/guides/features/response-caching
#
# openrouter:
# response_cache: true # Enable response caching (default: true)
# response_cache_ttl: 300 # Cache TTL in seconds, 1-86400 (default: 300)
# =============================================================================
# Git Worktree Isolation
# =============================================================================
+12
View File
@@ -644,6 +644,18 @@ DEFAULT_CONFIG = {
"cache_ttl": "5m",
},
# OpenRouter-specific settings.
# response_cache: enable OpenRouter response caching (X-OpenRouter-Cache header).
# When enabled, identical requests return cached responses for free (zero billing).
# This is separate from Anthropic prompt caching and works alongside it.
# See: https://openrouter.ai/docs/guides/features/response-caching
# response_cache_ttl: how long cached responses remain valid, in seconds (1-86400).
# Default 300 (5 minutes). Only used when response_cache is enabled.
"openrouter": {
"response_cache": True,
"response_cache_ttl": 300,
},
# AWS Bedrock provider configuration.
# Only used when model.provider is "bedrock".
"bedrock": {
+33 -7
View File
@@ -1258,6 +1258,10 @@ class AIAgent:
# after each API call. Accessed by /usage slash command.
self._rate_limit_state: Optional["RateLimitState"] = None
# OpenRouter response cache hit counter — incremented when
# X-OpenRouter-Cache-Status: HIT is seen in streaming response headers.
self._or_cache_hits: int = 0
# Centralized logging — agent.log (INFO+) and errors.log (WARNING+)
# both live under ~/.hermes/logs/. Idempotent, so gateway mode
# (which creates a new AIAgent per message) won't duplicate handlers.
@@ -1421,11 +1425,8 @@ class AIAgent:
client_kwargs["args"] = self.acp_args
effective_base = base_url
if base_url_host_matches(effective_base, "openrouter.ai"):
client_kwargs["default_headers"] = {
"HTTP-Referer": "https://hermes-agent.nousresearch.com",
"X-OpenRouter-Title": "Hermes Agent",
"X-OpenRouter-Categories": "productivity,cli-agent",
}
from agent.auxiliary_client import build_or_headers
client_kwargs["default_headers"] = build_or_headers()
elif base_url_host_matches(effective_base, "api.routermint.com"):
client_kwargs["default_headers"] = _routermint_headers()
elif base_url_host_matches(effective_base, "api.githubcopilot.com"):
@@ -4580,6 +4581,28 @@ class AIAgent:
"""Return the last captured RateLimitState, or None."""
return self._rate_limit_state
def _check_openrouter_cache_status(self, http_response: Any) -> None:
    """Read X-OpenRouter-Cache-Status from response headers and log it.

    Increments ``_or_cache_hits`` on HIT so callers can report savings.
    Any other non-empty status is logged at debug level. This method is
    best-effort: it never raises, so a malformed response cannot interrupt
    the caller.

    Args:
        http_response: Object exposing a ``headers`` mapping, or ``None``.
            Objects without a ``headers`` attribute are ignored.
    """
    headers = getattr(http_response, "headers", None)
    if not headers:
        return
    try:
        status = headers.get("x-openrouter-cache-status")
        if not status:
            return
        # Compare case-insensitively and tolerate stray whitespace.
        status = status.strip().upper()
        if status == "HIT":
            # Tolerate instances created without __init__ (counter missing).
            self._or_cache_hits = getattr(self, "_or_cache_hits", 0) + 1
            logger.info("OpenRouter response cache HIT (total: %d)", self._or_cache_hits)
        else:
            logger.debug("OpenRouter response cache %s", status)
    except Exception:
        # Never let header parsing break the agent loop, but leave a trace
        # instead of swallowing the failure silently.
        logger.debug("Could not read OpenRouter cache status header", exc_info=True)
def get_activity_summary(self) -> dict:
"""Return a snapshot of the agent's current activity for diagnostics.
@@ -6157,10 +6180,10 @@ class AIAgent:
return True
def _apply_client_headers_for_base_url(self, base_url: str) -> None:
from agent.auxiliary_client import _AI_GATEWAY_HEADERS, _OR_HEADERS
from agent.auxiliary_client import _AI_GATEWAY_HEADERS, build_or_headers
if base_url_host_matches(base_url, "openrouter.ai"):
self._client_kwargs["default_headers"] = dict(_OR_HEADERS)
self._client_kwargs["default_headers"] = build_or_headers()
elif base_url_host_matches(base_url, "ai-gateway.vercel.sh"):
self._client_kwargs["default_headers"] = dict(_AI_GATEWAY_HEADERS)
elif base_url_host_matches(base_url, "api.routermint.com"):
@@ -6780,6 +6803,9 @@ class AIAgent:
# response via .response before any chunks are consumed.
self._capture_rate_limits(getattr(stream, "response", None))
# Log OpenRouter response cache status when present.
self._check_openrouter_cache_status(getattr(stream, "response", None))
content_parts: list = []
tool_calls_acc: dict = {}
tool_gen_notified: set = set()
@@ -0,0 +1,284 @@
"""Tests for OpenRouter response caching header injection."""
from types import SimpleNamespace
from unittest.mock import patch
import pytest
# ---------------------------------------------------------------------------
# build_or_headers
# ---------------------------------------------------------------------------
class TestBuildOrHeaders:
    """Test the build_or_headers() helper in agent/auxiliary_client.py."""

    @pytest.fixture(autouse=True)
    def _clear_cache_env(self, monkeypatch):
        """Remove the cache env overrides before every test in this class.

        The environment variables take precedence over ``or_config``, so a
        value exported in the developer's shell or CI would otherwise change
        the outcome of these config-driven tests.
        """
        monkeypatch.delenv("HERMES_OPENROUTER_CACHE", raising=False)
        monkeypatch.delenv("HERMES_OPENROUTER_CACHE_TTL", raising=False)

    def test_base_attribution_always_present(self):
        """Attribution headers must always be included regardless of cache setting."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": False})
        assert headers["HTTP-Referer"] == "https://hermes-agent.nousresearch.com"
        assert headers["X-OpenRouter-Title"] == "Hermes Agent"
        assert headers["X-OpenRouter-Categories"] == "productivity,cli-agent"

    def test_cache_enabled(self):
        """When response_cache is True, X-OpenRouter-Cache header is set."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True})
        assert headers["X-OpenRouter-Cache"] == "true"

    def test_cache_disabled(self):
        """When response_cache is False, no cache header is sent."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": False})
        assert "X-OpenRouter-Cache" not in headers
        assert "X-OpenRouter-Cache-TTL" not in headers

    def test_cache_disabled_by_default_empty_config(self):
        """Empty config dict means no cache headers (response_cache defaults to False)."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={})
        assert "X-OpenRouter-Cache" not in headers

    def test_ttl_default(self):
        """Default TTL (300) is included when cache is enabled."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 300})
        assert headers["X-OpenRouter-Cache-TTL"] == "300"

    def test_ttl_custom(self):
        """Custom TTL values within range are sent."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 3600})
        assert headers["X-OpenRouter-Cache-TTL"] == "3600"

    def test_ttl_max(self):
        """Maximum TTL (86400) is accepted."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 86400})
        assert headers["X-OpenRouter-Cache-TTL"] == "86400"

    def test_ttl_out_of_range_too_high(self):
        """TTL above 86400 is silently ignored (no TTL header sent)."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 100000})
        assert "X-OpenRouter-Cache-TTL" not in headers
        # But cache is still enabled
        assert headers["X-OpenRouter-Cache"] == "true"

    def test_ttl_out_of_range_zero(self):
        """TTL of 0 is below minimum — no TTL header sent."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 0})
        assert "X-OpenRouter-Cache-TTL" not in headers

    def test_ttl_negative(self):
        """Negative TTL is ignored."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": -5})
        assert "X-OpenRouter-Cache-TTL" not in headers

    def test_ttl_not_a_number(self):
        """Non-numeric TTL is ignored."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": "five"})
        assert "X-OpenRouter-Cache-TTL" not in headers

    def test_ttl_float_truncated(self):
        """Float TTL values are truncated to int."""
        from agent.auxiliary_client import build_or_headers

        headers = build_or_headers(or_config={"response_cache": True, "response_cache_ttl": 600.7})
        assert headers["X-OpenRouter-Cache-TTL"] == "600"

    def test_returns_fresh_dict(self):
        """Each call returns a new dict so mutations don't leak."""
        from agent.auxiliary_client import build_or_headers

        cfg = {"response_cache": True}
        h1 = build_or_headers(or_config=cfg)
        h2 = build_or_headers(or_config=cfg)
        assert h1 is not h2
        assert h1 == h2

    def test_none_config_falls_back_to_load_config(self):
        """When or_config is None, build_or_headers reads from load_config()."""
        from agent.auxiliary_client import build_or_headers

        fake_cfg = {
            "openrouter": {"response_cache": True, "response_cache_ttl": 900},
        }
        with patch("hermes_cli.config.load_config", return_value=fake_cfg):
            headers = build_or_headers(or_config=None)
        assert headers["X-OpenRouter-Cache"] == "true"
        assert headers["X-OpenRouter-Cache-TTL"] == "900"

    def test_none_config_load_config_fails_gracefully(self):
        """When load_config() fails, build_or_headers still returns base headers."""
        from agent.auxiliary_client import build_or_headers

        with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
            headers = build_or_headers(or_config=None)
        # Should have base attribution but no cache headers
        assert "HTTP-Referer" in headers
        assert "X-OpenRouter-Cache" not in headers
# ---------------------------------------------------------------------------
# Environment variable overrides
# ---------------------------------------------------------------------------
class TestEnvVarOverrides:
    """Environment variables must win over the config.yaml cache settings."""

    def test_env_enables_cache(self, monkeypatch):
        """A truthy HERMES_OPENROUTER_CACHE turns caching on despite config."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", "true")
        result = build_or_headers(or_config={"response_cache": False})
        assert result["X-OpenRouter-Cache"] == "true"

    def test_env_disables_cache(self, monkeypatch):
        """A falsy HERMES_OPENROUTER_CACHE turns caching off despite config."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", "false")
        result = build_or_headers(or_config={"response_cache": True})
        assert "X-OpenRouter-Cache" not in result

    @pytest.mark.parametrize("value", ["1", "true", "TRUE", "yes", "Yes", "on"])
    def test_truthy_values(self, monkeypatch, value):
        """Every recognised truthy spelling enables caching."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", value)
        assert build_or_headers(or_config={})["X-OpenRouter-Cache"] == "true"

    @pytest.mark.parametrize("value", ["0", "false", "no", "off", "maybe", ""])
    def test_non_truthy_values(self, monkeypatch, value):
        """Anything outside the truthy set never enables caching."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", value)
        # An empty variable defers to config, so the config has to disable
        # the cache; every other value must beat a config that enables it.
        config_enables_cache = value != ""
        result = build_or_headers(or_config={"response_cache": config_enables_cache})
        assert "X-OpenRouter-Cache" not in result

    def test_env_ttl_overrides_config(self, monkeypatch):
        """HERMES_OPENROUTER_CACHE_TTL replaces the TTL from config."""
        from agent.auxiliary_client import build_or_headers

        for name, setting in (
            ("HERMES_OPENROUTER_CACHE", "true"),
            ("HERMES_OPENROUTER_CACHE_TTL", "1800"),
        ):
            monkeypatch.setenv(name, setting)
        result = build_or_headers(or_config={"response_cache_ttl": 300})
        assert result["X-OpenRouter-Cache-TTL"] == "1800"

    @pytest.mark.parametrize("ttl", ["0", "86401", "abc", "-1", "12.5"])
    def test_invalid_env_ttl_dropped(self, monkeypatch, ttl):
        """A bad env TTL is discarded while the cache itself stays enabled."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", "1")
        monkeypatch.setenv("HERMES_OPENROUTER_CACHE_TTL", ttl)
        result = build_or_headers(or_config={})
        assert result["X-OpenRouter-Cache"] == "true"
        assert "X-OpenRouter-Cache-TTL" not in result

    @pytest.mark.parametrize("ttl", ["1", "300", "86400"])
    def test_valid_env_ttl_boundaries(self, monkeypatch, ttl):
        """The lower bound, the default and the upper bound are all accepted."""
        from agent.auxiliary_client import build_or_headers

        monkeypatch.setenv("HERMES_OPENROUTER_CACHE", "yes")
        monkeypatch.setenv("HERMES_OPENROUTER_CACHE_TTL", ttl)
        result = build_or_headers(or_config={})
        assert result["X-OpenRouter-Cache-TTL"] == ttl

    def test_no_env_vars_falls_through_to_config(self, monkeypatch):
        """With both variables unset, the config values are used as-is."""
        from agent.auxiliary_client import build_or_headers

        for name in ("HERMES_OPENROUTER_CACHE", "HERMES_OPENROUTER_CACHE_TTL"):
            monkeypatch.delenv(name, raising=False)
        config = {"response_cache": True, "response_cache_ttl": 600}
        result = build_or_headers(or_config=config)
        assert result["X-OpenRouter-Cache"] == "true"
        assert result["X-OpenRouter-Cache-TTL"] == "600"
class TestDefaultConfig:
    """DEFAULT_CONFIG must ship an ``openrouter`` section with cache defaults."""

    def test_openrouter_section_exists(self):
        """The section is present, enables caching and uses a 300 s TTL."""
        from hermes_cli.config import DEFAULT_CONFIG

        assert "openrouter" in DEFAULT_CONFIG
        section = DEFAULT_CONFIG["openrouter"]
        assert section["response_cache"] is True
        assert section["response_cache_ttl"] == 300
# ---------------------------------------------------------------------------
# _check_openrouter_cache_status
# ---------------------------------------------------------------------------
class TestCheckOpenrouterCacheStatus:
    """Exercise AIAgent._check_openrouter_cache_status in isolation."""

    def _make_agent(self):
        """Return an AIAgent built without __init__, with a zeroed hit counter."""
        from run_agent import AIAgent

        # __init__ is bypassed on purpose; only the counter is needed here.
        bare_agent = object.__new__(AIAgent)
        bare_agent._or_cache_hits = 0
        return bare_agent

    def test_hit_increments_counter(self):
        """Each HIT response adds one to the counter."""
        agent = self._make_agent()
        response = SimpleNamespace(headers={"x-openrouter-cache-status": "HIT"})
        for expected_total in (1, 2):
            agent._check_openrouter_cache_status(response)
            assert agent._or_cache_hits == expected_total

    def test_miss_does_not_increment(self):
        """A MISS leaves the counter untouched."""
        agent = self._make_agent()
        response = SimpleNamespace(headers={"x-openrouter-cache-status": "MISS"})
        agent._check_openrouter_cache_status(response)
        assert getattr(agent, "_or_cache_hits", 0) == 0

    def test_no_header_is_noop(self):
        """A response without the status header changes nothing."""
        agent = self._make_agent()
        agent._check_openrouter_cache_status(SimpleNamespace(headers={}))
        assert getattr(agent, "_or_cache_hits", 0) == 0

    def test_none_response_is_safe(self):
        """Passing None must not raise."""
        self._make_agent()._check_openrouter_cache_status(None)

    def test_no_headers_attr_is_safe(self):
        """An object lacking ``headers`` must not raise."""
        self._make_agent()._check_openrouter_cache_status(object())

    def test_case_insensitive(self):
        """A lowercase ``hit`` value is counted like ``HIT``."""
        agent = self._make_agent()
        response = SimpleNamespace(headers={"x-openrouter-cache-status": "hit"})
        agent._check_openrouter_cache_status(response)
        assert agent._or_cache_hits == 1
@@ -81,3 +81,51 @@ def test_unknown_base_url_clears_default_headers(mock_openai):
agent._apply_client_headers_for_base_url("https://api.example.com/v1")
assert "default_headers" not in agent._client_kwargs
@patch("run_agent.OpenAI")
def test_openrouter_headers_include_response_cache_when_enabled(mock_openai, monkeypatch):
    """When openrouter.response_cache is True, the cache header is injected."""
    # The cache env vars override config.yaml, so clear them to keep the
    # patched config in control regardless of the caller's environment.
    monkeypatch.delenv("HERMES_OPENROUTER_CACHE", raising=False)
    monkeypatch.delenv("HERMES_OPENROUTER_CACHE_TTL", raising=False)

    mock_openai.return_value = MagicMock()
    agent = AIAgent(
        api_key="test-key",
        base_url="https://openrouter.ai/api/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )

    with patch("hermes_cli.config.load_config", return_value={
        "openrouter": {"response_cache": True, "response_cache_ttl": 600},
    }):
        agent._apply_client_headers_for_base_url("https://openrouter.ai/api/v1")

    headers = agent._client_kwargs["default_headers"]
    assert headers["HTTP-Referer"] == "https://hermes-agent.nousresearch.com"
    assert headers["X-OpenRouter-Cache"] == "true"
    assert headers["X-OpenRouter-Cache-TTL"] == "600"
@patch("run_agent.OpenAI")
def test_openrouter_headers_no_cache_when_disabled(mock_openai, monkeypatch):
    """When openrouter.response_cache is False, no cache headers are sent."""
    # The cache env vars override config.yaml; an exported
    # HERMES_OPENROUTER_CACHE=1 would otherwise re-enable caching here.
    monkeypatch.delenv("HERMES_OPENROUTER_CACHE", raising=False)
    monkeypatch.delenv("HERMES_OPENROUTER_CACHE_TTL", raising=False)

    mock_openai.return_value = MagicMock()
    agent = AIAgent(
        api_key="test-key",
        base_url="https://openrouter.ai/api/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )

    with patch("hermes_cli.config.load_config", return_value={
        "openrouter": {"response_cache": False},
    }):
        agent._apply_client_headers_for_base_url("https://openrouter.ai/api/v1")

    headers = agent._client_kwargs["default_headers"]
    assert headers["HTTP-Referer"] == "https://hermes-agent.nousresearch.com"
    assert "X-OpenRouter-Cache" not in headers
    assert "X-OpenRouter-Cache-TTL" not in headers
@@ -14,6 +14,8 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
|----------|-------------|
| `OPENROUTER_API_KEY` | OpenRouter API key (recommended for flexibility) |
| `OPENROUTER_BASE_URL` | Override the OpenRouter-compatible base URL |
| `HERMES_OPENROUTER_CACHE` | Enable or disable OpenRouter response caching: `1`/`true`/`yes`/`on` enables it; any other non-empty value (e.g. `0`/`false`/`no`/`off`) disables it. Overrides `openrouter.response_cache` in config.yaml. See [Response Caching](https://openrouter.ai/docs/guides/features/response-caching). |
| `HERMES_OPENROUTER_CACHE_TTL` | Cache TTL in seconds (1-86400). Overrides `openrouter.response_cache_ttl` in config.yaml. |
| `NOUS_BASE_URL` | Override Nous Portal base URL (rarely needed; development/testing only) |
| `NOUS_INFERENCE_BASE_URL` | Override Nous inference endpoint directly |
| `AI_GATEWAY_API_KEY` | Vercel AI Gateway API key ([ai-gateway.vercel.sh](https://ai-gateway.vercel.sh)) |