diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fad2de4..3e8ea2d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ - Visible but unfocused chat windows now still attempt the immediate SSE reconnect for the current session; only a real session switch skips the reconnect path. (Refs #3040) +- Gateway-backed WebUI chat now forwards current-turn image attachments as OpenAI-style multimodal `image_url` parts when native image input is enabled, matching the legacy WebUI runtime's image handoff. + ## [v0.51.152] — 2026-05-28 — Release DX (stage-batch34 — single-PR optional gateway-backed browser chat) ### Added diff --git a/api/gateway_chat.py b/api/gateway_chat.py index 1c658abb..69e653ac 100644 --- a/api/gateway_chat.py +++ b/api/gateway_chat.py @@ -202,10 +202,19 @@ def _run_gateway_chat_streaming( # Scope Gateway long-term continuity to this WebUI conversation # without exposing the browser's auth cookie or CSRF material. headers["X-Hermes-Session-Key"] = f"webui:{session_id}" + message_content: Any = str(msg_text or "") + if attachments: + try: + from api.streaming import _build_native_multimodal_message + + message_content = _build_native_multimodal_message("", str(msg_text or ""), attachments, str(workspace), cfg=cfg) + except Exception: + logger.debug("Failed to build gateway multimodal attachment payload", exc_info=True) + message_content = str(msg_text or "") body = { "model": model or "default", "stream": True, - "messages": [{"role": "user", "content": str(msg_text or "")}], + "messages": [{"role": "user", "content": message_content}], } if model_provider: body["provider"] = model_provider diff --git a/tests/test_webui_gateway_chat_backend.py b/tests/test_webui_gateway_chat_backend.py index e2bed326..f9c7f94d 100644 --- a/tests/test_webui_gateway_chat_backend.py +++ b/tests/test_webui_gateway_chat_backend.py @@ -1,4 +1,6 @@ from collections import OrderedDict +import base64 +import json import api.gateway_chat as gateway_chat import api.models as models @@ -117,3 +119,56 @@ def test_gateway_chat_worker_translates_sse_and_persists_session(tmp_path, monke assert captured["headers"]["X-hermes-session-id"] == s.session_id assert captured["headers"]["X-hermes-session-key"] == f"webui:{s.session_id}" assert '"stream": true' in captured["body"] + + +def test_gateway_chat_worker_forwards_image_attachments_as_multimodal_parts(tmp_path, monkeypatch): + session_dir = tmp_path / "sessions" + session_dir.mkdir() + monkeypatch.setattr(models, "SESSION_DIR", session_dir) + monkeypatch.setattr(models, "SESSION_INDEX_FILE", session_dir / "_index.json") + monkeypatch.setattr(models, "SESSIONS", OrderedDict()) + + image_bytes = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+/p9sAAAAASUVORK5CYII=" + ) + image_path = tmp_path / "photo.png" + image_path.write_bytes(image_bytes) + captured = {} + + class FakeResponse: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def __iter__(self): + yield b'data: {"choices":[{"delta":{"content":"saw it"}}]}\n\n' + yield b'data: [DONE]\n\n' + + def fake_urlopen(req, timeout=0): + captured["body"] = json.loads(req.data.decode("utf-8")) + return FakeResponse() + + monkeypatch.setenv("HERMES_WEBUI_GATEWAY_BASE_URL", "http://gateway.local") + monkeypatch.setattr(gateway_chat.urllib.request, "urlopen", fake_urlopen) + + s = new_session() + stream_id = "stream-gateway-image-test" + s.active_stream_id = stream_id + s.save() + STREAMS[stream_id] = create_stream_channel() + + gateway_chat._run_gateway_chat_streaming( + s.session_id, + "What is in this image?", + "test-model", + str(tmp_path), + stream_id, + [{"path": str(image_path), "mime": "image/png", "is_image": True}], + ) + + content = captured["body"]["messages"][0]["content"] + assert content[0] == {"type": "text", "text": "What is in this image?"} + assert content[1]["type"] == "image_url" + assert content[1]["image_url"]["url"].startswith("data:image/png;base64,")