fix: tighten title language detection

This commit is contained in:
ai-ag2026
2026-05-28 08:21:58 +02:00
parent 5528e2c579
commit 2aeebf56ac
3 changed files with 29 additions and 3 deletions
+4
View File
@@ -3,6 +3,10 @@
## [Unreleased]
### Fixed
- Title-language detection no longer treats common English tech/jargon text such as "session die" or DAS/DER references as German just because of shared tokens. (Refs #3040)
## [v0.51.152] — 2026-05-28 — Release DX (stage-batch34 — single-PR optional gateway-backed browser chat)
### Added
+3 -3
View File
@@ -1385,12 +1385,12 @@ def _detect_title_language(text: str) -> str:
return ''
german_markers = {
'warum', 'werden', 'wird', 'wurde', 'hier', 'nicht', 'mehr', 'alte', 'alten',
'bilder', 'angezeigt', 'session', 'prüfe', 'ich', 'die', 'der', 'das', 'den',
'und', 'oder', 'mit', 'für', 'von', 'zu', 'ist', 'sind', 'bitte', 'kannst',
'bilder', 'angezeigt', 'prüfe', 'ich', 'und', 'oder', 'mit', 'für', 'von',
'zu', 'ist', 'sind', 'bitte', 'kannst',
}
tokens = re.findall(r'[A-Za-zÀ-ÖØ-öø-ÿ]+', s)
german_hits = sum(1 for tok in tokens if tok in german_markers)
if re.search(r'[äöüß]', s) or german_hits >= 2:
if re.search(r'[äöüß]', s) or german_hits >= 3:
return 'de'
return ''
+22
View File
@@ -229,6 +229,28 @@ class TestGenerateTitleRawViaAuxTimeout(unittest.TestCase):
self.assertIn('Match the language of the user question', messages[0]['content'])
self.assertIn('If the user writes German, output a German title', messages[0]['content'])
def test_title_language_detection_avoids_english_tech_false_positives(self):
    """Shared tokens alone must not make English tech/jargon text count as German."""
    from api.streaming import _detect_title_language

    # English sentences whose words ("die", "DAS", "DER") are also spelled
    # like German articles; each must yield the empty "no language" result.
    english_samples = (
        'Why did the session die after the DAS storage failover?',
        'The session can die when DAS storage disconnects.',
        'Debug the session and DER certificate import failure.',
    )
    for sample in english_samples:
        # One sub-test per sentence so a single failure names its input.
        with self.subTest(text=sample):
            detected = _detect_title_language(sample)
            self.assertEqual(detected, '')
def test_title_language_detection_keeps_german_without_umlaut(self):
    """A German sentence with no umlauts must still be reported as 'de'."""
    from api.streaming import _detect_title_language

    # The sentence contains no ä/ö/ü/ß, so detection has to come from the
    # words themselves.
    german_question = 'Warum werden hier die Bilder der alten Session nicht angezeigt?'
    detected = _detect_title_language(german_question)
    self.assertEqual(detected, 'de')
def test_german_source_rejects_english_aux_title(self):
"""Regression: an English aux title must not overwrite a German conversation."""
from api.streaming import _generate_llm_session_title_via_aux