# utils/sanitize.py
import os
import re
from typing import Iterable

DEFAULT_MARKERS = ("user:", "assistant:", "system:", "human:")


def _markers_from_env() -> Iterable[str]:
    """Return the dialogue markers to strip, honoring LAB_SANITIZE_MARKERS.

    The environment variable may list markers separated by commas,
    semicolons and/or whitespace. DEFAULT_MARKERS is returned when the
    variable is unset, empty, or contains nothing but separators.
    """
    raw = os.getenv("LAB_SANITIZE_MARKERS")
    if not raw:
        return DEFAULT_MARKERS
    # comma/semicolon/space separated; drop the empty pieces produced by
    # leading/trailing separators.
    parts = tuple(p for p in re.split(r"[,\s;]+", raw) if p)
    # A value such as " " or ",;," yields no markers at all. Fall back to the
    # defaults rather than handing the caller an empty marker set.
    return parts or DEFAULT_MARKERS


def sanitize_output(response: str) -> str:
    """Remove hallucinated dialogue markers and all text that follows.

    The response is cut at the first occurrence of any configured marker
    (e.g. 'user:', 'assistant:') and the remaining prefix is stripped of
    surrounding whitespace. Markers are matched case-insensitively and are
    configurable via LAB_SANITIZE_MARKERS.

    Args:
        response: Raw model output. Falsy values are returned unchanged.

    Returns:
        The text preceding the first marker, stripped; or the whole
        response, stripped, when no marker occurs.
    """
    if not response:
        return response

    # Markers are escaped so they are matched literally, never as regex.
    alternatives = "|".join(re.escape(m) for m in _markers_from_env() if m)
    if not alternatives:
        # An empty pattern matches at position 0 and would erase the whole
        # response; with nothing to look for, only trim whitespace.
        return response.strip()

    # NOTE: markers match anywhere in the text, including at the end of a
    # longer word (e.g. "superuser:").
    match = re.search(r"(?:" + alternatives + r")", response, flags=re.IGNORECASE)
    if match is None:
        return response.strip()
    return response[:match.start()].strip()