# core/subject_normalizer.py import re from typing import Iterable, Tuple, List, Optional # Split ONLY on: # - semicolons (with optional surrounding spaces) # - dashes that have spaces on both sides: " - ", " – ", " — " SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+") TRIGGER_RE = re.compile(r";|\s[–—-]\s") # used to decide whether to normalize at all def _dedupe_preserve_order(items: Iterable[str]) -> List[str]: seen = set() out: List[str] = [] for it in items: k = it.casefold() if k not in seen: out.append(it) seen.add(k) return out def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]: """ Normalize the subject line ONLY when it contains a semicolon or a spaced dash. - Split on ';' or spaced dashes (" - ", " – ", " — ") - Trim each part - Remove empties - De-duplicate (case-insensitive) while preserving order - Join with ', ' as the canonical separator - Keep original casing and wording - If there is no trigger, return the original unchanged. Returns: (normalized_value, warnings_list) """ warnings: List[str] = [] s = (original or "").strip() if not s: return s, warnings # Only touch lines that clearly use the "bad" separators if not TRIGGER_RE.search(s): # No semicolons or spaced dashes -> leave untouched return s, warnings # Split only on our allowed separators parts = [p.strip() for p in SPLIT_RE.split(s)] parts = [p for p in parts if p] # drop empties # Optional allow-list support (not required for this narrow change). # If you pass a set of known multi-word subjects, we simply don't split on commas here anyway. # We already only split on our "bad" separators above. if allowed_multiword: # Nothing special to do here, since we didn't split on commas. pass parts = _dedupe_preserve_order(parts) # Reassemble to canonical comma-separated format normalized = ", ".join(parts) return normalized, warnings