# core/subject_normalizer.py
|
||
import re
|
||
from typing import Iterable, Tuple, List, Optional
|
||
|
||
# Split ONLY on:
# - semicolons (with optional surrounding spaces)
# - dashes that have spaces on both sides: " - ", " – ", " — "
# Hyphens without surrounding whitespace (e.g. "well-known") and commas are
# never treated as separators.
SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
# Used to decide whether to normalize at all: it matches exactly when
# SPLIT_RE would find at least one separator (a ';' or a dash with
# whitespace on both sides).
TRIGGER_RE = re.compile(r";|\s[–—-]\s")
|
||
|
||
def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """
    Return the items with case-insensitive duplicates removed.

    The first occurrence of each value is kept with its original casing,
    and the relative order of the kept items is preserved. Comparison uses
    str.casefold().
    """
    seen = set()
    out: List[str] = []
    for it in items:
        k = it.casefold()
        if k not in seen:
            out.append(it)
            seen.add(k)
    return out
|
||
|
||
def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
    """
    Normalize the subject line ONLY when it contains a semicolon or a spaced dash.

    When a trigger is present:
    - Split on ';' or spaced dashes (" - ", " – ", " — ")
    - Trim each part
    - Remove empties
    - De-duplicate (case-insensitive) while preserving order
    - Join with ', ' as the canonical separator
    - Keep original casing and wording

    When there is no trigger, the value is returned with only its
    leading/trailing whitespace stripped. None or blank input yields "".

    Args:
        original: Raw subject field; None is treated as an empty string.
        allowed_multiword: Accepted for interface compatibility but currently
            unused: commas are never split on, so multi-word subjects need no
            special protection here.

    Returns: (normalized_value, warnings_list). The warnings list is always
        empty in the current implementation.
    """
    warnings: List[str] = []
    s = (original or "").strip()

    # Only touch lines that clearly use the "bad" separators; empty input and
    # input without semicolons or spaced dashes is returned as-is (stripped).
    if not s or not TRIGGER_RE.search(s):
        return s, warnings

    # Split only on our allowed separators, then trim and drop empties
    # (e.g. from a leading/trailing ';' or from ';;').
    trimmed = (piece.strip() for piece in SPLIT_RE.split(s))
    parts = _dedupe_preserve_order(p for p in trimmed if p)

    # Reassemble to canonical comma-separated format
    normalized = ", ".join(parts)

    return normalized, warnings