# Illustrations/web/core/subject_normalizer.py

# core/subject_normalizer.py
import re
from typing import Iterable, Tuple, List, Optional

# Separator patterns used when normalizing a subject line.
#
# Split ONLY on:
#  - semicolons (with optional surrounding whitespace)
#  - dashes that have whitespace on both sides: " - ", " – ", " — "
#    (ASCII hyphen, en dash, em dash)
#
# Commas never match, and neither does a dash that lacks whitespace on either
# side (e.g. "well-being"), so such text stays inside a single part.
#
# SPLIT_RE consumes the whitespace around a separator so the resulting pieces
# come out already trimmed at the split points.
SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
# TRIGGER_RE is the cheap "does this line contain any such separator?" check.
TRIGGER_RE = re.compile(r";|\s[–—-]\s")  # used to decide whether to normalize at all

def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    seen = set()
    out: List[str] = []
    for it in items:
        k = it.casefold()
        if k not in seen:
            out.append(it)
            seen.add(k)
    return out

def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
    """
    Normalize a subject line that uses semicolons or spaced dashes as separators.

    The input is always stripped of leading/trailing whitespace, and ``None``
    is treated as an empty string. Beyond that, the value is rewritten ONLY
    when it contains a trigger: a semicolon, or a hyphen / en dash / em dash
    with whitespace on both sides. When a trigger is present:
    - Split on ';' or spaced dashes (" - ", " – ", " — ")
    - Trim each part
    - Remove empties
    - De-duplicate (case-insensitive) while preserving order
    - Join with ', ' as the canonical separator
    - Keep original casing and wording

    Commas are never split on, so a part that already contains commas is kept
    (and compared for de-duplication) as one piece.

    Args:
        original: Raw subject value; may be ``None`` or blank.
        allowed_multiword: Accepted for backward compatibility and currently
            ignored -- nothing here splits on commas or spaces, so multi-word
            subjects are never broken apart.

    Returns: (normalized_value, warnings_list)
        ``warnings_list`` is currently always empty.
    """
    warnings: List[str] = []
    s = (original or "").strip()

    # Blank input, or no semicolon / spaced dash: hand back the stripped value.
    if not s or not TRIGGER_RE.search(s):
        return s, warnings

    # Split only on the "bad" separators, then trim and drop empty pieces
    # (e.g. from a trailing ';' or two adjacent separators).
    pieces = (piece.strip() for piece in SPLIT_RE.split(s))
    parts = [piece for piece in pieces if piece]

    # Reassemble to the canonical comma-separated format.
    normalized = ", ".join(_dedupe_preserve_order(parts))

    return normalized, warnings