Illustrations/web/core/subject_normalizer.py

61 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/subject_normalizer.py
import re
from typing import Iterable, Tuple, List, Optional
# Split ONLY on:
# - semicolons (with optional surrounding whitespace)
# - dashes that have whitespace on both sides: " - ", " – ", " — "
#   (hyphen-minus, en dash U+2013, em dash U+2014)
# Commas and unspaced dashes (e.g. "well-being") never match.
SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
# Cheap pre-check: matches exactly when SPLIT_RE would find at least one
# separator (any semicolon, or a dash with whitespace on each side).
TRIGGER_RE = re.compile(r";|\s[–—-]\s") # used to decide whether to normalize at all
def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
seen = set()
out: List[str] = []
for it in items:
k = it.casefold()
if k not in seen:
out.append(it)
seen.add(k)
return out
def normalize_subject_field(
    original: Optional[str],
    allowed_multiword: Optional[Iterable[str]] = None,
) -> Tuple[str, List[str]]:
    """
    Normalize a subject line ONLY when it contains a semicolon or a spaced dash.

    When such a separator is present:
    - Split on ';' or on a dash with whitespace on both sides (" - ", " – ", " — ")
    - Trim each part
    - Remove empties
    - De-duplicate (case-insensitive) while preserving order
    - Join with ', ' as the canonical separator
    - Keep original casing and wording

    When there is no such separator, the value is returned with only its
    leading/trailing whitespace stripped. ``None`` and blank input yield ''.

    Args:
        original: Raw subject value; may be None.
        allowed_multiword: Accepted for backward compatibility but unused.
            Commas are never split on, so multi-word subjects need no
            allow-list. The argument is never inspected or iterated, which
            keeps array-like or one-shot iterables safe to pass.

    Returns:
        (normalized_value, warnings_list). The warnings list is a fresh list
        on every call and is currently always empty.
    """
    warnings: List[str] = []
    s = (original or "").strip()
    # Blank input, or no semicolon / spaced dash -> leave untouched (beyond the strip).
    if not s or not TRIGGER_RE.search(s):
        return s, warnings
    # Split only on the "bad" separators, trim each piece, and drop empties
    # (e.g. from a trailing ';' or two separators in a row).
    parts = [p for p in (piece.strip() for piece in SPLIT_RE.split(s)) if p]
    # Reassemble to the canonical comma-separated format.
    return ", ".join(_dedupe_preserve_order(parts)), warnings