diff --git a/web/core/subject_normalizer.py b/web/core/subject_normalizer.py
index 304a541..c1e0869 100644
--- a/web/core/subject_normalizer.py
+++ b/web/core/subject_normalizer.py
@@ -1,105 +1,61 @@
 # core/subject_normalizer.py
 import re
-from typing import List, Tuple, Dict, Set
-from django.conf import settings
-from pathlib import Path
+from typing import Iterable, Tuple, List, Optional
 
-# Where we'll try to load the canonical subject catalog from.
-# Put your subjects.txt at your project root (same level as manage.py).
-_CATALOG_LOCATIONS = [
-    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
-    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
-]
+# Split ONLY on:
+# - semicolons (with optional surrounding spaces)
+# - dashes that have spaces on both sides: " - ", " – ", " — "
+SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
+TRIGGER_RE = re.compile(r";|\s[–—-]\s")  # used to decide whether to normalize at all
 
-def _load_subject_catalog() -> Dict[str, str]:
+def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
+    seen = set()
+    out: List[str] = []
+    for it in items:
+        k = it.casefold()
+        if k not in seen:
+            out.append(it)
+            seen.add(k)
+    return out
+
+def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
     """
-    Load canonical subjects from subjects.txt, one per line.
-    Returns a map of lowercase -> canonical (original) string.
-    If no file found, returns empty map (safe no-op behavior).
-    """
-    for loc in _CATALOG_LOCATIONS:
-        try:
-            if loc.exists():
-                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
-                catalog = {ln.lower(): ln for ln in lines if ln}
-                return catalog
-        except Exception:
-            # Fail open: just continue to next location
-            pass
-    return {}
+    Normalize the subject line ONLY when it contains a semicolon or a spaced dash.
+    - Split on ';' or spaced dashes (" - ", " – ", " — ")
+    - Trim each part
+    - Remove empties
+    - De-duplicate (case-insensitive) while preserving order
+    - Join with ', ' as the canonical separator
+    - Keep original casing and wording
+    - If there is no trigger, return the original unchanged.
 
-_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
-
-_SEP_DASH = re.compile(r"\s+[-–—]\s+")  # only split dashes when surrounded by spaces
-_SEP_SEMI = re.compile(r"\s*;\s*")
-_SEP_SLASH = re.compile(r"\s*/\s*")
-_MULTI_SPACES = re.compile(r"\s+")
-
-def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
-    """
-    Convert messy subject separators to commas, lightly tidy tokens,
-    dedupe while preserving order, and map tokens to a canonical form
-    using subjects.txt when available.
-
-    Returns: (normalized_subjects_string, warnings_list)
+    Returns: (normalized_value, warnings_list)
     """
     warnings: List[str] = []
-    if not raw:
-        return "", warnings
+    s = (original or "").strip()
 
-    original = raw
+    if not s:
+        return s, warnings
 
-    s = raw.strip()
+    # Only touch lines that clearly use the "bad" separators
+    if not TRIGGER_RE.search(s):
+        # No semicolons or spaced dashes -> leave untouched
+        return s, warnings
 
-    # Normalize common separators to commas.
-    # • semicolons: ; -> ,
-    # • slashes: / -> ,
-    # • spaced dashes ( - / – / — with spaces around): -> ,
-    s = _SEP_SEMI.sub(", ", s)
-    s = _SEP_SLASH.sub(", ", s)
-    s = _SEP_DASH.sub(", ", s)
+    # Split only on our allowed separators
+    parts = [p.strip() for p in SPLIT_RE.split(s)]
+    parts = [p for p in parts if p]  # drop empties
 
-    # Also normalize stray multiple commas/spaces into single commas with space
-    # (we'll split on comma anyway, but this helps avoid empty tokens)
-    s = re.sub(r"\s*,\s*", ",", s)
+    # Optional allow-list support (not required for this narrow change).
+    # If you pass a set of known multi-word subjects, we simply don't split on commas here anyway.
+    # We already only split on our "bad" separators above.
+    if allowed_multiword:
+        # Nothing special to do here, since we didn't split on commas.
+        pass
 
-    # Split by comma
-    parts = [p for p in s.split(",") if p is not None]
+    parts = _dedupe_preserve_order(parts)
 
-    cleaned: List[str] = []
-    seen: Set[str] = set()
-
-    for p in parts:
-        t = p.strip()
-        if not t:
-            continue
-
-        # Collapse inner whitespace
-        t = _MULTI_SPACES.sub(" ", t)
-
-        # If we have a catalog, map by lowercase for consistent canonicalization
-        low = t.lower()
-        if _SUBJECT_CATALOG:
-            if low in _SUBJECT_CATALOG:
-                t_canon = _SUBJECT_CATALOG[low]
-            else:
-                # Not in catalog — keep as typed, but note once
-                t_canon = t
-                warnings.append(f"Unknown subject (kept as-is): {t}")
-        else:
-            # No catalog available — keep token as typed
-            t_canon = t
-
-        # Deduplicate by lowercase
-        if t_canon.lower() in seen:
-            continue
-        seen.add(t_canon.lower())
-        cleaned.append(t_canon)
-
-    normalized = ", ".join(cleaned)
-
-    # If we changed separators or spacing, add a soft warning for transparency
-    if normalized != original.strip():
-        warnings.append("Separators and spacing normalized.")
+    # Reassemble to canonical comma-separated format
+    normalized = ", ".join(parts)
 
     return normalized, warnings
\ No newline at end of file
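For reviewers: a quick behavioural sketch of the new function, derived from the docstring above. This snippet is not part of the PR; the import path `core.subject_normalizer` and the sample values are assumptions for illustration.

    # Hypothetical check script, not included in this change.
    from core.subject_normalizer import normalize_subject_field

    # Semicolons trigger normalization; empty parts are dropped:
    assert normalize_subject_field("Math; Science;; Art")[0] == "Math, Science, Art"

    # Spaced dashes (hyphen, en dash, em dash) are separators too:
    assert normalize_subject_field("History - Geography")[0] == "History, Geography"

    # Hyphenated words are left alone (no spaces around the dash, so no trigger):
    assert normalize_subject_field("Self-Help")[0] == "Self-Help"

    # De-duplication is case-insensitive and keeps the first spelling:
    assert normalize_subject_field("Art; art; ART")[0] == "Art"

    # Slashes are no longer triggers, so this comes back unchanged:
    assert normalize_subject_field("Fiction / Fantasy")[0] == "Fiction / Fantasy"

Note the behavioural differences from the old catalog-based version: slashes and bare commas no longer cause any rewriting, and the returned warnings list is now always empty (the signature is kept so existing callers keep working).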