# core/subject_normalizer.py
"""Normalize free-form subject fields into a clean, comma-separated list.

Tokens are optionally mapped to canonical spellings taken from a
``subjects.txt`` catalog (one subject per line).
"""
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

try:
    from django.conf import settings
except ImportError:
    # Django is only consulted for BASE_DIR below; everything else in this
    # module is plain text processing.  With ``settings = None`` the getattr()
    # default kicks in and the catalog is looked up relative to the current
    # working directory instead.
    settings = None

# Where we'll try to load the canonical subject catalog from.
# Put your subjects.txt at your project root (same level as manage.py).
_CATALOG_LOCATIONS = [
    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
]


def _load_subject_catalog() -> Dict[str, str]:
    """Load canonical subjects from the first readable ``subjects.txt``.

    The file is expected to hold one subject per line; blank lines are
    skipped and surrounding whitespace is stripped.

    Returns:
        A map of lowercase subject -> canonical (as written) subject.
        An empty map when no location could be read, which makes the
        normalizer skip canonicalization altogether.
    """
    for loc in _CATALOG_LOCATIONS:
        try:
            # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading
            # BOM that would otherwise be glued to the first catalog entry.
            text = loc.read_text(encoding="utf-8-sig")
        except (OSError, UnicodeError):
            # Fail open: missing/unreadable/undecodable file, try the next one.
            continue
        stripped_lines = (ln.strip() for ln in text.splitlines())
        return {ln.lower(): ln for ln in stripped_lines if ln}
    return {}


_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()

_SEP_DASH = re.compile(r"\s+[-–—]\s+")  # only split dashes when surrounded by spaces
_SEP_SEMI = re.compile(r"\s*;\s*")
_SEP_SLASH = re.compile(r"\s*/\s*")
_MULTI_SPACES = re.compile(r"\s+")


def normalize_subject_field(
    raw: str, *, catalog: Optional[Dict[str, str]] = None
) -> Tuple[str, List[str]]:
    """Turn a messy subject string into a tidy ``", "``-joined list.

    Separators (``;``, ``/``, and dashes surrounded by spaces) are converted
    to commas, inner whitespace in each token is collapsed, tokens are mapped
    to their canonical spelling when a catalog is available, and duplicates
    are dropped case-insensitively while preserving first-seen order.

    Args:
        raw: The subject text as entered. Falsy input yields ``("", [])``.
        catalog: Optional map of lowercase subject -> canonical subject.
            Defaults to the catalog loaded from ``subjects.txt`` at import
            time. An empty map disables canonicalization (and the
            "unknown subject" warnings).

    Returns:
        ``(normalized_subjects_string, warnings_list)``.
    """
    warnings: List[str] = []
    if not raw:
        return "", warnings

    if catalog is None:
        catalog = _SUBJECT_CATALOG

    stripped = raw.strip()

    # Normalize common separators to commas.
    #   • semicolons:  ;  -> ,
    #   • slashes:     /  -> ,
    #   • spaced dashes ( - / – / — with spaces around) -> ,
    # Applied one after another (not as a single alternation) so that a
    # separator produced by an earlier pass can still border a later match.
    text = stripped
    for pattern in (_SEP_SEMI, _SEP_SLASH, _SEP_DASH):
        text = pattern.sub(", ", text)

    cleaned: List[str] = []
    seen: Set[str] = set()
    for part in text.split(","):
        # Trim, then collapse inner whitespace; empty tokens (e.g. from
        # doubled commas) are dropped.
        token = _MULTI_SPACES.sub(" ", part.strip())
        if not token:
            continue

        lookup_key = token.lower()
        if catalog and lookup_key in catalog:
            canonical, unknown = catalog[lookup_key], False
        else:
            # No catalog, or token not in it: keep as typed.  Only flag it as
            # unknown when there actually is a catalog to compare against.
            canonical, unknown = token, bool(catalog)

        # Deduplicate by lowercase canonical form *before* warning, so a
        # repeated unknown subject is reported once rather than per occurrence.
        dedupe_key = canonical.lower()
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)

        if unknown:
            warnings.append(f"Unknown subject (kept as-is): {token}")
        cleaned.append(canonical)

    normalized = ", ".join(cleaned)

    # If anything changed (separators, spacing, casing, dropped duplicates),
    # add a soft warning for transparency.
    if normalized != stripped:
        warnings.append("Separators and spacing normalized.")

    return normalized, warnings