# core/subject_normalizer.py
import re
from pathlib import Path
from typing import Dict, List, Set, Tuple

from django.conf import settings

# Where we'll try to load the canonical subject catalog from.
# Put your subjects.txt at your project root (same level as manage.py).
_CATALOG_LOCATIONS = [
    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
]


def _load_subject_catalog() -> Dict[str, str]:
    """
    Load canonical subjects from subjects.txt, one per line.
    Returns a map of lowercase -> canonical (original) string.
    If no file is found, returns an empty map (safe no-op behavior).
    """
    for loc in _CATALOG_LOCATIONS:
        try:
            if loc.exists():
                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
                return {ln.lower(): ln for ln in lines if ln}
        except Exception:
            # Fail open: just continue to the next location
            continue
    return {}


_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
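
# Illustrative example (hypothetical file contents, not shipped with this
# module): a subjects.txt containing the lines
#     Mathematics
#     Computer Science
# would load as {"mathematics": "Mathematics",
#                "computer science": "Computer Science"}.
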
_SEP_DASH = re.compile(r"\s+[-–—]\s+") # only split dashes when surrounded by spaces
_SEP_SEMI = re.compile(r"\s*;\s*")
_SEP_SLASH = re.compile(r"\s*/\s*")
_MULTI_SPACES = re.compile(r"\s+")
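
# Quick illustration of the patterns above (assumed behavior, shown for
# clarity): _SEP_DASH only matches dashes with surrounding spaces, so
# hyphenated names survive intact.
#     _SEP_SEMI.sub(", ", "Math;Science")     -> "Math, Science"
#     _SEP_SLASH.sub(", ", "Math / Science")  -> "Math, Science"
#     _SEP_DASH.sub(", ", "Math - Science")   -> "Math, Science"
#     _SEP_DASH leaves "Graeco-Roman Studies" unchanged (no spaced dash).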


def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
    """
    Convert messy subject separators to commas, lightly tidy tokens,
    dedupe while preserving order, and map tokens to a canonical form
    using subjects.txt when available.

    Returns: (normalized_subjects_string, warnings_list)
    """
    warnings: List[str] = []
    if not raw:
        return "", warnings

    original = raw
    s = raw.strip()

    # Normalize common separators to commas:
    # • semicolons: ; -> ,
    # • slashes: / -> ,
    # • spaced dashes (- / – / — with spaces around them) -> ,
    s = _SEP_SEMI.sub(", ", s)
    s = _SEP_SLASH.sub(", ", s)
    s = _SEP_DASH.sub(", ", s)

    # Strip whitespace around commas so splitting below yields clean tokens.
    # (Repeated commas still produce empty tokens; those are dropped in the
    # loop below.)
    s = re.sub(r"\s*,\s*", ",", s)
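    # Illustrative state at this point (assuming the substitutions above):
    # "Math; Science / Tech - History" has become "Math,Science,Tech,History".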

    # Split by comma
    parts = s.split(",")

    cleaned: List[str] = []
    seen: Set[str] = set()
    for p in parts:
        t = p.strip()
        if not t:
            continue
        # Collapse inner whitespace
        t = _MULTI_SPACES.sub(" ", t)
        low = t.lower()

        # Deduplicate by lowercase; doing this first also ensures a repeated
        # unknown token is warned about only once.
        if low in seen:
            continue
        seen.add(low)

        # If we have a catalog, map by lowercase for consistent
        # canonicalization; otherwise keep the token as typed.
        if _SUBJECT_CATALOG:
            if low in _SUBJECT_CATALOG:
                t = _SUBJECT_CATALOG[low]
            else:
                # Not in catalog: keep as typed, but note it
                warnings.append(f"Unknown subject (kept as-is): {t}")
        cleaned.append(t)
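
    # Example (illustrative, assuming no catalog is loaded): the tokens
    # ["Math", "math ", "", "Art"] reduce to cleaned == ["Math", "Art"].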

    normalized = ", ".join(cleaned)

    # If we changed separators or spacing, add a soft warning for transparency
    if normalized != original.strip():
        warnings.append("Separators and spacing normalized.")

    return normalized, warnings
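

# Minimal usage sketch, not part of the original module. Running it requires
# a configured Django settings module (DJANGO_SETTINGS_MODULE), since the
# catalog path is derived from settings.BASE_DIR at import time; the sample
# inputs below are hypothetical.
if __name__ == "__main__":
    samples = [
        "Math; Science / History - Art",
        "math, Math,  MATH",
        "",
    ]
    for sample in samples:
        normalized, warns = normalize_subject_field(sample)
        print(f"{sample!r} -> {normalized!r}  warnings={warns}")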