Add web/core/subject_normalizer.py

2025-08-16 16:44:03 +00:00
parent 291c278a37
commit 1b3e587149
1 changed files with 105 additions and 0 deletions
@@ -0,0 +1,105 @@
+# core/subject_normalizer.py
+import re
+from typing import List, Tuple, Dict, Set
+from django.conf import settings
+from pathlib import Path
+
+# Candidate locations for the canonical subject catalog, tried in order.
+# First choice: subjects.txt under settings.BASE_DIR (the current directory
+# is used when BASE_DIR is not set). Put the file at your project root,
+# next to manage.py. A copy beside this module is an optional fallback.
+_CATALOG_LOCATIONS = [
+    Path(getattr(settings, "BASE_DIR", ".")).joinpath("subjects.txt"),
+    Path(__file__).resolve().parent.joinpath("subjects.txt"),
+]
+
+def _load_subject_catalog() -> Dict[str, str]:
+    """
+    Load canonical subjects from the first readable subjects.txt.
+
+    Each entry of _CATALOG_LOCATIONS is tried in order. The file is expected
+    to hold one subject per line; surrounding whitespace is stripped and
+    blank lines are ignored. A leading UTF-8 byte-order mark is discarded so
+    it cannot end up glued to the first subject.
+
+    Returns a map of lowercase -> canonical (original) string. When two
+    lines differ only by case, the later line wins.
+    If no file can be read, returns an empty map (safe no-op behavior).
+    """
+    # Imported locally so this function does not rely on a module-level import.
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    for loc in _CATALOG_LOCATIONS:
+        try:
+            # "utf-8-sig" decodes plain UTF-8 identically but drops a BOM.
+            text = loc.read_text(encoding="utf-8-sig")
+        except (FileNotFoundError, NotADirectoryError):
+            # A missing file is the normal case for an unused location.
+            continue
+        except (OSError, ValueError) as exc:
+            # Fail open, but leave a trace: the file exists yet is unreadable
+            # or not valid UTF-8 (UnicodeDecodeError is a ValueError).
+            logger.warning("Could not read subject catalog %s: %s", loc, exc)
+            continue
+
+        stripped = (ln.strip() for ln in text.splitlines())
+        return {ln.lower(): ln for ln in stripped if ln}
+
+    return {}
+
+# Separator patterns applied to a raw subject string.
+_SEP_SEMI = re.compile(r"\s*;\s*")
+_SEP_SLASH = re.compile(r"\s*/\s*")
+# A hyphen, en dash or em dash is a separator only when it has whitespace
+# on both sides, so hyphenated words are left alone.
+_SEP_DASH = re.compile(r"\s+[-–—]\s+")
+# Any run of whitespace; used to collapse spacing inside a single token.
+_MULTI_SPACES = re.compile(r"\s+")
+
+# Loaded once at import time. An empty map means no catalog was found.
+_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
+
+def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
+    """
+    Normalize a free-form subject string into a comma-separated list.
+
+    Semicolons, slashes and whitespace-surrounded dashes are converted to
+    commas. Each token is trimmed and its inner whitespace collapsed to
+    single spaces. When a subject catalog is loaded, tokens are mapped
+    case-insensitively to their canonical spelling; tokens missing from the
+    catalog are kept as typed. Duplicates (compared case-insensitively) are
+    dropped, keeping the first occurrence and its position.
+
+    Args:
+        raw: The subject text as entered. A falsy value yields ("", []).
+
+    Returns: (normalized_subjects_string, warnings_list)
+        The string joins the surviving tokens with ", ". The warnings hold
+        one "Unknown subject" entry per distinct token that is not in a
+        non-empty catalog, followed by a soft note when the result differs
+        from the stripped input.
+    """
+    warnings: List[str] = []
+    if not raw:
+        return "", warnings
+
+    # Normalize common separators to commas.
+    #  • semicolons: ;  -> ,
+    #  • slashes:   /   -> ,
+    #  • spaced dashes ( - / – / — with spaces around): -> ,
+    s = raw.strip()
+    s = _SEP_SEMI.sub(", ", s)
+    s = _SEP_SLASH.sub(", ", s)
+    s = _SEP_DASH.sub(", ", s)
+
+    cleaned: List[str] = []
+    seen: Set[str] = set()
+
+    # Whitespace around commas needs no separate pass: every token is
+    # stripped below, and empty tokens (from ",," or a trailing comma)
+    # are skipped.
+    for part in s.split(","):
+        token = _MULTI_SPACES.sub(" ", part.strip())
+        if not token:
+            continue
+
+        # If we have a catalog, map by lowercase for consistent
+        # canonicalization; otherwise keep the token as typed.
+        low = token.lower()
+        known = low in _SUBJECT_CATALOG
+        canonical = _SUBJECT_CATALOG[low] if known else token
+
+        # Deduplicate by lowercase *before* warning, so an unknown subject
+        # that is repeated in the input is reported only once.
+        key = canonical.lower()
+        if key in seen:
+            continue
+        seen.add(key)
+        cleaned.append(canonical)
+
+        if _SUBJECT_CATALOG and not known:
+            warnings.append(f"Unknown subject (kept as-is): {token}")
+
+    normalized = ", ".join(cleaned)
+
+    # If anything changed (separators, spacing, casing, duplicates), add a
+    # soft warning for transparency.
+    if normalized != raw.strip():
+        warnings.append("Separators and spacing normalized.")
+
+    return normalized, warnings