Update web/core/subject_normalizer.py

2025-08-16 16:57:05 +00:00
parent a007922ead
commit 8de9020545
1 changed files with 45 additions and 89 deletions
@@ -1,105 +1,61 @@
 # core/subject_normalizer.py
 import re
-from typing import List, Tuple, Dict, Set
-from django.conf import settings
-from pathlib import Path
+from typing import Iterable, Tuple, List, Optional

-# Where we'll try to load the canonical subject catalog from.
-# Put your subjects.txt at your project root (same level as manage.py).
-_CATALOG_LOCATIONS = [
-    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
-    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
-]
+# Split ONLY on:
+#  - semicolons (with optional surrounding whitespace)
+#  - dashes (hyphen, en dash, em dash) with whitespace on both sides: " - ", " – ", " — "
+SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
+TRIGGER_RE = re.compile(r";|\s[–—-]\s")  # matches exactly when SPLIT_RE would find a separator; used to decide whether to normalize at all

-def _load_subject_catalog() -> Dict[str, str]:
+def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:  # drop case-insensitive duplicates; the first occurrence wins
+    seen = set()  # casefolded keys of the items already emitted
+    out: List[str] = []  # surviving items, in their original order
+    for it in items:
+        k = it.casefold()  # caseless comparison key, so "Math" and "MATH" collide
+        if k not in seen:
+            out.append(it)  # keep the spelling of the first occurrence
+            seen.add(k)
+    return out
+
+def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
    """
-    Load canonical subjects from subjects.txt, one per line.
-    Returns a map of lowercase -> canonical (original) string.
-    If no file found, returns empty map (safe no-op behavior).
-    """
-    for loc in _CATALOG_LOCATIONS:
-        try:
-            if loc.exists():
-                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
-                catalog = {ln.lower(): ln for ln in lines if ln}
-                return catalog
-        except Exception:
-            # Fail open: just continue to next location
-            pass
-    return {}
+    Normalize the subject line ONLY when it contains a semicolon or a spaced dash.
+    - Split on ';' or spaced dashes (" - ", " – ", " — ")
+    - Trim each part
+    - Remove empties
+    - De-duplicate (case-insensitive) while preserving order
+    - Join with ', ' as the canonical separator
+    - Keep original casing and wording
+    - If there is no trigger, return the input unchanged apart from stripping
+      leading/trailing whitespace (None becomes "").

-_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
-
-_SEP_DASH = re.compile(r"\s+[-–—]\s+")      # only split dashes when surrounded by spaces
-_SEP_SEMI = re.compile(r"\s*;\s*")
-_SEP_SLASH = re.compile(r"\s*/\s*")
-_MULTI_SPACES = re.compile(r"\s+")
-
-def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
-    """
-    Convert messy subject separators to commas, lightly tidy tokens,
-    dedupe while preserving order, and map tokens to a canonical form
-    using subjects.txt when available.
-
-    Returns: (normalized_subjects_string, warnings_list)
+    Returns: (normalized_value, warnings_list)
    """
    warnings: List[str] = []
-    if not raw:
-        return "", warnings
+    s = (original or "").strip()

-    original = raw
+    if not s:
+        return s, warnings

-    s = raw.strip()
+    # Only touch lines that clearly use the "bad" separators
+    if not TRIGGER_RE.search(s):
+        # No semicolons or spaced dashes -> leave untouched
+        return s, warnings

-    # Normalize common separators to commas.
-    #  • semicolons: ;  -> ,
-    #  • slashes:   /   -> ,
-    #  • spaced dashes ( - / – / — with spaces around): -> ,
-    s = _SEP_SEMI.sub(", ", s)
-    s = _SEP_SLASH.sub(", ", s)
-    s = _SEP_DASH.sub(", ", s)
+    # Split only on our allowed separators
+    parts = [p.strip() for p in SPLIT_RE.split(s)]
+    parts = [p for p in parts if p]  # drop empties

-    # Also normalize stray multiple commas/spaces into single commas with space
-    # (we'll split on comma anyway, but this helps avoid empty tokens)
-    s = re.sub(r"\s*,\s*", ",", s)
+    # NOTE: `allowed_multiword` currently has no effect (the branch below is a no-op).
+    # Multi-word subjects stay intact regardless, because we split only on the
+    # "bad" separators above and never on commas or plain spaces.
+    if allowed_multiword:
+        # Nothing special to do here, since we didn't split on commas.
+        pass

-    # Split by comma
-    parts = [p for p in s.split(",") if p is not None]
+    parts = _dedupe_preserve_order(parts)

-    cleaned: List[str] = []
-    seen: Set[str] = set()
-
-    for p in parts:
-        t = p.strip()
-        if not t:
-            continue
-
-        # Collapse inner whitespace
-        t = _MULTI_SPACES.sub(" ", t)
-
-        # If we have a catalog, map by lowercase for consistent canonicalization
-        low = t.lower()
-        if _SUBJECT_CATALOG:
-            if low in _SUBJECT_CATALOG:
-                t_canon = _SUBJECT_CATALOG[low]
-            else:
-                # Not in catalog — keep as typed, but note once
-                t_canon = t
-                warnings.append(f"Unknown subject (kept as-is): {t}")
-        else:
-            # No catalog available — keep token as typed
-            t_canon = t
-
-        # Deduplicate by lowercase
-        if t_canon.lower() in seen:
-            continue
-        seen.add(t_canon.lower())
-        cleaned.append(t_canon)
-
-    normalized = ", ".join(cleaned)
-
-    # If we changed separators or spacing, add a soft warning for transparency
-    if normalized != original.strip():
-        warnings.append("Separators and spacing normalized.")
+    # Reassemble to canonical comma-separated format
+    normalized = ", ".join(parts)

    return normalized, warnings