# core/subject_normalizer.py
"""Normalize free-form subject strings into a canonical comma-separated form.

Canonical subject names are loaded from a ``subjects.txt`` catalog (one
subject per line) when one is present; without a catalog the module still
works, keeping tokens as typed.
"""
import re
from pathlib import Path
from typing import Dict, List, Set, Tuple


def _catalog_locations() -> List[Path]:
    """Return candidate paths for the subject catalog, most preferred first.

    Django's settings are imported lazily and defensively so this module
    stays importable (and testable) when Django is absent or unconfigured —
    consistent with the "fail open" behavior of the catalog loader.
    """
    locations: List[Path] = []
    try:
        from django.conf import settings

        # Preferred location: project root (same level as manage.py).
        locations.append(Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt")
    except Exception:
        # No usable Django settings — fall through to the local fallback.
        pass
    # Optional fallback: a subjects.txt sitting next to this module.
    locations.append(Path(__file__).resolve().parent / "subjects.txt")
    return locations


def _load_subject_catalog() -> Dict[str, str]:
    """
    Load canonical subjects from subjects.txt, one per line.

    Returns:
        A map of lowercase subject -> canonical (original-case) string.
        An empty map when no readable file is found (safe no-op behavior).
    """
    for loc in _catalog_locations():
        try:
            if loc.exists():
                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
                return {ln.lower(): ln for ln in lines if ln}
        except Exception:
            # Fail open: unreadable candidate — just try the next location.
            pass
    return {}


# Loaded once at import time; empty when no catalog file is present.
_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()

# Separator patterns normalized to commas.  Dashes count as separators only
# when surrounded by spaces, so hyphenated subjects ("Anglo-Saxon") survive.
_SEP_DASH = re.compile(r"\s+[-–—]\s+")
_SEP_SEMI = re.compile(r"\s*;\s*")
_SEP_SLASH = re.compile(r"\s*/\s*")
_MULTI_SPACES = re.compile(r"\s+")
# Whitespace around commas, collapsed before splitting to avoid empty tokens.
_COMMA_RUN = re.compile(r"\s*,\s*")


def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
    """
    Convert messy subject separators to commas, lightly tidy tokens,
    dedupe while preserving order, and map tokens to a canonical form
    using subjects.txt when available.

    Args:
        raw: User-supplied subject string; may use ';', '/', or spaced
            dashes (- – —) as separators.  Falsy input yields ("", []).

    Returns:
        (normalized_subjects_string, warnings_list)
    """
    warnings: List[str] = []
    if not raw:
        return "", warnings

    original = raw
    s = raw.strip()

    # Normalize common separators to commas:
    #   semicolons (;), slashes (/), and spaced dashes ( - / – / — ).
    s = _SEP_SEMI.sub(", ", s)
    s = _SEP_SLASH.sub(", ", s)
    s = _SEP_DASH.sub(", ", s)

    # Tighten whitespace around commas so splitting yields clean tokens.
    s = _COMMA_RUN.sub(",", s)

    cleaned: List[str] = []
    seen: Set[str] = set()
    warned: Set[str] = set()  # distinct unknown tokens already reported

    for part in s.split(","):
        # Trim and collapse inner whitespace; skip empty tokens.
        token = _MULTI_SPACES.sub(" ", part.strip())
        if not token:
            continue

        # Map by lowercase for consistent canonicalization when a catalog
        # exists; otherwise keep the token as typed.
        low = token.lower()
        if _SUBJECT_CATALOG and low in _SUBJECT_CATALOG:
            canonical = _SUBJECT_CATALOG[low]
        else:
            canonical = token
            # Bug fix: warn once per distinct unknown token (the old code
            # emitted one warning per occurrence despite saying "note once").
            if _SUBJECT_CATALOG and low not in warned:
                warned.add(low)
                warnings.append(f"Unknown subject (kept as-is): {token}")

        # Deduplicate case-insensitively, preserving first-seen order.
        if canonical.lower() in seen:
            continue
        seen.add(canonical.lower())
        cleaned.append(canonical)

    normalized = ", ".join(cleaned)

    # Soft transparency warning whenever the output differs from the input.
    if normalized != original.strip():
        warnings.append("Separators and spacing normalized.")

    return normalized, warnings