Update web/core/subject_normalizer.py

Joshua Laymon 2025-08-16 16:57:05 +00:00
parent a007922ead
commit 8de9020545

@@ -1,105 +1,61 @@
 # core/subject_normalizer.py
 import re
-from typing import List, Tuple, Dict, Set
-from django.conf import settings
-from pathlib import Path
-
-# Where we'll try to load the canonical subject catalog from.
-# Put your subjects.txt at your project root (same level as manage.py).
-_CATALOG_LOCATIONS = [
-    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
-    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
-]
-
-def _load_subject_catalog() -> Dict[str, str]:
-    """
-    Load canonical subjects from subjects.txt, one per line.
-    Returns a map of lowercase -> canonical (original) string.
-    If no file found, returns empty map (safe no-op behavior).
-    """
-    for loc in _CATALOG_LOCATIONS:
-        try:
-            if loc.exists():
-                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
-                catalog = {ln.lower(): ln for ln in lines if ln}
-                return catalog
-        except Exception:
-            # Fail open: just continue to next location
-            pass
-    return {}
-
-_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
-
-_SEP_DASH = re.compile(r"\s+[-–—]\s+")  # only split dashes when surrounded by spaces
-_SEP_SEMI = re.compile(r"\s*;\s*")
-_SEP_SLASH = re.compile(r"\s*/\s*")
-_MULTI_SPACES = re.compile(r"\s+")
-
-def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
-    """
-    Convert messy subject separators to commas, lightly tidy tokens,
-    dedupe while preserving order, and map tokens to a canonical form
-    using subjects.txt when available.
-
-    Returns: (normalized_subjects_string, warnings_list)
-    """
-    warnings: List[str] = []
-    if not raw:
-        return "", warnings
-
-    original = raw
-    s = raw.strip()
-
-    # Normalize common separators to commas.
-    # • semicolons: ; -> ,
-    # • slashes: / -> ,
-    # • spaced dashes ( - / – / — with spaces around): -> ,
-    s = _SEP_SEMI.sub(", ", s)
-    s = _SEP_SLASH.sub(", ", s)
-    s = _SEP_DASH.sub(", ", s)
-
-    # Also normalize stray multiple commas/spaces into single commas with space
-    # (we'll split on comma anyway, but this helps avoid empty tokens)
-    s = re.sub(r"\s*,\s*", ",", s)
-
-    # Split by comma
-    parts = [p for p in s.split(",") if p is not None]
-
-    cleaned: List[str] = []
-    seen: Set[str] = set()
-    for p in parts:
-        t = p.strip()
-        if not t:
-            continue
-        # Collapse inner whitespace
-        t = _MULTI_SPACES.sub(" ", t)
-        # If we have a catalog, map by lowercase for consistent canonicalization
-        low = t.lower()
-        if _SUBJECT_CATALOG:
-            if low in _SUBJECT_CATALOG:
-                t_canon = _SUBJECT_CATALOG[low]
-            else:
-                # Not in catalog — keep as typed, but note once
-                t_canon = t
-                warnings.append(f"Unknown subject (kept as-is): {t}")
-        else:
-            # No catalog available — keep token as typed
-            t_canon = t
-        # Deduplicate by lowercase
-        if t_canon.lower() in seen:
-            continue
-        seen.add(t_canon.lower())
-        cleaned.append(t_canon)
-
-    normalized = ", ".join(cleaned)
-
-    # If we changed separators or spacing, add a soft warning for transparency
-    if normalized != original.strip():
-        warnings.append("Separators and spacing normalized.")
-
+from typing import Iterable, Tuple, List, Optional
+
+# Split ONLY on:
+# - semicolons (with optional surrounding spaces)
+# - dashes that have spaces on both sides: " - ", " – ", " — "
+SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
+TRIGGER_RE = re.compile(r";|\s[–—-]\s")  # used to decide whether to normalize at all
+
+def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
+    seen = set()
+    out: List[str] = []
+    for it in items:
+        k = it.casefold()
+        if k not in seen:
+            out.append(it)
+            seen.add(k)
+    return out
+
+def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
+    """
+    Normalize the subject line ONLY when it contains a semicolon or a spaced dash.
+    - Split on ';' or spaced dashes (" - ", " – ", " — ")
+    - Trim each part
+    - Remove empties
+    - De-duplicate (case-insensitive) while preserving order
+    - Join with ', ' as the canonical separator
+    - Keep original casing and wording
+    - If there is no trigger, return the original unchanged.
+
+    Returns: (normalized_value, warnings_list)
+    """
+    warnings: List[str] = []
+    s = (original or "").strip()
+
+    if not s:
+        return s, warnings
+
+    # Only touch lines that clearly use the "bad" separators
+    if not TRIGGER_RE.search(s):
+        # No semicolons or spaced dashes -> leave untouched
+        return s, warnings
+
+    # Split only on our allowed separators
+    parts = [p.strip() for p in SPLIT_RE.split(s)]
+    parts = [p for p in parts if p]  # drop empties
+
+    # Optional allow-list support (not required for this narrow change).
+    # If you pass a set of known multi-word subjects, we simply don't split on commas here anyway.
+    # We already only split on our "bad" separators above.
+    if allowed_multiword:
+        # Nothing special to do here, since we didn't split on commas.
+        pass
+
+    parts = _dedupe_preserve_order(parts)
+
+    # Reassemble to canonical comma-separated format
+    normalized = ", ".join(parts)
+
     return normalized, warnings
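
For reviewers, here is a quick usage sketch of the new, narrower behavior. This is illustrative only, not part of the commit; it assumes the module is importable as core.subject_normalizer.

    from core.subject_normalizer import normalize_subject_field

    # Semicolons and spaced dashes become the canonical ", " separator;
    # duplicates are dropped case-insensitively, keeping the first casing.
    print(normalize_subject_field("History; Geography; history"))
    # -> ("History, Geography", [])

    print(normalize_subject_field("Math - Algebra – Geometry"))
    # -> ("Math, Algebra, Geometry", [])

    # No trigger, no change: unspaced hyphens and slashes now pass through
    # (the old version also split on "/").
    print(normalize_subject_field("Hyphenated-Word Studies"))
    # -> ("Hyphenated-Word Studies", [])
    print(normalize_subject_field("Art / Design"))
    # -> ("Art / Design", [])

Note that the output is a fixed point: a second call finds no semicolon or spaced dash in "History, Geography", so TRIGGER_RE leaves it untouched and the function is idempotent.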
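
If this behavior should be pinned down in CI, a minimal pytest sketch along these lines would cover the edge cases; the test path and cases here are assumptions, not part of this commit.

    # tests/test_subject_normalizer.py (hypothetical location)
    import pytest
    from core.subject_normalizer import normalize_subject_field

    @pytest.mark.parametrize("raw, expected", [
        ("History; Geography; history", "History, Geography"),
        ("Math - Algebra", "Math, Algebra"),
        ("Hyphenated-Word Studies", "Hyphenated-Word Studies"),  # no spaced dash: untouched
        ("Art / Design", "Art / Design"),                        # slashes no longer split
        ("", ""),
        (None, ""),
    ])
    def test_normalize_subject_field(raw, expected):
        value, warnings = normalize_subject_field(raw)
        assert value == expected
        assert warnings == []  # the new implementation never emits warnings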