# core/subject_normalizer.py
import re
from pathlib import Path
from typing import Dict, List, Set, Tuple

from django.conf import settings

# Where we'll try to load the canonical subject catalog from.
# Put your subjects.txt at your project root (same level as manage.py).
# Candidate paths for subjects.txt, probed in order; first readable file wins.
_CATALOG_LOCATIONS = [
    # Project root (same level as manage.py); falls back to CWD when
    # settings.BASE_DIR is not configured.
    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
]


def _load_subject_catalog() -> Dict[str, str]:
|
||
"""
|
||
Load canonical subjects from subjects.txt, one per line.
|
||
Returns a map of lowercase -> canonical (original) string.
|
||
If no file found, returns empty map (safe no-op behavior).
|
||
"""
|
||
for loc in _CATALOG_LOCATIONS:
|
||
try:
|
||
if loc.exists():
|
||
lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
|
||
catalog = {ln.lower(): ln for ln in lines if ln}
|
||
return catalog
|
||
except Exception:
|
||
# Fail open: just continue to next location
|
||
pass
|
||
return {}


# Catalog loaded once at import time; an empty dict disables canonical
# mapping (tokens are then kept exactly as typed).
_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()

# Separator patterns, compiled once at module load.
_SEP_DASH = re.compile(r"\s+[-–—]\s+")  # only split dashes when surrounded by spaces
_SEP_SEMI = re.compile(r"\s*;\s*")  # semicolon, with optional surrounding whitespace
_SEP_SLASH = re.compile(r"\s*/\s*")  # slash, with optional surrounding whitespace
_MULTI_SPACES = re.compile(r"\s+")  # any run of whitespace, for collapsing


def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
    """
    Normalize a messy free-text subject field into ``"A, B, C"`` form.

    Converts common separators (semicolons, slashes, *spaced* dashes) to
    commas, collapses inner whitespace, dedupes tokens case-insensitively
    while preserving first-seen order, and maps each token to its
    canonical spelling via subjects.txt when a catalog is available.

    Args:
        raw: The raw subject string as entered; falsy input is a no-op.

    Returns:
        Tuple of (normalized_subjects_string, warnings_list). Warnings
        flag unknown subjects (once per distinct token) and note when
        separators/spacing were rewritten.
    """
    warnings: List[str] = []
    if not raw:
        return "", warnings

    original = raw
    s = raw.strip()

    # Normalize common separators to commas:
    #   ;  -> ,     /  -> ,     spaced - / – / — -> ,
    # Unspaced dashes are deliberately left alone so hyphenated subjects
    # ("Anglo-Saxon Studies") survive intact.
    s = _SEP_SEMI.sub(", ", s)
    s = _SEP_SLASH.sub(", ", s)
    s = _SEP_DASH.sub(", ", s)

    # Tighten whitespace around commas so splitting yields no padded or
    # empty-looking tokens.
    s = re.sub(r"\s*,\s*", ",", s)

    cleaned: List[str] = []
    seen: Set[str] = set()

    for part in s.split(","):
        token = part.strip()
        if not token:
            continue

        # Collapse inner runs of whitespace to a single space.
        token = _MULTI_SPACES.sub(" ", token)
        low = token.lower()

        # Deduplicate case-insensitively, keeping the first occurrence.
        # This check runs BEFORE the unknown-subject warning so a repeated
        # unknown token is reported only once (the original appended a
        # duplicate warning per repeat). The catalog keys are lowercase of
        # their canonical values, so deduping on `low` matches deduping on
        # the canonical form.
        if low in seen:
            continue
        seen.add(low)

        if _SUBJECT_CATALOG:
            if low in _SUBJECT_CATALOG:
                # Map to the canonical spelling from subjects.txt.
                token = _SUBJECT_CATALOG[low]
            else:
                # Not in catalog — keep as typed, but note it once.
                warnings.append(f"Unknown subject (kept as-is): {token}")
        # No catalog available — keep token exactly as typed.

        cleaned.append(token)

    normalized = ", ".join(cleaned)

    # Soft transparency note whenever the field was rewritten at all
    # (separator changes, spacing, canonicalization, or dedupe).
    if normalized != original.strip():
        warnings.append("Separators and spacing normalized.")

    return normalized, warnings