# core/subject_normalizer.py
import re
from pathlib import Path
from typing import Dict, List, Set, Tuple

from django.conf import settings

# Where we'll try to load the canonical subject catalog from.
# Put your subjects.txt at your project root (same level as manage.py).
_CATALOG_LOCATIONS = [
    Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
    Path(__file__).resolve().parent / "subjects.txt",  # optional fallback
]


def _load_subject_catalog() -> Dict[str, str]:
    """
    Load canonical subjects from subjects.txt, one per line.
    Returns a map of lowercase -> canonical (original) string.
    If no file is found, returns an empty map (safe no-op behavior).
    """
    for loc in _CATALOG_LOCATIONS:
        try:
            if loc.exists():
                lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
                return {ln.lower(): ln for ln in lines if ln}
        except Exception:
            # Fail open: just continue to the next location
            continue
    return {}


_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
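
# Illustrative example (hypothetical file contents, not shipped with this
# module): a subjects.txt containing the lines
#     Mathematics
#     Computer Science
# would load as {"mathematics": "Mathematics",
#                "computer science": "Computer Science"}.
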
_SEP_DASH = re.compile(r"\s+[-–—]\s+") # only split dashes when surrounded by spaces
_SEP_SEMI = re.compile(r"\s*;\s*")
_SEP_SLASH = re.compile(r"\s*/\s*")
_MULTI_SPACES = re.compile(r"\s+")
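
# Quick illustration of the patterns above (assumed behavior, shown for
# clarity): _SEP_DASH only matches dashes with surrounding spaces, so
# hyphenated names survive intact.
#     _SEP_SEMI.sub(", ", "Math;Science")     -> "Math, Science"
#     _SEP_SLASH.sub(", ", "Math / Science")  -> "Math, Science"
#     _SEP_DASH.sub(", ", "Math - Science")   -> "Math, Science"
#     _SEP_DASH leaves "Graeco-Roman Studies" unchanged (no spaced dash).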


def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
    """
    Convert messy subject separators to commas, lightly tidy tokens,
    dedupe while preserving order, and map tokens to a canonical form
    using subjects.txt when available.

    Returns: (normalized_subjects_string, warnings_list)
    """
    warnings: List[str] = []
    if not raw:
        return "", warnings

    original = raw
    s = raw.strip()

    # Normalize common separators to commas:
    # • semicolons: ; -> ,
    # • slashes: / -> ,
    # • spaced dashes (- / – / — with spaces around them) -> ,
    s = _SEP_SEMI.sub(", ", s)
    s = _SEP_SLASH.sub(", ", s)
    s = _SEP_DASH.sub(", ", s)

    # Strip whitespace around commas so splitting below yields clean tokens.
    # (Repeated commas still produce empty tokens; those are dropped in the
    # loop below.)
    s = re.sub(r"\s*,\s*", ",", s)
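    # Illustrative state at this point (assuming the substitutions above):
    # "Math; Science / Tech - History" has become "Math,Science,Tech,History".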

    # Split by comma
    parts = s.split(",")

    cleaned: List[str] = []
    seen: Set[str] = set()
    for p in parts:
        t = p.strip()
        if not t:
            continue
        # Collapse inner whitespace
        t = _MULTI_SPACES.sub(" ", t)
        low = t.lower()

        # Deduplicate by lowercase; doing this first also ensures a repeated
        # unknown token is warned about only once.
        if low in seen:
            continue
        seen.add(low)

        # If we have a catalog, map by lowercase for consistent
        # canonicalization; otherwise keep the token as typed.
        if _SUBJECT_CATALOG:
            if low in _SUBJECT_CATALOG:
                t = _SUBJECT_CATALOG[low]
            else:
                # Not in catalog: keep as typed, but note it
                warnings.append(f"Unknown subject (kept as-is): {t}")
        cleaned.append(t)
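
    # Example (illustrative, assuming no catalog is loaded): the tokens
    # ["Math", "math ", "", "Art"] reduce to cleaned == ["Math", "Art"].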

    normalized = ", ".join(cleaned)

    # If we changed separators or spacing, add a soft warning for transparency
    if normalized != original.strip():
        warnings.append("Separators and spacing normalized.")

    return normalized, warnings
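

# Minimal usage sketch, not part of the original module. Running it requires
# a configured Django settings module (DJANGO_SETTINGS_MODULE), since the
# catalog path is derived from settings.BASE_DIR at import time; the sample
# inputs below are hypothetical.
if __name__ == "__main__":
    samples = [
        "Math; Science / History - Art",
        "math, Math,  MATH",
        "",
    ]
    for sample in samples:
        normalized, warns = normalize_subject_field(sample)
        print(f"{sample!r} -> {normalized!r}  warnings={warns}")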