Update web/core/subject_normalizer.py
This commit is contained in:
parent
a007922ead
commit
8de9020545
@ -1,105 +1,61 @@
|
|||||||
# core/subject_normalizer.py
|
# core/subject_normalizer.py
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple, Dict, Set
|
from typing import Iterable, Tuple, List, Optional
|
||||||
from django.conf import settings
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Where we'll try to load the canonical subject catalog from.
|
# Split ONLY on:
|
||||||
# Put your subjects.txt at your project root (same level as manage.py).
|
# - semicolons (with optional surrounding spaces)
|
||||||
_CATALOG_LOCATIONS = [
|
# - dashes that have spaces on both sides: " - ", " – ", " — "
|
||||||
Path(getattr(settings, "BASE_DIR", ".")) / "subjects.txt",
|
SPLIT_RE = re.compile(r"\s*;\s*|\s+[–—-]\s+")
|
||||||
Path(__file__).resolve().parent / "subjects.txt", # optional fallback
|
TRIGGER_RE = re.compile(r";|\s[–—-]\s") # used to decide whether to normalize at all
|
||||||
]
|
|
||||||
|
|
||||||
def _load_subject_catalog() -> Dict[str, str]:
|
def _dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Return *items* without case-insensitive duplicates, keeping first-seen order.

    Two strings count as duplicates when their ``str.casefold()`` forms are
    equal. The first occurrence is kept with its original casing; later
    occurrences are dropped.

    Args:
        items: Strings to filter. Any iterable is accepted and is consumed once.

    Returns:
        A new list with the surviving strings in their original order.
    """
    seen = set()  # casefolded keys already emitted
    out: List[str] = []
    for it in items:
        # casefold() rather than lower() so that e.g. "Straße" matches "STRASSE".
        k = it.casefold()
        if k not in seen:
            out.append(it)
            seen.add(k)
    return out
|
||||||
|
|
||||||
|
def normalize_subject_field(original: Optional[str], allowed_multiword: Optional[Iterable[str]] = None) -> Tuple[str, List[str]]:
|
||||||
"""
|
"""
|
||||||
Load canonical subjects from subjects.txt, one per line.
|
Normalize the subject line ONLY when it contains a semicolon or a spaced dash.
|
||||||
Returns a map of lowercase -> canonical (original) string.
|
- Split on ';' or spaced dashes (" - ", " – ", " — ")
|
||||||
If no file found, returns empty map (safe no-op behavior).
|
- Trim each part
|
||||||
"""
|
- Remove empties
|
||||||
for loc in _CATALOG_LOCATIONS:
|
- De-duplicate (case-insensitive) while preserving order
|
||||||
try:
|
- Join with ', ' as the canonical separator
|
||||||
if loc.exists():
|
- Keep original casing and wording
|
||||||
lines = [ln.strip() for ln in loc.read_text(encoding="utf-8").splitlines()]
|
- If there is no trigger, return the input unchanged apart from stripping leading/trailing whitespace.
|
||||||
catalog = {ln.lower(): ln for ln in lines if ln}
|
|
||||||
return catalog
|
|
||||||
except Exception:
|
|
||||||
# Fail open: just continue to next location
|
|
||||||
pass
|
|
||||||
return {}
|
|
||||||
|
|
||||||
_SUBJECT_CATALOG: Dict[str, str] = _load_subject_catalog()
|
Returns: (normalized_value, warnings_list)
|
||||||
|
|
||||||
_SEP_DASH = re.compile(r"\s+[-–—]\s+") # only split dashes when surrounded by spaces
|
|
||||||
_SEP_SEMI = re.compile(r"\s*;\s*")
|
|
||||||
_SEP_SLASH = re.compile(r"\s*/\s*")
|
|
||||||
_MULTI_SPACES = re.compile(r"\s+")
|
|
||||||
|
|
||||||
def normalize_subject_field(raw: str) -> Tuple[str, List[str]]:
|
|
||||||
"""
|
|
||||||
Convert messy subject separators to commas, lightly tidy tokens,
|
|
||||||
dedupe while preserving order, and map tokens to a canonical form
|
|
||||||
using subjects.txt when available.
|
|
||||||
|
|
||||||
Returns: (normalized_subjects_string, warnings_list)
|
|
||||||
"""
|
"""
|
||||||
warnings: List[str] = []
|
warnings: List[str] = []
|
||||||
if not raw:
|
s = (original or "").strip()
|
||||||
return "", warnings
|
|
||||||
|
|
||||||
original = raw
|
if not s:
|
||||||
|
return s, warnings
|
||||||
|
|
||||||
s = raw.strip()
|
# Only touch lines that clearly use the "bad" separators
|
||||||
|
if not TRIGGER_RE.search(s):
|
||||||
|
# No semicolons or spaced dashes -> leave untouched
|
||||||
|
return s, warnings
|
||||||
|
|
||||||
# Normalize common separators to commas.
|
# Split only on our allowed separators
|
||||||
# • semicolons: ; -> ,
|
parts = [p.strip() for p in SPLIT_RE.split(s)]
|
||||||
# • slashes: / -> ,
|
parts = [p for p in parts if p] # drop empties
|
||||||
# • spaced dashes ( - / – / — with spaces around): -> ,
|
|
||||||
s = _SEP_SEMI.sub(", ", s)
|
|
||||||
s = _SEP_SLASH.sub(", ", s)
|
|
||||||
s = _SEP_DASH.sub(", ", s)
|
|
||||||
|
|
||||||
# Also normalize stray multiple commas/spaces into single commas with space
|
# Optional allow-list support (not required for this narrow change).
|
||||||
# (we'll split on comma anyway, but this helps avoid empty tokens)
|
# Passing a set of known multi-word subjects currently has no effect: commas are never split on here.
|
||||||
s = re.sub(r"\s*,\s*", ",", s)
|
# We already only split on our "bad" separators above.
|
||||||
|
if allowed_multiword:
|
||||||
|
# Nothing special to do here, since we didn't split on commas.
|
||||||
|
pass
|
||||||
|
|
||||||
# Split by comma
|
parts = _dedupe_preserve_order(parts)
|
||||||
parts = [p for p in s.split(",") if p is not None]
|
|
||||||
|
|
||||||
cleaned: List[str] = []
|
# Reassemble to canonical comma-separated format
|
||||||
seen: Set[str] = set()
|
normalized = ", ".join(parts)
|
||||||
|
|
||||||
for p in parts:
|
|
||||||
t = p.strip()
|
|
||||||
if not t:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Collapse inner whitespace
|
|
||||||
t = _MULTI_SPACES.sub(" ", t)
|
|
||||||
|
|
||||||
# If we have a catalog, map by lowercase for consistent canonicalization
|
|
||||||
low = t.lower()
|
|
||||||
if _SUBJECT_CATALOG:
|
|
||||||
if low in _SUBJECT_CATALOG:
|
|
||||||
t_canon = _SUBJECT_CATALOG[low]
|
|
||||||
else:
|
|
||||||
# Not in catalog — keep as typed, but note once
|
|
||||||
t_canon = t
|
|
||||||
warnings.append(f"Unknown subject (kept as-is): {t}")
|
|
||||||
else:
|
|
||||||
# No catalog available — keep token as typed
|
|
||||||
t_canon = t
|
|
||||||
|
|
||||||
# Deduplicate by lowercase
|
|
||||||
if t_canon.lower() in seen:
|
|
||||||
continue
|
|
||||||
seen.add(t_canon.lower())
|
|
||||||
cleaned.append(t_canon)
|
|
||||||
|
|
||||||
normalized = ", ".join(cleaned)
|
|
||||||
|
|
||||||
# If we changed separators or spacing, add a soft warning for transparency
|
|
||||||
if normalized != original.strip():
|
|
||||||
warnings.append("Separators and spacing normalized.")
|
|
||||||
|
|
||||||
return normalized, warnings
|
return normalized, warnings
|
||||||
Loading…
Reference in New Issue
Block a user