Add web/core/source_normalizer.py

This commit is contained in:
Joshua Laymon 2025-08-14 19:30:28 +00:00
parent b98a63a4d5
commit 389f53a97c

View File

@ -0,0 +1,253 @@
# core/source_normalizer.py
from __future__ import annotations
import re
from typing import List, Tuple, Optional
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
# Lowercase publication names and aliases, each mapped to its WOL short code.
# Key order is significant: _extract_pub() walks this dict in insertion order
# and returns the code of the first alias it finds in the text.
PUB_MAP = {
    # Watchtower (both Study/Public)
    "watchtower": "w", "the watchtower": "w", "wt": "w", "w": "w",
    # Awake!
    "awake": "g", "awake!": "g", "aw": "g", "g": "g",
    # Yearbook
    "yearbook": "yb", "yb": "yb",
    # Our Kingdom Ministry
    "our kingdom ministry": "km", "kingdom ministry": "km", "km": "km",
}
# Lowercase English month names and their common abbreviations, each mapped
# to the month number (1-12).
MONTH_MAP = {
    "january": 1,
    "jan": 1,
    "february": 2,
    "feb": 2,
    "march": 3,
    "mar": 3,
    "april": 4,
    "apr": 4,
    "may": 5,
    "june": 6,
    "jun": 6,
    "july": 7,
    "jul": 7,
    "august": 8,
    "aug": 8,
    "september": 9,
    "sep": 9,
    "sept": 9,
    "october": 10,
    "oct": 10,
    "november": 11,
    "nov": 11,
    "december": 12,
    "dec": 12,
}
# Character class for a single dash: en dash, em dash, or ASCII hyphen.
DASH = r"[–—-]"  # en/em/normal hyphen
# Zero or more horizontal blanks: space, tab, NBSP, thin space, narrow NBSP.
SPACE = r"[ \t\xa0\u2009\u202F]*"

# Pages (single, list, ranges). We normalize dash to hyphen later.
# The lookbehind stops the label from starting in the middle of a word;
# without it the "p" that ends "Sep" in "Sep 15, 2013" is taken as a page
# label and the date is swallowed as the page list "15, 2013".
PAGES_RE = re.compile(
    rf"(?<![A-Za-z])(?P<label>pp?|pages?|pgs?)\.?{SPACE}(?P<pages>\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*)",
    re.I,
)

# Paragraphs “par.” or “pars.” (same mid-word guard as PAGES_RE).
PARS_RE = re.compile(
    rf"(?<![A-Za-z])(pars?\.?{SPACE}(?P<pars>\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*))",
    re.I,
)

# Already-short-coded like: w09 10/1 p.25   g95 1/22 pp.4-6   yb89 p.112
# The month/day part is optional here; callers decide what to do when it is
# absent. Verbose mode ignores the layout whitespace below, but not the
# blanks inside the SPACE character class.
SHORTCODE_RE = re.compile(
    rf"""
    ^{SPACE}
    (?P<code>w|g|yb|km){SPACE}
    (?P<year>\d{{2,4}}){SPACE}
    (?:
        (?P<month>\d{{1,2}})
        /
        (?P<day>\d{{1,2}})
    )?
    (?:{SPACE}(?P<pp>pp?\.{SPACE}\d+(?:{SPACE}[,\-]{SPACE}\d+)*))?
    (?:{SPACE}(?P<pars>pars?\.{SPACE}\d+(?:{SPACE}[,\-]{SPACE}\d+)*))?
    {SPACE}$
    """,
    re.X | re.I,
)

# Legacy W/G style: W63 5/1 274   G60 10/22 9
# (code, two-digit year, month/day, bare page number; whole string must match)
LEGACY_WG_RE = re.compile(
    rf"^{SPACE}(?P<code>[WGwg]){SPACE}(?P<yy>\d{{2}}){SPACE}(?P<md>\d{{1,2}}/\d{{1,2}}){SPACE}(?P<pg>\d+){SPACE}$"
)

# Month name formats like: March 15, 2013   Mar 15 2013   Mar. 15, 2013
#  - the name needs at least three letters (every MONTH_MAP key has >= 3), so
#    labels such as "pp." in "pp. 5, 12" are not date candidates;
#  - an optional "." after the name accepts abbreviations like "Sept.";
#  - the day must be followed by a blank or comma, so a month-only reference
#    such as "March 2013" is NOT split into day "20" + year "13";
#  - the year is exactly 4 or 2 digits and must not run into more digits.
NAMED_DATE_RE = re.compile(
    rf"""
    (?P<month_name>[A-Za-z]{{3,}})\.?{SPACE}
    (?P<day>\d{{1,2}})(?=[\s,]){SPACE},?{SPACE}
    (?P<year>\d{{4}}|\d{{2}})(?!\d)
    """,
    re.X,
)

# Numeric date formats like: 6/22/1993 or 7/1/09
# Digit guards on both ends keep the pattern from matching inside a longer
# number (e.g. "123/4/05"); the year is exactly 4 or 2 digits.
NUMERIC_DATE_RE = re.compile(
    rf"(?<!\d)(?P<month>\d{{1,2}}){SPACE}/{SPACE}(?P<day>\d{{1,2}}){SPACE}/{SPACE}(?P<year>\d{{4}}|\d{{2}})(?!\d)"
)
# Light cleaner
def _clean(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = re.sub(r"\s+", " ", s).strip()
return s
def _to_yy(year: str) -> str:
y = int(year)
if y >= 100:
return f"{y % 100:02d}"
return f"{y:02d}"
def _norm_pages(label: str, pages: str) -> str:
# normalize dashes and spaces, decide p. vs pp.
p = re.sub(rf"{SPACE}[{DASH},]{SPACE}", lambda m: m.group(0).strip(), pages)
p = p.replace("", "-").replace("", "-").replace(" ", "")
if "-" in p or "," in p:
return f"pp.{p}"
return f"p.{p}"
def _extract_pub(text: str) -> Optional[str]:
    """Return the WOL code of the first PUB_MAP alias found in *text*.

    Matching is case-insensitive (the text is lowercased) and each alias must
    sit between word boundaries. Aliases are tried in PUB_MAP order; None is
    returned when none of them occurs.
    """
    lowered = text.lower()
    hits = (
        code
        for alias, code in PUB_MAP.items()
        if re.search(rf"\b{alias}\b", lowered)
    )
    return next(hits, None)
def _extract_date(text: str) -> Optional[tuple[int, int, str]]:
    """Find the first plausible full date in *text*.

    Named-month dates ("March 15, 2013") are preferred over numeric ones
    ("3/15/2013"). Every candidate is checked, not just the first regex hit,
    so an earlier word-plus-numbers fragment that is not a month does not
    hide a real date later in the string.

    Returns:
        ``(month, day, yy)`` with ``month`` in 1-12, ``day`` in 1-31 and
        ``yy`` the two-digit year string, or None when no candidate passes
        those range checks.
    """
    # Named month
    for m in NAMED_DATE_RE.finditer(text):
        month_name = m.group("month_name").lower().strip(".")
        mm = MONTH_MAP.get(month_name)
        dd = int(m.group("day"))
        if mm is not None and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))

    # Numeric month/day/year
    for m in NUMERIC_DATE_RE.finditer(text):
        mm = int(m.group("month"))
        dd = int(m.group("day"))
        # Reject impossible values (e.g. "13/45/2009") rather than emitting
        # a short code that points at a nonexistent issue.
        if 1 <= mm <= 12 and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))

    return None
def _extract_pages(text: str) -> Optional[str]:
    """Return the first page reference in *text*, normalized by
    _norm_pages(), or None when PAGES_RE finds nothing."""
    found = PAGES_RE.search(text)
    if found is None:
        return None
    label, pages = found.group("label"), found.group("pages")
    return _norm_pages(label, pages)
def _extract_pars(text: str) -> Optional[str]:
    """Return the first paragraph reference in *text* as ``pars.<list>``.

    The paragraph list is compacted: all whitespace is removed and en/em
    dashes become ASCII hyphens (e.g. "pars. 3 – 5" -> "pars.3-5").
    Returns None when PARS_RE finds nothing.
    """
    m = PARS_RE.search(text)
    if not m:
        return None
    # Remove every blank, including NBSP / thin spaces (covered by \s).
    pars = re.sub(r"\s+", "", m.group("pars"))
    # En dash (U+2013) / em dash (U+2014) -> "-". Written as escapes so the
    # search string can never degrade to "", which str.replace() would treat
    # as "insert between every character".
    pars = re.sub("[\u2013\u2014]", "-", pars)
    return f"pars.{pars}"
def _format_output(code: str, yy: str, mm: int, dd: int, pages: Optional[str], pars: Optional[str]) -> str:
out = f"{code}{yy} {mm}/{dd}"
if pages:
out += f" {pages}"
if pars:
out += f", {pars}"
return out
# ------------------------------------------------------------
# Public API
# ------------------------------------------------------------
def normalize_source_field(text: str) -> tuple[str, List[str]]:
    """
    Try to normalize Watchtower/Awake!/Yearbook/KM citations into WOL short-code.

    Three forms are recognized, in this order:
      1. an existing short code ("w09 10/1 p.25") - re-emitted in canonical
         spacing when it carries a month/day, otherwise returned as cleaned;
      2. the legacy W/G form ("W63 5/1 274");
      3. free text naming a publication plus a full date, optionally with
         pages and paragraphs.

    If the line is not a JW publication, or if we can't confidently parse
    pub + full date, the whitespace-cleaned text is returned unchanged
    (strict mode). Empty or None input yields "".

    Returns:
        (normalized_text, warnings). The warnings list is currently always
        empty; it is kept in the return shape for callers.
    """
    warnings: List[str] = []
    s = _clean(text or "")
    if not s:
        return ("", warnings)

    # 1) Already short-coded?
    m = SHORTCODE_RE.match(s)
    if m:
        mm = m.group("month")
        dd = m.group("day")
        if not (mm and dd):
            # If no full date in the short-code, keep as-is (strict mode).
            return (s, warnings)

        code = m.group("code").lower()
        yy = _to_yy(m.group("year"))

        # Normalize pp/pars spacing/dashes
        pages_norm = None
        pages = m.group("pp")
        if pages:
            pm = re.search(r"pp?\.?\s*(.+)$", pages, re.I)
            if pm:
                pages_norm = _norm_pages("p", pm.group(1))

        pars_norm = None
        pars = m.group("pars")
        if pars:
            par_m = re.search(r"pars?\.?\s*(.+)$", pars, re.I)
            if par_m:
                ptxt = re.sub(r"\s+", "", par_m.group(1))
                # En/em dash -> hyphen, spelled as escapes: an empty search
                # string in str.replace() would instead insert "-" between
                # every character of the paragraph list.
                ptxt = re.sub("[\u2013\u2014]", "-", ptxt)
                pars_norm = f"pars.{ptxt}"

        out = _format_output(code, yy, int(mm), int(dd), pages_norm, pars_norm)
        return (out, warnings)

    # 2) Legacy W/G lines like "W63 5/1 274"
    m = LEGACY_WG_RE.match(s)
    if m:
        code = "w" if m.group("code").lower() == "w" else "g"
        yy = m.group("yy")
        try:
            mm, dd = (int(part) for part in m.group("md").split("/"))
        except ValueError:
            # The pattern already guarantees "digits/digits"; this is only a
            # best-effort guard, so fall back to pass-through.
            return (s, warnings)
        pages = _norm_pages("p", m.group("pg"))
        return (_format_output(code, yy, mm, dd, pages, None), warnings)

    # 3) Otherwise, must detect publication + full date; else pass-through.
    pub = _extract_pub(s)
    if not pub:
        return (s, warnings)

    date_tuple = _extract_date(s)
    if not date_tuple:
        # Month-only references remain unchanged in strict mode.
        return (s, warnings)

    mm, dd, yy = date_tuple
    out = _format_output(pub, yy, mm, dd, _extract_pages(s), _extract_pars(s))
    return (out, warnings)