Illustrations/web/core/source_normalizer.py

253 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/source_normalizer.py
from __future__ import annotations
import re
from typing import List, Tuple, Optional
# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
# Publication name / alias (lower-case) -> WOL short code.
# Insertion order is significant: _extract_pub() walks this mapping in order
# and returns the code of the first alias found in the text.
PUB_MAP = {
    alias: code
    for code, aliases in (
        # Watchtower (both Study/Public)
        ("w", ("watchtower", "the watchtower", "wt", "w")),
        # Awake!
        ("g", ("awake", "awake!", "aw", "g")),
        # Yearbook
        ("yb", ("yearbook", "yb")),
        # Our Kingdom Ministry
        ("km", ("our kingdom ministry", "kingdom ministry", "km")),
    )
    for alias in aliases
}
# Lower-case month name or abbreviation -> month number (1-12).
# Each inner tuple lists every accepted spelling of one month, in calendar
# order, so enumerate(..., start=1) yields the month number.
MONTH_MAP = {
    spelling: number
    for number, spellings in enumerate(
        (
            ("january", "jan"),
            ("february", "feb"),
            ("march", "mar"),
            ("april", "apr"),
            ("may",),
            ("june", "jun"),
            ("july", "jul"),
            ("august", "aug"),
            ("september", "sep", "sept"),
            ("october", "oct"),
            ("november", "nov"),
            ("december", "dec"),
        ),
        start=1,
    )
    for spelling in spellings
}
# Character class matching one dash: en dash, em dash, or ASCII hyphen.
# NOTE: this is already a complete [...] class; do not embed it inside another
# pair of square brackets.
DASH = r"[–—-]"  # en/em/normal hyphen
# Optional run of horizontal whitespace: space, tab, NBSP (U+00A0), thin space
# (U+2009), narrow NBSP (U+202F). The literal space lives inside a character
# class, so the fragment keeps its meaning under re.X.
SPACE = r"[ \t\xa0\u2009\u202F]*"

# Pages (single, list, ranges). We normalize dash to hyphen later.
# The leading (?<![A-Za-z]) stops the label from matching the tail of a word:
# without it the "p" of "Sep 15, 2013" is read as a page label and the date is
# captured as pages "15, 2013".
PAGES_RE = re.compile(
    rf"(?<![A-Za-z])(?P<label>pp?|pages?|pgs?)\.?{SPACE}(?P<pages>\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*)",
    re.I,
)

# Paragraphs "par." or "pars."
# Same left-boundary guard as PAGES_RE so "par" is not picked up from the end
# of a longer word. Group 1 is still the whole "par(s). N" span.
PARS_RE = re.compile(
    rf"(?<![A-Za-z])(pars?\.?{SPACE}(?P<pars>\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*))",
    re.I,
)

# Already-short-coded like: w09 10/1 p.25   g95 1/22 pp.4-6   yb89 p.112
# The month/day part is optional, and the "p."/"pp." and "par."/"pars." labels
# require their period here. A form such as "km95 6 p.4" (bare issue number)
# does NOT match this pattern.
SHORTCODE_RE = re.compile(
    rf"""
    ^{SPACE}
    (?P<code>w|g|yb|km){SPACE}
    (?P<year>\d{{2,4}}){SPACE}
    (?:
        (?P<month>\d{{1,2}})
        /
        (?P<day>\d{{1,2}})
    )?
    (?:{SPACE}(?P<pp>pp?\.{SPACE}\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*))?
    (?:{SPACE}(?P<pars>pars?\.{SPACE}\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*))?
    {SPACE}$
    """,
    re.X | re.I,
)

# Legacy W/G style: W63 5/1 274   G60 10/22 9
# (code, two-digit year, month/day, bare page number; nothing else on the line)
LEGACY_WG_RE = re.compile(
    rf"^{SPACE}(?P<code>[WGwg]){SPACE}(?P<yy>\d{{2}}){SPACE}(?P<md>\d{{1,2}}/\d{{1,2}}){SPACE}(?P<pg>\d+){SPACE}$"
)

# Month name formats like: March 15, 2013   Mar 15 2013   Sept. 15, 2013
# - An optional period is accepted after the month word so dotted
#   abbreviations match.
# - (?!\d) after the day and after the year keeps a bare "March 2013" from
#   being split into day "20" + year "13"; month-only references must not
#   produce a date.
NAMED_DATE_RE = re.compile(
    rf"""
    (?P<month_name>[A-Za-z]+)\.?{SPACE}
    (?P<day>\d{{1,2}})(?!\d){SPACE},?{SPACE}
    (?P<year>\d{{4}}|\d{{2}})(?!\d)
    """,
    re.X,
)

# Numeric date formats like: 6/22/1993 or 7/1/09 (month/day/year order)
NUMERIC_DATE_RE = re.compile(
    rf"(?P<month>\d{{1,2}}){SPACE}/{SPACE}(?P<day>\d{{1,2}}){SPACE}/{SPACE}(?P<year>\d{{2,4}})"
)
# Light cleaner
def _clean(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = re.sub(r"\s+", " ", s).strip()
return s
def _to_yy(year: str) -> str:
y = int(year)
if y >= 100:
return f"{y % 100:02d}"
return f"{y:02d}"
def _norm_pages(label: str, pages: str) -> str:
# normalize dashes and spaces, decide p. vs pp.
p = re.sub(rf"{SPACE}[{DASH},]{SPACE}", lambda m: m.group(0).strip(), pages)
p = p.replace("", "-").replace("", "-").replace(" ", "")
if "-" in p or "," in p:
return f"pp.{p}"
return f"p.{p}"
def _extract_pub(text: str) -> Optional[str]:
    """Return the short code of the first PUB_MAP alias found in *text*, or None.

    Matching is case-insensitive (the text is lower-cased; PUB_MAP keys are
    lower-case) and aliases are tried in PUB_MAP's iteration order, so the
    first alias in the table that occurs anywhere in the text wins.
    """
    lowered = text.lower()
    for alias, code in PUB_MAP.items():
        # re.escape keeps alias text literal. The lookarounds behave like \b
        # for aliases that start and end with a word character, and also
        # work for aliases ending in punctuation (e.g. "awake!"), where a
        # trailing \b could never match before a space or end of text.
        if re.search(rf"(?<!\w){re.escape(alias)}(?!\w)", lowered):
            return code
    return None
def _extract_date(text: str) -> Optional[tuple[int, int, str]]:
    """Find a full issue date in *text*.

    Named-month dates (NAMED_DATE_RE) are tried first, then numeric
    month/day/year dates (NUMERIC_DATE_RE). Within each form, candidates are
    examined left to right and the first plausible one is returned.

    Returns:
        ``(month, day, yy)`` where month and day are ints and ``yy`` is the
        two-digit year string from ``_to_yy``; ``None`` when no candidate has
        a recognized month and a day in 1..31.
    """
    # Named month. Walk every candidate: the pattern accepts any alphabetic
    # word before the digits, so an earlier non-month hit (e.g. "pp 12, 13")
    # must not hide a real date later in the line.
    for m in NAMED_DATE_RE.finditer(text):
        month_name = m.group("month_name").lower().strip(".")
        mm = MONTH_MAP.get(month_name)
        dd = int(m.group("day"))
        if mm is not None and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))
    # Numeric month/day/year. Out-of-range values are rejected rather than
    # guessed at (they may be day/month order or not a date at all).
    for m in NUMERIC_DATE_RE.finditer(text):
        mm = int(m.group("month"))
        dd = int(m.group("day"))
        if 1 <= mm <= 12 and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))
    return None
def _extract_pages(text: str) -> Optional[str]:
    """Return the first page reference in *text* in normalized form, or None if there is none."""
    found = PAGES_RE.search(text)
    if found is None:
        return None
    label, page_text = found.group("label"), found.group("pages")
    return _norm_pages(label, page_text)
def _extract_pars(text: str) -> Optional[str]:
    """Return the first paragraph reference in *text* as ``pars.<numbers>``, or None.

    The number list keeps its commas; en/em dashes become "-" and all
    whitespace is removed. The output label is always "pars.", whether one
    paragraph or several were cited.
    """
    m = PARS_RE.search(text)
    if not m:
        return None
    # Spell the dashes as escapes: an empty-string search argument to
    # str.replace() would insert "-" between every character.
    pars = m.group("pars").replace("\u2013", "-").replace("\u2014", "-")
    # Removing all whitespace also trims the padding around separators.
    pars = re.sub(r"\s+", "", pars)
    return f"pars.{pars}"
def _format_output(code: str, yy: str, mm: int, dd: int, pages: Optional[str], pars: Optional[str]) -> str:
out = f"{code}{yy} {mm}/{dd}"
if pages:
out += f" {pages}"
if pars:
out += f", {pars}"
return out
# ------------------------------------------------------------
# Public API
# ------------------------------------------------------------
def normalize_source_field(text: str) -> tuple[str, List[str]]:
    """
    Try to normalize Watchtower/Awake!/Yearbook/KM citations into WOL short-code.

    Three forms are tried in order:
      1. text that is already a short code (SHORTCODE_RE) - re-emitted with
         normalized year, pages and paragraphs when it carries a month/day;
      2. legacy "W63 5/1 274" lines (LEGACY_WG_RE);
      3. free text containing a known publication alias and a full date.

    If the line is not a JW publication, or if we can't confidently parse
    pub + full date, the whitespace-cleaned text is returned unchanged.

    Args:
        text: Raw source field; None or empty is treated as "".

    Returns:
        (normalized_text, warnings). No code path currently appends a
        warning, so the list is always empty.
    """
    warnings: List[str] = []
    s = _clean(text or "")
    if not s:
        return ("", warnings)

    # 1) Already short-coded?
    m = SHORTCODE_RE.match(s)
    if m:
        mm = m.group("month")
        dd = m.group("day")
        if not (mm and dd):
            # If no full date in the short-code, keep as-is (strict mode).
            return (s, warnings)
        code = m.group("code").lower()
        yy = _to_yy(m.group("year"))
        # Normalize pp/pars spacing/dashes
        pages_norm = None
        pages = m.group("pp")
        if pages:
            pm = re.search(r"pp?\.?\s*(.+)$", pages, re.I)
            if pm:
                pages_norm = _norm_pages("p", pm.group(1))
        pars_norm = None
        pars = m.group("pars")
        if pars:
            par_m = re.search(r"pars?\.?\s*(.+)$", pars, re.I)
            if par_m:
                # Spell the dashes as escapes: an empty-string search
                # argument to str.replace() would insert "-" between
                # every character.
                ptxt = par_m.group(1).replace("\u2013", "-").replace("\u2014", "-")
                ptxt = re.sub(r"\s+", "", ptxt)
                pars_norm = f"pars.{ptxt}"
        out = _format_output(code, yy, int(mm), int(dd), pages_norm, pars_norm)
        return (out, warnings)

    # 2) Legacy W/G lines like "W63 5/1 274"
    m = LEGACY_WG_RE.match(s)
    if m:
        code = m.group("code").lower()  # pattern only admits W/G in either case
        yy = m.group("yy")
        try:
            mm, dd = [int(x) for x in m.group("md").split("/")]
        except ValueError:
            # The pattern guarantees digit/digit; kept as a defensive
            # pass-through rather than a crash.
            return (s, warnings)
        pages = _norm_pages("p", m.group("pg"))
        return (_format_output(code, yy, mm, dd, pages, None), warnings)

    # 3) Otherwise, must detect publication + full date; else pass-through.
    pub = _extract_pub(s)
    if not pub:
        return (s, warnings)
    date_tuple = _extract_date(s)
    if not date_tuple:
        # Month-only references remain unchanged in strict mode.
        return (s, warnings)
    mm, dd, yy = date_tuple
    out = _format_output(pub, yy, mm, dd, _extract_pages(s), _extract_pars(s))
    return (out, warnings)