Add web/core/source_normalizer.py
This commit is contained in:
parent
b98a63a4d5
commit
389f53a97c
253
web/core/source_normalizer.py
Normal file
253
web/core/source_normalizer.py
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
# core/source_normalizer.py
|
||||||
|
from __future__ import annotations
|
||||||
|
import re
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
# Publication names and aliases (lower-case) -> WOL short code.
# Aliases are grouped per code; the resulting dict keeps the listed order,
# which is the order in which lookups that iterate the table will try them.
PUB_MAP = {
    alias: code
    for code, aliases in (
        # Watchtower (both Study/Public)
        ("w", ("watchtower", "the watchtower", "wt", "w")),
        # Awake!
        ("g", ("awake", "awake!", "aw", "g")),
        # Yearbook
        ("yb", ("yearbook", "yb")),
        # Our Kingdom Ministry
        ("km", ("our kingdom ministry", "kingdom ministry", "km")),
    )
    for alias in aliases
}
|
||||||
|
|
||||||
|
# Month names and abbreviations (lower-case) -> month number (1-12).
# Each inner tuple lists the spellings of one month; its position gives the number.
MONTH_MAP = {
    spelling: number
    for number, spellings in enumerate(
        (
            ("january", "jan"),
            ("february", "feb"),
            ("march", "mar"),
            ("april", "apr"),
            ("may",),
            ("june", "jun"),
            ("july", "jul"),
            ("august", "aug"),
            ("september", "sep", "sept"),
            ("october", "oct"),
            ("november", "nov"),
            ("december", "dec"),
        ),
        start=1,
    )
    for spelling in spellings
}
|
||||||
|
|
||||||
|
# Dash characters accepted between numbers: en dash, em dash, ASCII hyphen.
DASH = r"[–—-]"
# Optional run of horizontal whitespace: space, tab, no-break space,
# thin space, narrow no-break space.
SPACE = r"[ \t\xa0\u2009\u202F]*"
# Same character set as SPACE, but at least one character is required.
_SPACE1 = r"[ \t\xa0\u2009\u202F]+"
# One number, or several joined by commas / dashes: "25", "4-6", "3, 5, 7–9".
_NUM_LIST = rf"\d+(?:{SPACE}[,\-–—]{SPACE}\d+)*"

# Pages (single, list, ranges). Dashes are normalized to hyphens later.
# The look-behind stops the label from matching the tail of a longer word:
# without it the "p" of "Sep 15, 2013" is read as a page label and the date
# is captured as the page list "15, 2013".
PAGES_RE = re.compile(
    rf"(?<![A-Za-z])(?P<label>pp?|pages?|pgs?)\.?{SPACE}(?P<pages>{_NUM_LIST})",
    re.I,
)

# Paragraphs: "par." or "pars." followed by a number list.
# Same look-behind as PAGES_RE so that "par" inside another word is ignored.
PARS_RE = re.compile(
    rf"(?<![A-Za-z])(pars?\.?{SPACE}(?P<pars>{_NUM_LIST}))",
    re.I,
)

# Already-short-coded like:  w09 10/1 p.25   g95 1/22 pp.4-6   yb89 p.112   km95 6 p.4
# Verbose mode: whitespace in the pattern is ignored except inside the
# character classes that SPACE / _NUM_LIST expand to.
SHORTCODE_RE = re.compile(
    rf"""
    ^{SPACE}
    (?P<code>w|g|yb|km){SPACE}
    (?P<year>\d{{2,4}}){SPACE}
    (?:
        (?P<month>\d{{1,2}})
        /
        (?P<day>\d{{1,2}})
    )?
    (?:{SPACE}(?P<pp>pp?\.{SPACE}{_NUM_LIST}))?
    (?:{SPACE}(?P<pars>pars?\.{SPACE}{_NUM_LIST}))?
    {SPACE}$
    """,
    re.X | re.I,
)

# Legacy W/G style:  W63 5/1 274   G60 10/22 9
LEGACY_WG_RE = re.compile(
    rf"^{SPACE}(?P<code>[WGwg]){SPACE}(?P<yy>\d{{2}}){SPACE}(?P<md>\d{{1,2}}/\d{{1,2}}){SPACE}(?P<pg>\d+){SPACE}$"
)

# Month name formats like:  March 15, 2013   Mar 15 2013   Mar. 15, 2013
# - An optional "." after the month name allows dotted abbreviations.
# - A comma and/or whitespace is REQUIRED between day and year; otherwise a
#   month-only reference such as "March 2013" is split into day "20" and
#   year "13".
# - The year must not be followed by another digit, so it is a whole number.
NAMED_DATE_RE = re.compile(
    rf"""
    (?P<month_name>[A-Za-z]+)\.?{SPACE}
    (?P<day>\d{{1,2}})
    (?:{SPACE},{SPACE}|{_SPACE1})
    (?P<year>\d{{4}}|\d{{2}})
    (?!\d)
    """,
    re.X,
)

# Numeric date formats like:  6/22/1993  or  7/1/09
NUMERIC_DATE_RE = re.compile(
    rf"(?P<month>\d{{1,2}}){SPACE}/{SPACE}(?P<day>\d{{1,2}}){SPACE}/{SPACE}(?P<year>\d{{2,4}})"
)
|
||||||
|
|
||||||
|
# Light cleaner
def _clean(s: str) -> str:
    """Return *s* with whitespace runs collapsed to one space and the ends trimmed.

    No-break, thin and narrow no-break spaces are first turned into plain
    spaces, then every run of whitespace becomes a single space.
    """
    exotic_spaces = {0x00A0: " ", 0x2009: " ", 0x202F: " "}
    collapsed = re.sub(r"\s+", " ", s.translate(exotic_spaces))
    return collapsed.strip()
|
||||||
|
|
||||||
|
def _to_yy(year: str) -> str:
    """Return the year as a two-digit, zero-padded string.

    *year* is a string of digits. Values of 100 or more are reduced to their
    last two digits; smaller values are only zero-padded.
    """
    number = int(year)
    two_digit = number % 100 if number >= 100 else number
    return f"{two_digit:02d}"
|
||||||
|
|
||||||
|
def _norm_pages(label: str, pages: str) -> str:
    """Return a compact page reference: ``p.N`` or ``pp.N-M`` / ``pp.N,M``.

    Args:
        label: The page label found in the source text. It is accepted for
            call compatibility but does not affect the result; the prefix is
            chosen from the content of *pages*.
        pages: A page number, range or list, possibly containing en/em dashes
            and whitespace around the separators.

    Returns:
        ``"pp."`` followed by the compacted list when it contains a hyphen or
        a comma, otherwise ``"p."`` followed by the single number.
    """
    # En and em dashes become plain hyphens.
    compact = re.sub(r"[–—]", "-", pages)
    # Drop every kind of whitespace (including tab / no-break space), the same
    # way paragraph lists are compacted. The previous separator pattern
    # expanded to the malformed class "[[–—-],]" and never matched, so only
    # ASCII spaces were being removed.
    compact = re.sub(r"\s+", "", compact)
    prefix = "pp." if ("-" in compact or "," in compact) else "p."
    return f"{prefix}{compact}"
|
||||||
|
|
||||||
|
def _extract_pub(text: str) -> Optional[str]:
    """Return the WOL code of the first PUB_MAP alias found in *text*.

    Aliases are tried in PUB_MAP order against the lower-cased text and must
    stand alone, i.e. not be directly preceded or followed by a word
    character. Returns ``None`` when no alias is present.
    """
    t = text.lower()
    for alias, code in PUB_MAP.items():
        # re.escape keeps alias punctuation literal, and the look-arounds
        # (unlike \b) also anchor aliases that end in a non-word character
        # such as "awake!".
        if re.search(rf"(?<!\w){re.escape(alias)}(?!\w)", t):
            return code
    return None
|
||||||
|
|
||||||
|
def _extract_date(text: str) -> Optional[tuple[int, int, str]]:
    """Find the first plausible full date in *text*.

    Named-month dates ("March 15, 2013") are tried first, then numeric
    month/day/year dates ("6/22/1993").

    Returns:
        ``(month, day, yy)`` with *month* and *day* as ints and *yy* as the
        two-digit year string from ``_to_yy``, or ``None`` when no candidate
        has a known month name / a month in 1-12 and a day in 1-31.
    """
    # Named month. Every candidate is inspected, so an earlier
    # "<word> <number> <number>" run that is not a month does not hide a
    # real date later in the text.
    for m in NAMED_DATE_RE.finditer(text):
        if m.start("year") == m.end("day"):
            # Day and year are adjacent digits of one number ("March 2013"
            # read as day 20 / year 13): a month-only reference, not a date.
            continue
        mm = MONTH_MAP.get(m.group("month_name").lower().strip("."))
        dd = int(m.group("day"))
        if mm is not None and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))

    # Numeric month/day/year; out-of-range values are not treated as a date.
    for m in NUMERIC_DATE_RE.finditer(text):
        mm = int(m.group("month"))
        dd = int(m.group("day"))
        if 1 <= mm <= 12 and 1 <= dd <= 31:
            return (mm, dd, _to_yy(m.group("year")))

    return None
|
||||||
|
|
||||||
|
def _extract_pages(text: str) -> Optional[str]:
    """Return the normalized page reference found in *text*, or ``None``.

    The first PAGES_RE match supplies the label and page list, which are
    handed to ``_norm_pages``.
    """
    found = PAGES_RE.search(text)
    if found is None:
        return None
    label, page_list = found.group("label"), found.group("pages")
    return _norm_pages(label, page_list)
|
||||||
|
|
||||||
|
def _extract_pars(text: str) -> Optional[str]:
    """Return the paragraph reference found in *text* as ``pars.<list>``, or ``None``.

    The number list of the first PARS_RE match is compacted: en/em dashes
    become hyphens and all whitespace is removed.
    """
    found = PARS_RE.search(text)
    if found is None:
        return None
    numbers = found.group("pars")
    for fancy_dash in ("–", "—"):
        numbers = numbers.replace(fancy_dash, "-")
    # Removing every whitespace character also removes the padding around
    # the "," and "-" separators.
    numbers = re.sub(r"\s+", "", numbers)
    return f"pars.{numbers}"
|
||||||
|
|
||||||
|
def _format_output(code: str, yy: str, mm: int, dd: int, pages: Optional[str], pars: Optional[str]) -> str:
    """Assemble the short-code string ``<code><yy> <mm>/<dd>[ <pages>][, <pars>]``.

    *pages* and *pars* are appended only when they are non-empty.
    """
    pieces = [f"{code}{yy} {mm}/{dd}"]
    if pages:
        pieces.append(f" {pages}")
    if pars:
        pieces.append(f", {pars}")
    return "".join(pieces)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ------------------------------------------------------------
|
||||||
|
|
||||||
|
def normalize_source_field(text: str) -> tuple[str, List[str]]:
    """
    Rewrite a Watchtower / Awake! / Yearbook / Kingdom Ministry citation as a
    WOL short-code string.

    The whitespace-cleaned input is tried against three shapes, in order:

    1. an existing short code (``w09 10/1 p.25``) - re-emitted with
       normalized pages/paragraphs when it carries month/day, otherwise
       returned as cleaned;
    2. a legacy W/G line (``W63 5/1 274``);
    3. free text naming a publication plus a full date.

    Text that fits none of these is returned with only the whitespace
    clean-up applied. An empty or missing input yields ``""``.

    Returns ``(normalized_text, warnings)``; every path in this function
    returns the warnings list empty.
    """
    warnings: List[str] = []
    s = _clean(text or "")
    if not s:
        return ("", warnings)

    # 1) Already short-coded?
    short = SHORTCODE_RE.match(s)
    if short:
        month, day = short.group("month"), short.group("day")
        if not (month and day):
            # No full date in the short-code: keep as-is (strict mode).
            return (s, warnings)

        # Re-normalize the pp. part (spacing / dashes).
        pages_norm = None
        raw_pages = short.group("pp")
        if raw_pages:
            page_body = re.search(r"pp?\.?\s*(.+)$", raw_pages, re.I)
            if page_body:
                pages_norm = _norm_pages("p", page_body.group(1))

        # Re-normalize the pars. part (spacing / dashes).
        pars_norm = None
        raw_pars = short.group("pars")
        if raw_pars:
            par_body = re.search(r"pars?\.?\s*(.+)$", raw_pars, re.I)
            if par_body:
                ptxt = par_body.group(1).replace("–", "-").replace("—", "-")
                ptxt = re.sub(r"\s+", "", ptxt)
                pars_norm = f"pars.{ptxt}"

        out = _format_output(
            short.group("code").lower(),
            _to_yy(short.group("year")),
            int(month),
            int(day),
            pages_norm,
            pars_norm,
        )
        return (out, warnings)

    # 2) Legacy W/G lines like "W63 5/1 274"
    legacy = LEGACY_WG_RE.match(s)
    if legacy:
        code = "w" if legacy.group("code").lower() == "w" else "g"
        try:
            month_no, day_no = (int(part) for part in legacy.group("md").split("/"))
        except Exception:
            return (s, warnings)
        page_ref = _norm_pages("p", legacy.group("pg"))
        return (_format_output(code, legacy.group("yy"), month_no, day_no, page_ref, None), warnings)

    # 3) Otherwise a publication AND a full date must both be detected.
    pub = _extract_pub(s)
    date_tuple = _extract_date(s) if pub else None
    if not pub or not date_tuple:
        # Unknown publication, or month-only / undated reference: pass through.
        return (s, warnings)

    month_no, day_no, yy = date_tuple
    out = _format_output(pub, yy, month_no, day_no, _extract_pages(s), _extract_pars(s))
    return (out, warnings)
|
||||||
Loading…
Reference in New Issue
Block a user