From 389f53a97cf6e30d1f91a1145ab2bc503f6ab523 Mon Sep 17 00:00:00 2001 From: Joshua Laymon Date: Thu, 14 Aug 2025 19:30:28 +0000 Subject: [PATCH] Add web/core/source_normalizer.py --- web/core/source_normalizer.py | 253 ++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 web/core/source_normalizer.py diff --git a/web/core/source_normalizer.py b/web/core/source_normalizer.py new file mode 100644 index 0000000..646aa06 --- /dev/null +++ b/web/core/source_normalizer.py @@ -0,0 +1,253 @@ +# core/source_normalizer.py +from __future__ import annotations +import re +from typing import List, Tuple, Optional + +# ------------------------------------------------------------ +# Helpers +# ------------------------------------------------------------ + +# Map publication names/aliases -> WOL code +PUB_MAP = { + # Watchtower (both Study/Public) + "watchtower": "w", + "the watchtower": "w", + "wt": "w", + "w": "w", + + # Awake! + "awake": "g", + "awake!": "g", + "aw": "g", + "g": "g", + + # Yearbook + "yearbook": "yb", + "yb": "yb", + + # Our Kingdom Ministry + "our kingdom ministry": "km", + "kingdom ministry": "km", + "km": "km", +} + +# Month name / abbreviation map -> month number +MONTH_MAP = { + "january": 1, "jan": 1, + "february": 2, "feb": 2, + "march": 3, "mar": 3, + "april": 4, "apr": 4, + "may": 5, + "june": 6, "jun": 6, + "july": 7, "jul": 7, + "august": 8, "aug": 8, + "september": 9, "sep": 9, "sept": 9, + "october": 10, "oct": 10, + "november": 11, "nov": 11, + "december": 12, "dec": 12, +} + +DASH = r"[–—-]" # en/em/normal hyphen +SPACE = r"[ \t\xa0\u2009\u202F]*" + +# Pages (single, list, ranges). We normalize dash to hyphen later. +PAGES_RE = re.compile( + rf"(?P