# core/source_normalizer.py
from __future__ import annotations

import re
from typing import List, Tuple, Optional

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------

# Map publication names/aliases -> WOL code.
# Keys are lowercase, so callers are expected to lowercase the text they
# look up (presumably after trimming it -- verify against the callers).
PUB_MAP: dict[str, str] = {
    # Watchtower (both Study/Public)
    "watchtower": "w",
    "the watchtower": "w",
    "wt": "w",
    "w": "w",
    # Awake!
    "awake": "g",
    "awake!": "g",
    "aw": "g",
    "g": "g",
    # Yearbook
    "yearbook": "yb",
    "yb": "yb",
    # Our Kingdom Ministry
    "our kingdom ministry": "km",
    "kingdom ministry": "km",
    "km": "km",
}

# Month name / abbreviation map -> month number (1-12).
# Keys are lowercase; September accepts both "sep" and "sept".
MONTH_MAP: dict[str, int] = {
    "january": 1,
    "jan": 1,
    "february": 2,
    "feb": 2,
    "march": 3,
    "mar": 3,
    "april": 4,
    "apr": 4,
    "may": 5,
    "june": 6,
    "jun": 6,
    "july": 7,
    "jul": 7,
    "august": 8,
    "aug": 8,
    "september": 9,
    "sep": 9,
    "sept": 9,
    "october": 10,
    "oct": 10,
    "november": 11,
    "nov": 11,
    "december": 12,
    "dec": 12,
}

# Regex fragment matching exactly one dash character: en dash, em dash or
# the plain hyphen-minus. The hyphen is last in the class so that it is
# taken literally instead of forming a range.
DASH = r"[–—-]"  # en/em/normal hyphen

# Regex fragment matching zero or more horizontal blanks: space, tab,
# no-break space (\xa0), thin space (\u2009), narrow no-break space (\u202F).
# The escapes stay inside a raw string on purpose; the `re` module
# resolves \x.. and \u.... itself.
SPACE = r"[ \t\xa0\u2009\u202F]*"

# Pages (single, list, ranges). We normalize dash to hyphen later.
# NOTE(review): the PAGES_RE definition is cut off at this point in the
# reviewed text; the visible fragment is kept verbatim below and must be
# restored from the complete file rather than reconstructed by guesswork.
# PAGES_RE = re.compile( rf"(?P