From fb74fe1ebf2fa58a30ad5ceb7c9b107db84f10ea Mon Sep 17 00:00:00 2001
From: Joshua Laymon
Date: Thu, 14 Aug 2025 02:09:33 +0000
Subject: [PATCH] scripture_normalizer: fix malformed named group; strip comments

---
 web/core/scripture_normalizer.py | 49 ++++----------------------------
 1 file changed, 6 insertions(+), 43 deletions(-)

diff --git a/web/core/scripture_normalizer.py b/web/core/scripture_normalizer.py
index 7a6a4bb..d56cea3 100644
--- a/web/core/scripture_normalizer.py
+++ b/web/core/scripture_normalizer.py
@@ -78,7 +78,6 @@ BOOK_CANON: Dict[str, str] = {
     "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
 }
 
-# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
 def _variants() -> Dict[str, str]:
     base = dict(BOOK_CANON)
     numbered = [
@@ -90,9 +89,7 @@ def _variants() -> Dict[str, str]:
         for name, abbr in numbered:
             base[f"{n} {name}"] = f"{prefix} {abbr}"
             base[f"{n}{name}"] = f"{prefix} {abbr}"
-            # ultra short no-space combos like "1co", "2ki"
             base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
-    # very common shorthands that users type
     base["ps"] = "Ps."
     base["prov"] = "Prov."
     base["eccles"] = "Eccl."
@@ -101,29 +98,26 @@ def _variants() -> Dict[str, str]:
     base["mk"] = "Mark"
     base["lk"] = "Luke"
     base["jn"] = "John"
-    base["ti"] = "Tim."  # used only with a leading number, handled below
-    base["co"] = "Cor."  # used only with a leading number, handled below
+    base["ti"] = "Tim."
+    base["co"] = "Cor."
     return base
 
 BOOK_MAP = _variants()
 
-# Words to strip like "Read", "chapter"
 CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
 
-# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
 BOOK_RE = re.compile(
     r"""
     ^\s*
     (?:
-        (?P<num>[1-3]|i{1,3})\s*   # optional 1/2/3 or roman
+        (?P<num>[1-3]|i{1,3})\s*
     )?
     \s*
-    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})   # book words (1-3 words)
+    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
     """,
    re.X,
 )
 
-# chapter/verse piece like "4:6,7-9" or "21"
 C_V_RE = re.compile(
     r"""
     (?:
@@ -150,48 +144,33 @@ def _canon_key(raw: str) -> str:
     return key
 
 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
-    """
-    Try to map a raw book token (with optional leading number) to our canonical abbr.
-    """
     key = _canon_key(book_raw)
-    # if the token was like "co" or "ti" and a number exists, bias to Cor./Tim.
     if num and key in ("co", "cor"):
         return f"{num} Cor."
     if num and key in ("ti", "tim"):
         return f"{num} Tim."
-    # direct lookup in our expanded map
     if key in BOOK_MAP:
         val = BOOK_MAP[key]
-        # If lookup returned a numbered abbr like "1 Cor." (already good)
         return val
-    # sometimes users omit space: "1co", "2ki"
-    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P(rest>[A-Za-z].*))$", key)
+    # FIXED: correct named group syntax here
+    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
     if m:
         n = m.group("n")
         rest = m.group("rest")
         if rest in BOOK_MAP:
             base = BOOK_MAP[rest]
-            # normalize roman to arabic
             n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
-            # if base already includes number, replace it
             if base[0] in "123":
                 return f"{n} {base.split(' ', 1)[1]}"
             return f"{n} {base}"
-    # last chance: try the core canon directly
     return BOOK_CANON.get(key)
 
 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
-    """
-    Parse a semicolon-delimited segment.
-    Returns (book_canon, cv_string, preserve_raw).
-    - If preserve_raw is True, cv_string contains the original (cleaned) segment to keep verbatim.
-    """
     original = seg.strip()
     s = _clean_text(seg)
     if not s:
         return (None, None, False)
 
-    # Try to detect a leading book token
     m = BOOK_RE.match(s)
     book = None
     rest = s
@@ -204,18 +183,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
             book = canon
             rest = s[m.end():].strip(",;: .")
         else:
-            # Not recognized as a book -> keep whole thing verbatim
             return (None, original, True)
 
     if not book:
-        # Inherit previous recognized book if we have one
         if last_book:
             book = last_book
         else:
-            # There is no book context — keep segment verbatim to avoid data loss
             return (None, original, True)
 
-    # Normalize the chapter/verse part
     rest = re.sub(r"\s+", "", rest)
     rest = re.sub(r":\s*", ":", rest)
 
@@ -229,22 +204,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
     cv = m2.group(0).replace(" ", "") if m2 else None
 
     if not cv:
-        # Nothing parseable after a valid book; just keep the book
         return (book, None, False)
 
-    # Normalize dashes, commas
     cv = cv.replace("–", "-").replace("—", "-")
     cv = re.sub(r",\s*", ",", cv)
 
     return (book, cv, False)
 
 def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
-    """
-    Normalize an entire scripture_raw string.
-    - Unknown pieces are preserved verbatim.
-    - Known pieces are standardized and each segment repeats the book.
-    Returns (normalized_text, warnings).
-    """
     warnings: List[str] = []
     if not text:
         return ("", warnings)
@@ -257,7 +224,6 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
         book, cv, preserve = _parse_segment(piece, last_book)
         if preserve:
             out.append(cv or piece.strip())
-            # do not update last_book when we couldn't recognize a book
             continue
 
         if book and cv:
@@ -266,14 +232,11 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
             continue
 
         if book and not cv:
-            # book only (e.g., "Acts")
             out.append(book)
             last_book = book
             continue
 
-        # If we get here: (no book, no preserve) — nothing useful to add
         if piece.strip():
-            # As a final safeguard, keep original
             out.append(piece.strip())
 
     norm = "; ".join(x.strip() for x in out if x.strip())