From fb74fe1ebf2fa58a30ad5ceb7c9b107db84f10ea Mon Sep 17 00:00:00 2001
From: Joshua Laymon
Date: Thu, 14 Aug 2025 02:09:33 +0000
Subject: [PATCH] scripture_normalizer: fix malformed named group; strip comments

---
 web/core/scripture_normalizer.py | 49 ++++----------------------------
 1 file changed, 6 insertions(+), 43 deletions(-)

diff --git a/web/core/scripture_normalizer.py b/web/core/scripture_normalizer.py
index 7a6a4bb..d56cea3 100644
--- a/web/core/scripture_normalizer.py
+++ b/web/core/scripture_normalizer.py
@@ -78,7 +78,6 @@ BOOK_CANON: Dict[str, str] = {
     "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
 }
 
-# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
 def _variants() -> Dict[str, str]:
     base = dict(BOOK_CANON)
     numbered = [
@@ -90,9 +89,7 @@ def _variants() -> Dict[str, str]:
         for name, abbr in numbered:
             base[f"{n} {name}"] = f"{prefix} {abbr}"
             base[f"{n}{name}"] = f"{prefix} {abbr}"
-            # ultra short no-space combos like "1co", "2ki"
             base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
-    # very common shorthands that users type
     base["ps"] = "Ps."
     base["prov"] = "Prov."
     base["eccles"] = "Eccl."
@@ -101,29 +98,26 @@ def _variants() -> Dict[str, str]:
     base["mk"] = "Mark"
     base["lk"] = "Luke"
     base["jn"] = "John"
-    base["ti"] = "Tim."  # used only with a leading number, handled below
-    base["co"] = "Cor."  # used only with a leading number, handled below
+    base["ti"] = "Tim."
+    base["co"] = "Cor."
     return base
 
 BOOK_MAP = _variants()
 
-# Words to strip like "Read", "chapter"
 CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
 
-# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
 BOOK_RE = re.compile(
     r"""
     ^\s*
     (?:
-        (?P<num>[1-3]|i{1,3})\s*   # optional 1/2/3 or roman
+        (?P<num>[1-3]|i{1,3})\s*
     )?
     \s*
-    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})   # book words (1-3 words)
+    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
     """,
    re.X,
 )
 
-# chapter/verse piece like "4:6,7-9" or "21"
 C_V_RE = re.compile(
     r"""
     (?:
@@ -150,48 +144,33 @@ def _canon_key(raw: str) -> str:
     return key
 
 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
-    """
-    Try to map a raw book token (with optional leading number) to our canonical abbr.
-    """
     key = _canon_key(book_raw)
-    # if the token was like "co" or "ti" and a number exists, bias to Cor./Tim.
     if num and key in ("co", "cor"):
         return f"{num} Cor."
     if num and key in ("ti", "tim"):
         return f"{num} Tim."
-    # direct lookup in our expanded map
     if key in BOOK_MAP:
         val = BOOK_MAP[key]
-        # If lookup returned a numbered abbr like "1 Cor." (already good)
         return val
-    # sometimes users omit space: "1co", "2ki"
-    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P(rest>[A-Za-z].*))$", key)
+    # FIXED: correct named group syntax here
+    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
     if m:
         n = m.group("n")
         rest = m.group("rest")
         if rest in BOOK_MAP:
             base = BOOK_MAP[rest]
-            # normalize roman to arabic
             n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
-            # if base already includes number, replace it
             if base[0] in "123":
                 return f"{n} {base.split(' ', 1)[1]}"
             return f"{n} {base}"
-    # last chance: try the core canon directly
     return BOOK_CANON.get(key)
 
 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
-    """
-    Parse a semicolon-delimited segment.
-    Returns (book_canon, cv_string, preserve_raw).
-    - If preserve_raw is True, cv_string contains the original (cleaned) segment to keep verbatim.
-    """
     original = seg.strip()
     s = _clean_text(seg)
     if not s:
         return (None, None, False)
 
-    # Try to detect a leading book token
     m = BOOK_RE.match(s)
     book = None
     rest = s
@@ -204,18 +183,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
             book = canon
             rest = s[m.end():].strip(",;: .")
         else:
-            # Not recognized as a book -> keep whole thing verbatim
             return (None, original, True)
 
     if not book:
-        # Inherit previous recognized book if we have one
         if last_book:
             book = last_book
         else:
-            # There is no book context — keep segment verbatim to avoid data loss
             return (None, original, True)
 
-    # Normalize the chapter/verse part
     rest = re.sub(r"\s+", "", rest)
     rest = re.sub(r":\s*", ":", rest)
 
@@ -229,22 +204,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
     cv = m2.group(0).replace(" ", "") if m2 else None
 
     if not cv:
-        # Nothing parseable after a valid book; just keep the book
         return (book, None, False)
 
-    # Normalize dashes, commas
     cv = cv.replace("–", "-").replace("—", "-")
     cv = re.sub(r",\s*", ",", cv)
 
     return (book, cv, False)
 
 def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
-    """
-    Normalize an entire scripture_raw string.
-    - Unknown pieces are preserved verbatim.
-    - Known pieces are standardized and each segment repeats the book.
-    Returns (normalized_text, warnings).
-    """
     warnings: List[str] = []
     if not text:
         return ("", warnings)
@@ -257,7 +224,6 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
         book, cv, preserve = _parse_segment(piece, last_book)
         if preserve:
             out.append(cv or piece.strip())
-            # do not update last_book when we couldn't recognize a book
             continue
 
         if book and cv:
@@ -266,14 +232,11 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
             continue
 
         if book and not cv:
-            # book only (e.g., "Acts")
             out.append(book)
             last_book = book
             continue
 
-        # If we get here: (no book, no preserve) — nothing useful to add
         if piece.strip():
-            # As a final safeguard, keep original
             out.append(piece.strip())
 
     norm = "; ".join(x.strip() for x in out if x.strip())