Update web/core/scripture_normalizer.py

2025-08-14 02:15:11 +00:00 · 2025-08-14 02:15:11 +00:00 · cbb60cc3cd
commit cbb60cc3cd
parent fb74fe1ebf
1 changed files with 25 additions and 9 deletions
--- a/web/core/scripture_normalizer.py
+++ b/web/core/scripture_normalizer.py
@@ -144,26 +144,42 @@ def _canon_key(raw: str) -> str:
    return key

 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
-    key = _canon_key(book_raw)
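+    """
+    Map a raw book token to our canonical abbreviation. Strategy:
+      1) Look the key up in BOOK_MAP, then again with ALL spaces removed (so 'i sa' -> 'isa').
+      2) With `num` given, map 'co'/'cor' to '<num> Cor.' and 'ti'/'tim' to '<num> Tim.'.
+      3) Only then, try the 'leading 1/2/3 or i/ii/iii + rest' heuristic on the space-free key.
+      4) Fall back to BOOK_CANON (space-free key first, then the spaced key); None if no match.
+    """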
+    key = _canon_key(book_raw)            # lower, trimmed, dots removed, single spaces
+    key_nospace = key.replace(" ", "")    # used to catch 'i sa' -> 'isa'
+
+    # 1) direct lookup (e.g., 'isa', 'isaiah', 'ps', '1 corinthians', etc.)
+    if key in BOOK_MAP:
+        return BOOK_MAP[key]
+    if key_nospace in BOOK_MAP:
+        return BOOK_MAP[key_nospace]
+
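+    # 2) special explicit “number + short key” combos. NOTE(review): this now runs after the BOOK_MAP lookup, so a 'co'/'cor'/'ti'/'tim' entry in BOOK_MAP would win over `num` — confirm intended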
    if num and key in ("co", "cor"):
        return f"{num} Cor."
    if num and key in ("ti", "tim"):
        return f"{num} Tim."
-    if key in BOOK_MAP:
-        val = BOOK_MAP[key]
-        return val
-    # FIXED: correct named group syntax here
-    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
+
+    # 3) heuristic: split a leading number/roman from the rest (e.g., 'i john' or '2ki')
+    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key_nospace)
    if m:
-        n = m.group("n")
+        n = {"i": "1", "ii": "2", "iii": "3"}.get(m.group("n"), m.group("n"))
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
-            n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
+            # If base already begins with a number (rare in our map), replace it with n
            if base[0] in "123":
                return f"{n} {base.split(' ', 1)[1]}"
            return f"{n} {base}"
-    return BOOK_CANON.get(key)
+
+    # 4) last resort: try the space-free key, then the spaced key, against BOOK_CANON
+    return BOOK_CANON.get(key_nospace) or BOOK_CANON.get(key)

 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    original = seg.strip()