Update web/core/scripture_normalizer.py

2025-08-14 02:15:11 +00:00
parent fb74fe1ebf
commit cbb60cc3cd
1 changed files with 25 additions and 9 deletions
@@ -144,26 +144,42 @@ def _canon_key(raw: str) -> str:
    return key
 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """Map a raw book token to its canonical abbreviation.

    Lookup order (first hit wins):
      1. Direct lookup of the normalized key in BOOK_MAP, then of the same
         key with every space removed (so 'i sa' is found as 'isa').
      2. Explicit "number + short key" combos: when ``num`` is given,
         'co'/'cor' becomes '<num> Cor.' and 'ti'/'tim' becomes '<num> Tim.'.
      3. Heuristic: split a leading 1/2/3 or i/ii/iii off the space-free key
         and look the remainder up in BOOK_MAP (e.g. 'i john' or '2ki').
      4. Last resort: look the key up in BOOK_CANON.

    Args:
        book_raw: Book token as it appeared in the input.
        num: Optional book number captured separately from the token.

    Returns:
        The canonical book string, or None when no strategy matches.
    """
    key = _canon_key(book_raw)
    key_nospace = key.replace(" ", "")

    # 1) Direct lookup, with and without internal spaces.
    for candidate in (key, key_nospace):
        if candidate in BOOK_MAP:
            return BOOK_MAP[candidate]

    # 2) Explicit "number + short key" combos.
    if num and key in ("co", "cor"):
        return f"{num} Cor."
    if num and key in ("ti", "tim"):
        return f"{num} Tim."

    # 3) Heuristic split of a leading arabic/roman numeral from the rest.
    #    Match the space-free key only: the pattern requires a letter directly
    #    after the numeral, so matching `key` could never split 'i john'.
    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key_nospace)
    if m:
        n = {"i": "1", "ii": "2", "iii": "3"}.get(m.group("n"), m.group("n"))
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
            # If the mapped value already starts with a book number, replace
            # that number with n instead of stacking two numbers. The tuple
            # membership test is safe for an empty value, and stripping the
            # digit works whether or not a space follows it.
            if base[:1] in ("1", "2", "3"):
                return f"{n} {base[1:].lstrip()}"
            return f"{n} {base}"

    # 4) Last resort: try the key against the canonical map.
    return BOOK_CANON.get(key_nospace) or BOOK_CANON.get(key)
 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    original = seg.strip()