Update web/core/scripture_normalizer.py

This commit is contained in:
Joshua Laymon 2025-08-14 02:15:11 +00:00
parent fb74fe1ebf
commit cbb60cc3cd

View File

@ -144,26 +144,42 @@ def _canon_key(raw: str) -> str:
return key
def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """Map a raw book token to our canonical abbreviation.

    Strategy, in order (the first step that produces a result wins):

    1. Look the normalized key up in ``BOOK_MAP``, first as-is and then
       with all spaces removed (so ``'i sa'`` is tried as ``'isa'``).
    2. If ``num`` was supplied, handle the explicit "number + short key"
       combinations for Corinthians and Timothy.
    3. Split a leading ``1``/``2``/``3`` or ``i``/``ii``/``iii`` from the
       rest of the space-free key and look the rest up in ``BOOK_MAP``.
    4. Fall back to ``BOOK_CANON``, trying the space-free key first.

    Args:
        book_raw: The book token as it appeared in the input.
        num: Optional leading book number that was parsed separately from
            the token; only consulted in step 2.

    Returns:
        The value found by the first matching step, or ``None`` when no
        step matches.
    """
    # Per the original inline note, _canon_key lowercases, trims, removes
    # dots and collapses whitespace to single spaces.
    key = _canon_key(book_raw)
    key_nospace = key.replace(" ", "")  # used to catch 'i sa' -> 'isa'

    # 1) Direct lookup (e.g. 'isa', 'isaiah', 'ps', '1 corinthians').
    # NOTE(review): this runs before the `num` special cases below, so a
    # 'co'/'cor'/'ti'/'tim' entry in BOOK_MAP would take precedence over
    # them -- confirm that ordering is intended.
    if key in BOOK_MAP:
        return BOOK_MAP[key]
    if key_nospace in BOOK_MAP:
        return BOOK_MAP[key_nospace]

    # 2) Special explicit "number + short key" combos.
    if num and key in ("co", "cor"):
        return f"{num} Cor."
    if num and key in ("ti", "tim"):
        return f"{num} Tim."

    # 3) Heuristic: split a leading number/roman numeral from the rest
    #    (e.g. 'i john' or '2ki'). Matching on the space-free key lets
    #    'i john' and 'ijohn' take the same path.
    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key_nospace)
    if m:
        n = {"i": "1", "ii": "2", "iii": "3"}.get(m.group("n"), m.group("n"))
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
            # If the mapped value already begins with a number (rare in our
            # map), replace that number with n. The truthiness guard avoids
            # an IndexError on an empty value, and the partition fallback
            # covers a numbered value that has no space after the number.
            if base and base[0] in "123":
                _, sep, tail = base.partition(" ")
                return f"{n} {tail if sep else base[1:]}"
            return f"{n} {base}"

    # 4) Last resort: try the key against the canonical map.
    return BOOK_CANON.get(key_nospace) or BOOK_CANON.get(key)
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
original = seg.strip()