Illustrations/web/core/scripture_normalizer.py
Joshua Laymon 1de4b84e2e Update web/core/scripture_normalizer.py
hopefully final version fixing Peter
2025-08-14 02:39:04 +00:00

200 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# --------------------------
# Canonical book abbreviations
# --------------------------
# Alias -> canonical abbreviation table used by _direct_lookup().
# Keys are looked up after _canon_key() has lower-cased the text and replaced
# periods with spaces, so only lowercase, period-free keys can be hit through
# that path.
BOOK_CANON: Dict[str, str] = {
# OT
# NOTE(review): "genesisesis" looks like an accidental alias - confirm before relying on it.
"genesis": "Gen.", "gen": "Gen.", "genesisesis": "Gen.",
"exodus": "Ex.", "ex": "Ex.",
"leviticus": "Lev.", "lev": "Lev.",
"numbers": "Num.", "num": "Num.", "nums": "Num.",
"deuteronomy": "Deut.", "deut": "Deut.", "deutronomy": "Deut.", "deu": "Deut.",
"joshua": "Josh.", "josh": "Josh.",
"judges": "Judg.", "judg": "Judg.",
"ruth": "Ruth",
"1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sam": "1 Sam.",
"2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sam": "2 Sam.",
"1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.",
"2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.",
"1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.",
"2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.",
"ezra": "Ezra",
"nehemiah": "Neh.", "neh": "Neh.",
"esther": "Esth.", "esth": "Esth.",
"job": "Job",
"psalms": "Ps.", "psalm": "Ps.", "ps": "Ps.", "pss": "Ps.",
"proverbs": "Prov.", "prov": "Prov.",
"ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.",
"song of solomon": "Song", "song of songs": "Song", "song": "Song",
"isaiah": "Isa.", "isa": "Isa.",
"jeremiah": "Jer.", "jer": "Jer.",
"lamentations": "Lam.", "lam": "Lam.",
"ezekiel": "Ezek.", "ezek": "Ezek.",
"daniel": "Dan.", "dan": "Dan.",
"hosea": "Hos.", "hos": "Hos.",
"joel": "Joel",
"amos": "Amos",
"obadiah": "Obad.", "obad": "Obad.",
"jonah": "Jon.", "jon": "Jon.",
"micah": "Mic.", "mic": "Mic.",
"nahum": "Nah.", "nah": "Nah.",
"habakkuk": "Hab.", "hab": "Hab.",
"zephaniah": "Zeph.", "zeph": "Zeph.",
"haggai": "Hag.", "hag": "Hag.",
"zechariah": "Zech.", "zech": "Zech.",
"malachi": "Mal.", "mal": "Mal.",
# NT
"matthew": "Matt.", "matt": "Matt.", "mt": "Matt.",
"mark": "Mark", "mk": "Mark", "mrk": "Mark", "mr": "Mark",
"luke": "Luke", "lk": "Luke",
"john": "John", "jn": "John", "joh": "John",
"acts": "Acts",
"romans": "Rom.", "rom": "Rom.",
"1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1co": "1 Cor.",
"2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2co": "2 Cor.",
"galatians": "Gal.", "gal": "Gal.",
"ephesians": "Eph.", "eph": "Eph.",
# NOTE(review): "philippians 216" looks like a one-off data patch - confirm it is still needed.
"philippians": "Phil.", "phil": "Phil.", "philippians 216": "Phil.",
"colossians": "Col.", "col": "Col.",
"1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.",
"2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.",
"1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
"2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
"titus": "Titus", "tit": "Titus",
"philemon": "Philem.", "philem": "Philem.",
"hebrews": "Heb.", "heb": "Heb.",
"james": "Jas.", "jas": "Jas.",
# Peter (expanded aliases)
# The space-free forms ("1pet", "1pe", ...) are what _direct_lookup() reaches
# through its second, spaces-removed attempt. The keys that contain a period
# ("1 pet.", "2 pet.") cannot be produced by _canon_key(), which replaces
# periods with spaces.
"1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.",
"1pe": "1 Pet.", "1 pe": "1 Pet.", "1pet": "1 Pet.", "1 pet.": "1 Pet.", "1peter": "1 Pet.",
"2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.",
"2pe": "2 Pet.", "2 pe": "2 Pet.", "2pet": "2 Pet.", "2 pet.": "2 Pet.", "2peter": "2 Pet.",
"1 john": "1 John", "i john": "1 John",
"2 john": "2 John", "ii john": "2 John",
"3 john": "3 John", "iii john": "3 John",
"jude": "Jude",
"revelation": "Rev.", "rev": "Rev.",
# Chronicles short forms
# NOTE(review): the mixed-case keys ("1 Ch", "1 Chr", ...) duplicate the
# lowercase ones and cannot be produced by _canon_key(), which lower-cases its
# input. The bare "ch"/"chr" entries map to "Chron." without a numeral.
"1 ch": "1 Chron.", "1 chr": "1 Chron.", "1 Ch": "1 Chron.", "1 Chr": "1 Chron.",
"2 ch": "2 Chron.", "2 chr": "2 Chron.", "2 Ch": "2 Chron.", "2 Chr": "2 Chron.",
"ch": "Chron.", "chr": "Chron.",
}
# Lowercase book keys that _canon_book() tests its space-stripped key against;
# on a hit it returns _direct_lookup() of that key instead of attempting a
# numbered form.
# NOTE(review): judging by the name these are books that never take a leading
# numeral (1/2/3) - confirm the intended membership with the author.
NEVER_PREFIX_NUMERAL = {"isa", "isaiah", "job", "joel", "amos", "nah", "hag", "mal", "rom", "gal", "eph", "tit", "heb", "jas", "jude"}
def _canon_key(s: str) -> str:
s = (s or "").strip().lower()
s = s.replace(".", " ")
s = re.sub(r"\s+", " ", s)
return s
def _direct_lookup(key: str) -> Optional[str]:
    """Look ``key`` up in ``BOOK_CANON``, retrying with all spaces removed.

    Args:
        key: A key in the form used by ``BOOK_CANON``.

    Returns:
        The canonical abbreviation for the first candidate found, or ``None``
        when neither the key nor its space-free form is present.
    """
    # Exact spelling first, then the compact form (e.g. "1 pet" -> "1pet").
    for candidate in (key, key.replace(" ", "")):
        if candidate in BOOK_CANON:
            return BOOK_CANON[candidate]
    return None
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
def _clean_text(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = CRUFT_RE.sub("", s)
s = s.replace("", "-").replace("", "-")
s = re.sub(r"\s+", " ", s)
return s.strip(" ;,.\t\r\n ")
# Anchored at the start of the text: an optional numeral ("1"-"3", or one to
# three lowercase "i") followed by optional whitespace, then a book name of one
# to three whitespace-separated words made of ASCII letters and periods.
# NOTE(review): there is no re.I flag, so uppercase roman numerals are captured
# as part of the "book" group instead of "num". Because the whitespace after
# the numeral is optional, a leading "i" of a word (e.g. "isaiah") can be
# captured as "num" with the rest of the word as "book".
BOOK_PREFIX_RE = re.compile(r"^\s*(?:(?P<num>[1-3]|i{1,3})\s*)?(?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})", re.X)
# A chapter number ("ch") optionally followed by ":" and a verse part ("vs")
# consisting of digits, commas, hyphens and whitespace. Not anchored; callers
# use .search(), so it matches anywhere in the text.
C_V_RE = re.compile(r"(?P<ch>\d+)(?::(?P<vs>[\d,\-\s]+))?")
def _canon_book(book_raw: str, num: Optional[str]) -> Optional[str]:
    """Resolve raw book text to its canonical abbreviation.

    Resolution order:
      1. a direct ``BOOK_CANON`` hit on the normalized key;
      2. the ``NEVER_PREFIX_NUMERAL`` check on the space-free key;
      3. when ``num`` is given, a numbered form built from a short stem
         (e.g. ``"1 jn"`` -> ``"1 John"``).

    Args:
        book_raw: Book text; it may or may not already start with the numeral.
        num: Leading numeral as captured by the caller (``"1"``-``"3"`` or
            ``"i"``-``"iii"``), or ``None``.

    Returns:
        The canonical abbreviation, or ``None`` when the book is not recognized.
    """
    key = _canon_key(book_raw)
    direct = _direct_lookup(key)
    if direct:
        return direct

    bare = key.replace(" ", "")
    if bare in NEVER_PREFIX_NUMERAL:
        return _direct_lookup(bare)

    if not num:
        return None

    numeral = num.lower()
    n = {"i": "1", "ii": "2", "iii": "3"}.get(numeral, num)
    map_num = {
        "co": "Cor.", "cor": "Cor.",
        "ti": "Tim.", "tim": "Tim.",
        "pe": "Pet.", "pet": "Pet.", "peter": "Pet.",
        "jo": "John", "jn": "John", "joh": "John", "john": "John",
        "sa": "Sam.", "sam": "Sam.", "samuel": "Sam.",
        "ki": "Ki.", "kgs": "Ki.", "kings": "Ki.",
        "chronicles": "Chron.", "chron": "Chron.",
        "thessalonians": "Thess.", "thess": "Thess.",
    }

    # Try the whole key first, then the key without its leading numeral.
    # _parse_segment() passes the numeral inside book_raw ("1 jn"), so without
    # the second candidate the stem is "1jn" and can never match a map_num key;
    # the caller would then retry without the numeral and turn "1 jn" into
    # plain "John".
    stems = [bare]
    if key.startswith(numeral):
        stems.append(key[len(numeral):].replace(" ", ""))
    for stem in stems:
        if stem in map_num:
            return f"{n} {map_num[stem]}"
    return None
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    """Split one reference fragment into a canonical book and chapter/verse text.

    Args:
        seg: One fragment of the scripture field (already split on ";").
        last_book: Accepted for the caller's benefit; not read by this function.

    Returns:
        ``(book, chapter_verse, preserve)``:
          * ``(None, None, False)`` - the fragment is empty after cleaning;
          * ``(book, None, False)`` - a book with nothing after it;
          * ``(book, cv, False)`` - a book plus chapter/verse text with all
            spaces removed;
          * ``(None, None, True)`` - not understood; the caller should keep
            the fragment as written.
    """
    cleaned = _clean_text(seg)
    if not cleaned:
        return (None, None, False)

    canonical = None
    remainder = cleaned
    prefix = BOOK_PREFIX_RE.match(cleaned)
    if prefix:
        numeral = (prefix.group("num") or "").strip() or None
        name = (prefix.group("book") or "").strip()
        remainder = cleaned[prefix.end():].strip(" :,;.")
        labelled = f"{numeral} {name}" if numeral else name
        # Numbered form first; if that is unknown, retry with the name alone.
        canonical = _canon_book(labelled, num=numeral) or _canon_book(name, None)

    if not canonical:
        # Digits without a recognizable book: leave the fragment untouched.
        if C_V_RE.search(remainder):
            return (None, None, True)
        # Last resort: the whole fragment might itself be a book name.
        fallback = _canon_book(cleaned, None)
        if fallback:
            return (fallback, None, False)
        return (None, None, True)

    compact = remainder.replace(" ", "")
    if not compact:
        return (canonical, None, False)
    if C_V_RE.search(compact):
        return (canonical, compact, False)
    # Trailing text with no chapter number in it: keep the fragment as written.
    return (None, None, True)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """Normalize a semicolon-separated scripture reference field.

    Each non-blank fragment is parsed with ``_parse_segment``. Recognized
    fragments are rewritten as ``"<book>"`` or ``"<book> <chapter:verse>"``;
    fragments flagged for preservation are kept after ``_clean_text``; a
    fragment that yields chapter/verse text but no book adds a warning and is
    left out of the result.

    Args:
        text: Raw field contents; a falsy value yields an empty result.

    Returns:
        ``(normalized, warnings)`` where ``normalized`` joins the kept
        fragments with ``"; "`` and ``warnings`` lists the problems found.
    """
    notes: List[str] = []
    if not text:
        return ("", notes)

    rendered: List[str] = []
    previous_book: Optional[str] = None
    for chunk in re.split(r"\s*;\s*", text):
        if not (chunk and chunk.strip()):
            continue
        book, cv, preserve = _parse_segment(chunk, previous_book)
        if preserve:
            rendered.append(_clean_text(chunk))
        elif book:
            rendered.append(f"{book} {cv}" if cv else book)
            previous_book = book
        elif cv:
            notes.append(f"Missing book for '{chunk.strip()}'")

    # Trim stray separators from every entry and drop entries left empty.
    trimmed = (entry.strip(" ;,") for entry in rendered)
    joined = "; ".join(entry for entry in trimmed if entry)
    return (re.sub(r"\s+", " ", joined).strip(" ;,"), notes)