Illustrations/web/core/scripture_normalizer.py

260 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
# =========================
# Canonical book abbreviations
# =========================
# House style (edit as you like). Keys are normalized to lower-case without periods.
# Alias -> canonical abbreviation. One line (or wrapped group) per book; every
# key is lower-case. Lookups go through _canon_key(), which removes periods
# before matching, so keys that contain a period ("eph.", "ti.") can only be
# hit by code that indexes this dict directly; they are kept for compatibility.
# Each key appears exactly once: a repeated key in a dict literal silently
# keeps only the last value, which hides editing mistakes.
BOOK_CANON: Dict[str, str] = {
    # ----- OT -----
    "genesis": "Gen.", "gen": "Gen.", "ge": "Gen.", "gn": "Gen.", "genesisesis": "Gen.",
    "exodus": "Ex.", "ex": "Ex.", "exo": "Ex.",
    "leviticus": "Lev.", "lev": "Lev.", "le": "Lev.",
    "numbers": "Num.", "num": "Num.", "nu": "Num.", "nums": "Num.",
    "deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "dt": "Deut.", "deutronomy": "Deut.",
    "joshua": "Josh.", "josh": "Josh.", "jos": "Josh.",
    "judges": "Judg.", "judg": "Judg.", "jdg": "Judg.",
    "ruth": "Ruth", "ru": "Ruth",
    "1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sam": "1 Sam.", "1sa": "1 Sam.",
    "2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sam": "2 Sam.", "2sa": "2 Sam.",
    "1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.",
    "1 kgs": "1 Ki.", "1kings": "1 Ki.", "1ki": "1 Ki.",
    "2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.",
    "2 kgs": "2 Ki.", "2kings": "2 Ki.", "2ki": "2 Ki.",
    "1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.",
    "1ch": "1 Chron.", "1chron": "1 Chron.",
    "2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.",
    "2ch": "2 Chron.", "2chron": "2 Chron.",
    "ezra": "Ezra", "ezr": "Ezra",
    "nehemiah": "Neh.", "neh": "Neh.",
    "esther": "Esth.", "esth": "Esth.", "es": "Esth.",
    "job": "Job",
    "psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.", "psm": "Ps.", "pss": "Ps.",
    "proverbs": "Prov.", "prov": "Prov.", "pr": "Prov.",
    "ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.", "ecc": "Eccl.", "ec": "Eccl.",
    "song of solomon": "Song", "song of songs": "Song", "song": "Song", "so": "Song", "sos": "Song",
    "isaiah": "Isa.", "isa": "Isa.", "is": "Isa.",
    "jeremiah": "Jer.", "jer": "Jer.", "je": "Jer.",
    "lamentations": "Lam.", "lam": "Lam.", "la": "Lam.",
    "ezekiel": "Ezek.", "ezek": "Ezek.", "eze": "Ezek.", "ezk": "Ezek.",
    "daniel": "Dan.", "dan": "Dan.", "da": "Dan.",
    "hosea": "Hos.", "hos": "Hos.", "ho": "Hos.",
    "joel": "Joel", "joe": "Joel", "jl": "Joel",
    "amos": "Amos", "am": "Amos",
    "obadiah": "Obad.", "obad": "Obad.", "ob": "Obad.",
    "jonah": "Jon.", "jon": "Jon.",
    "micah": "Mic.", "mic": "Mic.",
    "nahum": "Nah.", "nah": "Nah.",
    "habakkuk": "Hab.", "hab": "Hab.",
    "zephaniah": "Zeph.", "zeph": "Zeph.", "zep": "Zeph.",
    "haggai": "Hag.", "hag": "Hag.",
    "zechariah": "Zech.", "zech": "Zech.", "zec": "Zech.",
    "malachi": "Mal.", "mal": "Mal.",
    # ----- NT -----
    "matthew": "Matt.", "matt": "Matt.", "mt": "Matt.", "mat": "Matt.",
    "mark": "Mark", "mrk": "Mark", "mk": "Mark", "mr": "Mark",
    "luke": "Luke", "lk": "Luke",
    "john": "John", "jn": "John", "jo": "John", "joh": "John",
    "acts": "Acts", "act": "Acts", "ac": "Acts",
    "romans": "Rom.", "rom": "Rom.", "ro": "Rom.", "rm": "Rom.",
    "1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.",
    "1cor": "1 Cor.", "1co": "1 Cor.",
    "2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.",
    "2cor": "2 Cor.", "2co": "2 Cor.",
    "galatians": "Gal.", "gal": "Gal.", "ga": "Gal.",
    "ephesians": "Eph.", "eph": "Eph.", "eph.": "Eph.",
    "philippians": "Phil.", "phil": "Phil.", "php": "Phil.", "phi": "Phil.", "philippians216": "Phil.",
    "colossians": "Col.", "col": "Col.",
    "1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.",
    "1 th": "1 Thess.", "1thess": "1 Thess.",
    "2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.",
    "2 th": "2 Thess.", "2thess": "2 Thess.",
    "1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
    "2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
    "titus": "Titus", "tit": "Titus", "ti.": "Titus",
    "philemon": "Philem.", "philem": "Philem.", "phm": "Philem.",
    "hebrews": "Heb.", "heb": "Heb.",
    "james": "Jas.", "jas": "Jas.", "jam": "Jas.", "jms": "Jas.",
    "1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.", "1pe": "1 Pet.",
    "2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.", "2pe": "2 Pet.",
    "1 john": "1 John", "i john": "1 John", "1jn": "1 John", "1 jo": "1 John",
    "2 john": "2 John", "ii john": "2 John", "2jn": "2 John", "2 jo": "2 John",
    "3 john": "3 John", "iii john": "3 John", "3jn": "3 John", "3 jo": "3 John",
    "jude": "Jude", "jud": "Jude",
    "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
}
def _variants() -> Dict[str, str]:
    """Build the full alias -> canonical-abbreviation lookup table.

    Starts from a copy of ``BOOK_CANON`` and adds generated aliases for the
    numbered books: for every numeral spelling (``1``/``i``, ``2``/``ii``,
    ``3``/``iii``) and every numbered book name it registers
    ``"<n> <name>"``, ``"<n><name>"`` and ``"<n><first two letters>"``.
    A handful of extra short forms are then set explicitly.

    Returns:
        A new dict; ``BOOK_CANON`` itself is not modified.
    """
    base = dict(BOOK_CANON)
    numbered = [
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("thessalonians", "Thess."),
        ("timothy", "Tim."), ("peter", "Pet."), ("john", "John"),
    ]
    numerals = (
        ("1", "1"), ("i", "1"),
        ("2", "2"), ("ii", "2"),
        ("3", "3"), ("iii", "3"),
    )
    for n, prefix in numerals:
        for name, abbr in numbered:
            canonical = f"{prefix} {abbr}"
            for alias in (f"{n} {name}", f"{n}{name}", f"{n}{name[:2]}"):
                # setdefault, not assignment: a generated alias must never
                # replace a hand-written one. "i" + "sa" produces "isa", which
                # BOOK_CANON already maps to "Isa."; plain assignment would
                # redirect every Isaiah reference to "1 Sam.".
                base.setdefault(alias, canonical)

    # Explicit short forms. Most repeat BOOK_CANON entries; "deut.", "ti" and
    # "co" are additions ("ti"/"co" map to the un-numbered stem).
    base.update(
        {
            "ps": "Ps.",
            "prov": "Prov.",
            "eccles": "Eccl.",
            "deut.": "Deut.",
            "mt": "Matt.",
            "mk": "Mark",
            "lk": "Luke",
            "jn": "John",
            "ti": "Tim.",
            "co": "Cor.",
        }
    )
    return base


# Lookup table used by _canon_book(); built once at import time.
BOOK_MAP = _variants()
# Filler words that carry no reference information: "read", "see",
# "chapter" / "chap" / "ch" (any case), each with an optional trailing period.
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.IGNORECASE)

# Leading book token of a segment:
#   num  - optional "1".."3" or lower-case "i".."iii" (the pattern is
#          case-sensitive, so upper-case roman numerals land in `book`)
#   book - one to three whitespace-separated words made of letters and periods
BOOK_RE = re.compile(
    r"""
^\s*
(?:
(?P<num>[1-3]|i{1,3})\s*
)?
\s*
(?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
""",
    re.VERBOSE,
)

# Chapter with an optional ":verses" tail:
#   ch - run of digits
#   vs - digits, commas, hyphen, en dash (U+2013), em dash (U+2014), whitespace
C_V_RE = re.compile(
    r"""
(?:
(?P<ch>\d+)
(?:
:(?P<vs>[\d,\-\u2013\u2014\s]+)
)?
)
""",
    re.VERBOSE,
)
def _clean_text(s: str) -> str:
    """Tidy one raw reference segment before it is parsed.

    Steps, in order: exotic spaces become plain spaces, filler words matched
    by ``CRUFT_RE`` are removed, ``".."`` becomes ``"."``, en/em dashes become
    ``"-"``, runs of whitespace collapse to one space, and leading/trailing
    separators are trimmed.

    Args:
        s: Raw segment text.

    Returns:
        The cleaned segment (possibly empty).
    """
    # No-break space, thin space, narrow no-break space -> ordinary space.
    for exotic_space in ("\xa0", "\u2009", "\u202f"):
        s = s.replace(exotic_space, " ")
    s = CRUFT_RE.sub("", s)
    s = s.replace("..", ".")
    # En dash / em dash -> ASCII hyphen. Written as escapes on purpose: if the
    # literal characters are lost the call degrades to replace("", "-"), which
    # inserts a hyphen between every character of the string.
    for dash in ("\u2013", "\u2014"):
        s = s.replace(dash, "-")
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip(" ;,.\t\r\n")
def _canon_key(raw: str) -> str:
key = raw.lower().strip()
key = key.replace(".", "")
key = re.sub(r"\s+", " ", key)
return key
def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """Map a raw book token to its canonical abbreviation.

    Strategy, first hit wins:
      1) Exact alias match on the normalized key, then on the key with all
         spaces removed (so 'i s' -> 'is').
      2) Explicit "number + short key" combos when the caller passes the bare
         stem ('cor' / 'tim') together with ``num``.
      3) Split a leading 1/2/3 or i/ii/iii off the space-less key, convert a
         roman numeral to arabic and retry the numbered aliases
         ('i sam' -> '1 sam'); failing that, prefix the numeral to whatever
         the remainder alone maps to.
      4) Last resort: look the key up in ``BOOK_CANON`` directly.

    Args:
        book_raw: Book text as written; may include its leading numeral.
        num: Leading numeral captured separately by the caller, if any.

    Returns:
        The canonical abbreviation, or ``None`` when the token is unknown.
    """
    roman_to_arabic = {"i": "1", "ii": "2", "iii": "3"}

    key = _canon_key(book_raw)  # lower, trimmed, dots removed, single spaces
    key_nospace = key.replace(" ", "")

    # 1) direct lookup (e.g. 'isa', 'isaiah', 'ps', '1 corinthians')
    if key in BOOK_MAP:
        return BOOK_MAP[key]
    if key_nospace in BOOK_MAP:
        return BOOK_MAP[key_nospace]

    # 2) bare stem + separately supplied numeral. The numeral is converted so
    #    a roman 'ii' yields '2 Cor.' rather than 'ii Cor.'.
    if num:
        num_key = num.strip().lower()
        arabic = roman_to_arabic.get(num_key, num.strip())
        if key in ("co", "cor"):
            return f"{arabic} Cor."
        if key in ("ti", "tim"):
            return f"{arabic} Tim."

    # 3) heuristic: leading number/roman + rest (e.g. 'iicor', 'isam', '1pet')
    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key_nospace)
    if m:
        n = roman_to_arabic.get(m.group("n"), m.group("n"))
        rest = m.group("rest")
        # Retry the curated numbered aliases with an arabic numeral first;
        # this is what resolves 'I Sam', 'II Cor', 'II Tim', 'I Pet'.
        for candidate in (f"{n} {rest}", f"{n}{rest}"):
            if candidate in BOOK_MAP:
                return BOOK_MAP[candidate]
        base = BOOK_MAP.get(rest)
        if base:
            # If the mapped value already starts with a number, swap it for n.
            if base[0] in "123":
                _, sep, tail = base.partition(" ")
                return f"{n} {tail}" if sep else f"{n} {base}"
            return f"{n} {base}"

    # 4) last resort. BOOK_MAP starts as a copy of BOOK_CANON, so this can
    #    only differ from step 1 if BOOK_CANON gained entries afterwards.
    return BOOK_CANON.get(key_nospace) or BOOK_CANON.get(key)
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    """Parse one ';'-separated piece of a scripture field.

    Args:
        seg: Raw segment text.
        last_book: Canonical book of the previous segment; used when this
            segment has no book of its own (e.g. ``"3:17"``).

    Returns:
        ``(book, chapter_verse, preserve)``:
          * ``(None, None, False)`` - nothing left after cleaning.
          * ``(None, original, True)`` - unrecognized; the caller should keep
            the stripped original text as-is.
          * ``(book, None, False)`` - a book with no digits following it.
          * ``(book, cv, False)`` - book plus the remainder of the segment
            with whitespace removed and dashes normalized to ``"-"``.
    """
    original = seg.strip()
    s = _clean_text(seg)
    if not s:
        return (None, None, False)

    book: Optional[str] = None
    rest = s
    m = BOOK_RE.match(s)
    if m:
        num = (m.group("num") or "").strip()
        raw_book = (m.group("book") or "").strip()
        raw_joined = f"{num} {raw_book}".strip()
        canon = _canon_book(raw_joined or raw_book, num=num or None)
        if not canon:
            # Looks like a book token but is not one we know: leave untouched.
            return (None, original, True)
        book = canon
        rest = s[m.end():].strip(",;: .")

    if not book:
        # No book token at all: inherit from the previous segment if we can.
        if not last_book:
            return (None, original, True)
        book = last_book

    # All whitespace goes, so no later spacing fix-ups are needed.
    rest = re.sub(r"\s+", "", rest)
    # C_V_RE needs at least one digit; without one there is no chapter/verse.
    if not rest or not C_V_RE.search(rest):
        return (book, None, False)

    # En dash / em dash -> hyphen, spelled as escapes so the characters cannot
    # be lost (an empty search string would put "-" between every character).
    cv = rest.replace("\u2013", "-").replace("\u2014", "-")
    return (book, cv, False)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """Normalize a free-text scripture field into house style.

    The text is split on ``;``. Each piece is parsed with ``_parse_segment``;
    recognized pieces are rewritten as ``"<Book> <chapter:verse>"`` (or just
    ``"<Book>"``), and a piece without its own book inherits the book of the
    previous recognized piece. Pieces that cannot be parsed are passed through
    unchanged and reported in the warnings list.

    Args:
        text: Raw field value; falsy input yields ``("", [])``.

    Returns:
        ``(normalized, warnings)`` - the pieces joined with ``"; "`` and one
        message per piece that was passed through unparsed.
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    out: List[str] = []
    last_book: Optional[str] = None
    for piece in re.split(r"\s*;\s*", text):
        raw = piece.strip()
        if not raw:
            continue

        book, cv, preserve = _parse_segment(piece, last_book)
        if preserve or not book:
            # Unrecognized (or nothing but filler words): keep the author's
            # text, but say so instead of passing it through silently.
            out.append((cv or raw) if preserve else raw)
            warnings.append(f"Could not parse scripture reference: {raw!r}")
            continue

        out.append(f"{book} {cv}" if cv else book)
        last_book = book

    norm = "; ".join(x.strip() for x in out if x.strip())
    norm = re.sub(r"\s+", " ", norm).strip(" ;,")
    return (norm, warnings)