Illustrations/web/core/scripture_normalizer.py

244 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
# =========================
# Canonical book abbreviations
# =========================
# House style (edit as you like). Maps a lower-case book name or abbreviation
# to the abbreviation that should appear in normalized output.
#
# NOTE: lookups go through _canon_key(), which lower-cases and removes
# periods, so the few keys below that still contain a period ("eph.", "ti.")
# cannot be reached that way. They are left in place for any code that reads
# this table directly.
# NOTE(review): "genesisesis" and "philippians216" look like aliases for
# malformed legacy input -- confirm before removing.
BOOK_CANON: Dict[str, str] = {
    # ----- OT -----
    "genesis": "Gen.", "gen": "Gen.", "ge": "Gen.", "gn": "Gen.", "genesisesis": "Gen.",
    "exodus": "Ex.", "ex": "Ex.", "exo": "Ex.",
    "leviticus": "Lev.", "lev": "Lev.", "le": "Lev.",
    "numbers": "Num.", "num": "Num.", "nu": "Num.", "nums": "Num.",
    "deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "dt": "Deut.", "deutronomy": "Deut.",
    "joshua": "Josh.", "josh": "Josh.", "jos": "Josh.",
    "judges": "Judg.", "judg": "Judg.", "jdg": "Judg.",
    "ruth": "Ruth", "ru": "Ruth",
    "1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sam": "1 Sam.", "1sa": "1 Sam.",
    "2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sam": "2 Sam.", "2sa": "2 Sam.",
    "1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.", "1kings": "1 Ki.", "1ki": "1 Ki.",
    "2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.", "2kings": "2 Ki.", "2ki": "2 Ki.",
    "1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.", "1ch": "1 Chron.", "1chron": "1 Chron.",
    "2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.", "2ch": "2 Chron.", "2chron": "2 Chron.",
    "ezra": "Ezra", "ezr": "Ezra",
    "nehemiah": "Neh.", "neh": "Neh.",
    "esther": "Esth.", "esth": "Esth.", "es": "Esth.",
    "job": "Job",
    "psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.", "psm": "Ps.", "pss": "Ps.",
    "proverbs": "Prov.", "prov": "Prov.", "pr": "Prov.",
    "ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.", "ecc": "Eccl.", "ec": "Eccl.",
    "song of solomon": "Song", "song of songs": "Song", "song": "Song", "so": "Song", "sos": "Song",
    "isaiah": "Isa.", "isa": "Isa.", "is": "Isa.",
    "jeremiah": "Jer.", "jer": "Jer.", "je": "Jer.",
    "lamentations": "Lam.", "lam": "Lam.", "la": "Lam.",
    "ezekiel": "Ezek.", "ezek": "Ezek.", "eze": "Ezek.", "ezk": "Ezek.",
    "daniel": "Dan.", "dan": "Dan.", "da": "Dan.",
    "hosea": "Hos.", "hos": "Hos.", "ho": "Hos.",
    "joel": "Joel", "joe": "Joel", "jl": "Joel",
    "amos": "Amos", "am": "Amos",
    "obadiah": "Obad.", "obad": "Obad.", "ob": "Obad.",
    "jonah": "Jon.", "jon": "Jon.",
    "micah": "Mic.", "mic": "Mic.",
    "nahum": "Nah.", "nah": "Nah.",
    "habakkuk": "Hab.", "hab": "Hab.",
    "zephaniah": "Zeph.", "zeph": "Zeph.", "zep": "Zeph.",
    "haggai": "Hag.", "hag": "Hag.",
    "zechariah": "Zech.", "zech": "Zech.", "zec": "Zech.",
    "malachi": "Mal.", "mal": "Mal.",
    # ----- NT -----
    "matthew": "Matt.", "matt": "Matt.", "mt": "Matt.", "mat": "Matt.",
    "mark": "Mark", "mrk": "Mark", "mk": "Mark", "mr": "Mark",
    "luke": "Luke", "lk": "Luke",
    "john": "John", "jn": "John", "jo": "John", "joh": "John",
    "acts": "Acts", "act": "Acts", "ac": "Acts",
    "romans": "Rom.", "rom": "Rom.", "ro": "Rom.", "rm": "Rom.",
    "1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1cor": "1 Cor.", "1co": "1 Cor.",
    "2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2cor": "2 Cor.", "2co": "2 Cor.",
    "galatians": "Gal.", "gal": "Gal.", "ga": "Gal.",
    "ephesians": "Eph.", "eph": "Eph.", "eph.": "Eph.",
    "philippians": "Phil.", "phil": "Phil.", "php": "Phil.", "phi": "Phil.", "philippians216": "Phil.",
    "colossians": "Col.", "col": "Col.",
    "1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.", "1 th": "1 Thess.", "1thess": "1 Thess.",
    "2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.", "2 th": "2 Thess.", "2thess": "2 Thess.",
    "1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
    "2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
    "titus": "Titus", "tit": "Titus", "ti.": "Titus",
    "philemon": "Philem.", "philem": "Philem.", "phm": "Philem.",
    "hebrews": "Heb.", "heb": "Heb.",
    "james": "Jas.", "jas": "Jas.", "jam": "Jas.", "jms": "Jas.",
    "1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.", "1pe": "1 Pet.",
    "2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.", "2pe": "2 Pet.",
    "1 john": "1 John", "i john": "1 John", "1jn": "1 John", "1 jo": "1 John",
    "2 john": "2 John", "ii john": "2 John", "2jn": "2 John", "2 jo": "2 John",
    "3 john": "3 John", "iii john": "3 John", "3jn": "3 John", "3 jo": "3 John",
    "jude": "Jude", "jud": "Jude",
    "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
}
def _variants(canon: Optional[Dict[str, str]] = None) -> Dict[str, str]:
    """Build the book lookup table: canonical entries plus generated variants.

    Starts from a copy of ``canon`` (``BOOK_CANON`` when omitted) and adds, for
    every numbered book family and every numeral spelling (``1``/``i``,
    ``2``/``ii``, ``3``/``iii``), three key shapes: ``"<n> <name>"``,
    ``"<n><name>"`` and ``"<n><first two letters of name>"``. A handful of
    shorthand aliases are then set explicitly.

    Generated keys never replace an entry already present in the canonical
    table. Without that guard the two-letter form ``"i" + "sa"`` produced the
    key ``"isa"`` and silently remapped Isaiah to ``"1 Sam."``.

    Args:
        canon: Optional replacement for ``BOOK_CANON``; it is copied, not
            mutated.

    Returns:
        A new dict mapping normalized keys to house-style abbreviations.
    """
    base = dict(BOOK_CANON if canon is None else canon)
    numbered = [
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("thessalonians", "Thess."),
        ("timothy", "Tim."), ("peter", "Pet."), ("john", "John"),
    ]
    # (spelling as typed, digit used in the output)
    numerals = (("1", "1"), ("i", "1"), ("2", "2"), ("ii", "2"), ("3", "3"), ("iii", "3"))
    # NOTE(review): this also generates third-book forms such as "3 samuel"
    # for families other than John -- confirm whether that is intended.
    for n, prefix in numerals:
        for name, abbr in numbered:
            value = f"{prefix} {abbr}"
            for key in (f"{n} {name}", f"{n}{name}", f"{n}{name[:2]}"):
                # setdefault: an explicit canonical entry always wins.
                base.setdefault(key, value)
    # Explicit shorthand aliases; these intentionally overwrite.
    base["ps"] = "Ps."
    base["prov"] = "Prov."
    base["eccles"] = "Eccl."
    base["deut."] = "Deut."
    base["mt"] = "Matt."
    base["mk"] = "Mark"
    base["lk"] = "Luke"
    base["jn"] = "John"
    # Bare forms with no number; a caller is expected to supply the numeral.
    base["ti"] = "Tim."
    base["co"] = "Cor."
    return base
# Lookup table used when resolving book names; built once at import time by
# _variants() from a copy of BOOK_CANON plus the keys that function adds.
BOOK_MAP = _variants()
# Filler words removed before parsing: "read", "see", "chap", "chapter", "ch"
# as whole words (any case), together with one trailing period if present.
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)

# Leading book name of a segment: an optional numeral ("1"-"3", or lower-case
# roman "i"-"iii") followed by one to three words made of letters/periods.
# A roman numeral only counts when whitespace follows it; otherwise the
# leading "i" of a lower-case name such as "isaiah" was captured as the
# numeral and the book became "saiah".
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        (?P<num>[1-3]|i{1,3}(?=\s))\s*
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
    """,
    re.X,
)

# Chapter with an optional ":verses" part; the verse part may contain digits,
# commas, hyphen, en dash (U+2013), em dash (U+2014) and whitespace.
C_V_RE = re.compile(
    r"""
    (?:
        (?P<ch>\d+)
        (?:
            :(?P<vs>[\d,\-\u2013\u2014\s]+)
        )?
    )
    """,
    re.X,
)
def _clean_text(s: str) -> str:
    """Tidy one raw reference segment before it is parsed.

    Steps, in order: turn no-break, thin and narrow no-break spaces into
    plain spaces; delete filler words matched by ``CRUFT_RE``; collapse one
    ``..`` into ``.``; convert en/em dashes to ``-``; squeeze runs of
    whitespace; trim surrounding spaces, semicolons, commas, periods and
    line/tab characters.

    Args:
        s: Raw segment text.

    Returns:
        The cleaned segment (possibly empty).
    """
    s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
    s = CRUFT_RE.sub("", s)
    # The dash literals are spelled as escapes on purpose: with an empty
    # search string, str.replace() inserts the replacement between every
    # character, which is what the unescaped form had degraded into.
    s = s.replace("..", ".").replace("\u2013", "-").replace("\u2014", "-")
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip(" ;,.\t\r\n")
def _canon_key(raw: str) -> str:
    """Return the table lookup key for a raw book name.

    Lower-cases the text, removes every period, collapses whitespace runs to
    one space and trims the ends. Trimming happens last so that a period
    next to edge whitespace (``"Gen ."``) cannot leave a stray space behind.

    Args:
        raw: Book name or abbreviation as written.

    Returns:
        The normalized key.
    """
    key = raw.lower().replace(".", "")
    return re.sub(r"\s+", " ", key).strip()
def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """Resolve a raw book name to its house-style abbreviation.

    Lookup order:

    1. If ``num`` is given and the name (with or without that numeral in
       front) is a bare Corinthians/Timothy shorthand (``co``/``cor``,
       ``ti``/``tim``), return the numbered abbreviation.
    2. Exact key in ``BOOK_MAP``.
    3. The same key with the space after a leading digit removed, so
       ``"1 jn"`` can reach the table entry ``"1jn"``.
    4. A numeral glued to a name that is itself in ``BOOK_MAP``
       (e.g. ``"2john"``).
    5. ``BOOK_CANON`` as a last resort.

    Roman numerals are emitted as digits.

    Args:
        book_raw: Book name as written; may already include the numeral.
        num: Numeral captured separately by the caller, if any.

    Returns:
        The abbreviation, or ``None`` when the name is not recognized.
    """
    roman = {"i": "1", "ii": "2", "iii": "3"}
    key = _canon_key(book_raw)

    if num:
        num_key = num.strip().lower()
        digit = roman.get(num_key, num_key)
        # Callers may pass the numeral both in book_raw and in num; drop it
        # from the key so the shorthand check below sees the bare name.
        bare = key
        if num_key and key.startswith(num_key):
            bare = key[len(num_key):].lstrip()
        if bare in ("co", "cor"):
            return f"{digit} Cor."
        if bare in ("ti", "tim"):
            return f"{digit} Tim."

    if key in BOOK_MAP:
        return BOOK_MAP[key]

    # Digits only: gluing a roman "i" could collide with real names ("i sa").
    glued = re.sub(r"^([1-3])\s+(?=[a-z])", r"\1", key)
    if glued in BOOK_MAP:
        return BOOK_MAP[glued]

    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
    if m:
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
            n = roman.get(m.group("n"), m.group("n"))
            # If the mapped value already carries a number, replace it.
            if base[0] in "123":
                return f"{n} {base.split(' ', 1)[1]}"
            return f"{n} {base}"
    return BOOK_CANON.get(key)
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    """Parse one reference segment into ``(book, chapter_verse, preserve)``.

    Args:
        seg: One piece of the scripture field.
        last_book: Book resolved for the previous segment; reused when this
            segment does not start with a book name.

    Returns:
        A 3-tuple:

        * ``(None, None, False)`` -- nothing is left after cleaning.
        * ``(None, original, True)`` -- the leading name is not a known book,
          or there is no book and no ``last_book``; ``original`` is the
          stripped input, to be kept verbatim.
        * ``(book, None, False)`` -- a book with no chapter/verse digits.
        * ``(book, cv, False)`` -- book plus the chapter/verse text with all
          whitespace removed and en/em dashes converted to ``-``.
    """
    original = seg.strip()
    s = _clean_text(seg)
    if not s:
        return (None, None, False)

    book: Optional[str] = None
    rest = s
    m = BOOK_RE.match(s)
    if m:
        num = (m.group("num") or "").strip()
        raw_book = (m.group("book") or "").strip()
        raw_joined = f"{num} {raw_book}".strip()
        canon = _canon_book(raw_joined or raw_book, num=num or None)
        if not canon:
            return (None, original, True)
        book = canon
        rest = s[m.end():].strip(",;: .")

    if not book:
        if not last_book:
            return (None, original, True)
        book = last_book

    # Removing every whitespace character also tightens "3: 16" and "1, 2".
    rest = re.sub(r"\s+", "", rest)
    # C_V_RE needs at least one digit; without one there is no reference.
    if not rest or not C_V_RE.search(rest):
        return (book, None, False)

    # Escaped dash literals: replacing an empty string would insert "-"
    # between every character of the reference.
    cv = rest.replace("\u2013", "-").replace("\u2014", "-")
    return (book, cv, False)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """Normalize a semicolon-separated scripture field.

    Each non-blank piece is handed to ``_parse_segment``. A recognized book is
    carried forward to later pieces. Pieces flagged for preservation, and
    pieces that yield no book at all, are emitted as written (stripped).

    Args:
        text: Raw field contents; falsy input yields an empty result.

    Returns:
        ``(normalized, warnings)`` where ``normalized`` joins the pieces with
        ``"; "`` and ``warnings`` is a list that this function currently
        never adds to.
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    results: List[str] = []
    carried_book: Optional[str] = None
    for raw in re.split(r"\s*;\s*", text):
        if not raw or not raw.strip():
            continue
        book, cv, preserve = _parse_segment(raw, carried_book)
        if preserve:
            # Unparseable: keep what the parser handed back, else the input.
            results.append(cv or raw.strip())
        elif book:
            results.append(f"{book} {cv}" if cv else book)
            carried_book = book
        elif raw.strip():
            results.append(raw.strip())

    joined = "; ".join(item.strip() for item in results if item.strip())
    return (re.sub(r"\s+", " ", joined).strip(" ;,"), warnings)