# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple
# -----------------------------
# Canonical book abbreviations
# -----------------------------
BOOK_CANON: Dict[str, str] = {
# OT
"genesis": "Gen.", "gen": "Gen.", "genesisesis": "Gen.",
"exodus": "Ex.", "ex": "Ex.",
"leviticus": "Lev.", "lev": "Lev.",
"numbers": "Num.", "num": "Num.", "nums": "Num.",
"deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "deutronomy": "Deut.", "deut.": "Deut.",
"joshua": "Josh.", "josh": "Josh.",
"judges": "Judg.", "judg": "Judg.",
"ruth": "Ruth",
"1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.",
"2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.",
"1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.",
"2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.",
"1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.",
"2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.",
    # short "Ch/Chr" forms (plus numbered)
"ch": "Chron.", "chr": "Chron.",
"1 ch": "1 Chron.", "1 chr": "1 Chron.",
"2 ch": "2 Chron.", "2 chr": "2 Chron.",
"ezra": "Ezra",
"nehemiah": "Neh.", "neh": "Neh.",
"esther": "Esth.", "esth": "Esth.",
"job": "Job",
"psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.",
"proverbs": "Prov.", "prov": "Prov.",
"ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.",
"song of solomon": "Song", "song of songs": "Song", "song": "Song",
"isaiah": "Isa.", "isa": "Isa.",
"jeremiah": "Jer.", "jer": "Jer.", "jer.": "Jer.",
"lamentations": "Lam.", "lam": "Lam.",
"ezekiel": "Ezek.", "ezek": "Ezek.",
"daniel": "Dan.", "dan": "Dan.",
"hosea": "Hos.", "hos": "Hos.",
"joel": "Joel",
"amos": "Amos",
"obadiah": "Obad.", "obad": "Obad.",
"jonah": "Jon.", "jon": "Jon.",
"micah": "Mic.", "mic": "Mic.",
"nahum": "Nah.", "nah": "Nah.",
"habakkuk": "Hab.", "hab": "Hab.",
"zephaniah": "Zeph.", "zeph": "Zeph.",
"haggai": "Hag.", "hag": "Hag.",
"zechariah": "Zech.", "zech": "Zech.",
"malachi": "Mal.", "mal": "Mal.",
# NT
"matthew": "Matt.", "matt": "Matt.", "mt": "Matt.",
"mark": "Mark", "mk": "Mark.",
"luke": "Luke", "lk": "Luke",
"john": "John", "jn": "John",
"acts": "Acts",
"romans": "Rom.", "rom": "Rom.",
"1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1 cor.": "1 Cor.",
"2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2 cor.": "2 Cor.",
"galatians": "Gal.", "gal": "Gal.",
"ephesians": "Eph.", "eph": "Eph.", "eph.": "Eph.",
"philippians": "Phil.", "phil": "Phil.", "philippians 216": "Phil.",
"colossians": "Col.", "col": "Col.",
"1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.",
"2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.",
"1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.",
"2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.",
"titus": "Titus",
"philemon": "Philem.", "philem": "Philem.",
"hebrews": "Heb.", "heb": "Heb.",
"james": "Jas.", "jas": "Jas.",
"1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.",
"2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.",
"1 john": "1 John", "i john": "1 John",
"2 john": "2 John", "ii john": "2 John",
"3 john": "3 John", "iii john": "3 John",
"jude": "Jude",
"revelation": "Rev.", "rev": "Rev.",
}
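# Note: raw user text never hits this table directly; _canon_book below lowercases
# the token and treats "." as a space before looking it up, so e.g. "GEN" and "gen"
# both resolve to "Gen.".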
# For ultra-compact "1pe" / "2co" / "1ti" / "2th" etc.
NUMBERED_SHORT = {
"sa": "Sam.", "sam": "Sam.",
"ki": "Ki.", "kgs": "Ki.", "kg": "Ki.",
"ch": "Chron.", "chr": "Chron.",
"co": "Cor.", "cor": "Cor.",
"th": "Thess.", "ths": "Thess.",
"ti": "Tim.", "tim": "Tim.",
"pe": "Pet.", "pet": "Pet.",
"jn": "John", "jo": "John", "john": "John",
}
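# Illustrative (hand-traced): _canon_book below pairs these shorts with a leading
# ordinal, so "1pe" -> "1 Pet.", "2co" -> "2 Cor.", "1ti" -> "1 Tim.".  Shorts not
# listed here fail the lookup and _canon_book returns None.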
# strip cruft words like "Read", "chapter"
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
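# Illustrative: _clean_text strips tokens such as "Read", "see", "chapter", "chap."
# and a bare "ch"/"ch." before any book matching.  A spaced "1 ch 7:14" therefore
# loses its "ch" here, while the tight "1ch 7:14" and "chr" forms still reach
# _canon_book.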
# book-like prefix (optional leading number)
BOOK_RE = re.compile(
r"""
^\s*
(?:(?P<num>[1-3]|i{1,3})\s*)? # 1/2/3 or i/ii/iii
(?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
\s*
""",
re.X,
)
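# Illustrative matches for BOOK_RE (hand-traced):
#   "2 Timothy 1:7"        -> num="2",  book="Timothy"
#   "Song of Solomon 2:1"  -> num=None, book="Song of Solomon"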
# chapter/verse piece
C_V_RE = re.compile(
r"""
(?:
(?P<ch>\d+)
(?::(?P<vs>[\d,\-\u2013\u2014\s]+))?
)
""",
re.X,
)
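# Illustrative matches for C_V_RE (chapter plus optional verse list/range):
#   "3"       -> ch="3", vs=None
#   "3:16"    -> ch="3", vs="16"
#   "3:16-18" -> ch="3", vs="16-18"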
def _clean_text(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = CRUFT_RE.sub("", s)
s = s.replace("..", ".").replace("", "-").replace("", "-")
s = re.sub(r"\s+", " ", s)
return s.strip(" ;,.\t\r\n ")
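# Example behaviour of _clean_text (hand-traced sketch, not a formal contract):
#   _clean_text("Read John\xa03:16;") -> "John 3:16"
#   _clean_text("see ch. 5")          -> "5"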
def _canon_book(book_raw: str, num: str | None = None) -> str | None:
"""
Normalize a book token (with or without a numeric prefix).
Handles tight forms like '1pe', '2co', '1ch', etc.
"""
if not book_raw and not num:
return None
# Build a normalized key with number + book words
raw = ((num or "") + " " + (book_raw or "")).strip()
key = raw.lower().replace(".", " ")
    key = re.sub(r"\s+", " ", key).strip()  # the "." -> " " above can leave a trailing space
# If it's like '1pe' (no space), insert a space
key = re.sub(r"^([1-3]|i{1,3})([a-z])", r"\1 \2", key)
# Try exact phrase first (e.g., "1 peter", "2 cor")
if key in BOOK_CANON:
return BOOK_CANON[key]
# Try without number (e.g., "peter", "cor")
parts = key.split(" ", 1)
if len(parts) == 2 and parts[1] in BOOK_CANON and parts[0] in {"1","2","3","i","ii","iii"}:
canon_no_num = BOOK_CANON[parts[1]]
# replace leading number in canon if present (for John we keep "1 John")
if canon_no_num.startswith(("1 ", "2 ", "3 ")):
# if canon is already numbered (like "1 John"), prefer that
return canon_no_num
lead = parts[0]
lead = {"i":"1","ii":"2","iii":"3"}.get(lead, lead)
return f"{lead} {canon_no_num}"
# Handle compact numbered shorts like '1 pe', '2 co', '1 ch', '2 th'
m = re.match(r"^(?P<n>[1-3]|i{1,3})\s*(?P<s>[a-z]{1,4})$", key)
if m:
n = m.group("n")
s = m.group("s")
n = {"i":"1","ii":"2","iii":"3"}.get(n, n)
if s in NUMBERED_SHORT:
return f"{n} {NUMBERED_SHORT[s]}"
# Lastly try pure book without number
if key in BOOK_CANON:
return BOOK_CANON[key]
return None
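# Illustrative results for _canon_book (hand-traced against the tables above):
#   _canon_book("peter", num="1") -> "1 Pet."   # exact phrase "1 peter"
#   _canon_book("1co")            -> "1 Cor."   # tight form split, then NUMBERED_SHORT
#   _canon_book("xyz")            -> None       # unknown books are never guessed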
def _parse_segment(seg: str, last_book: str | None) -> Tuple[str | None, str | None, bool]:
"""
Parse one semicolon-delimited segment.
Returns (book_canon, cv_string, preserve_original_if_unknown)
"""
original = seg
s = _clean_text(seg)
if not s:
return (None, None, False)
# Detect a leading number stuck to letters like "1co", "2pe"
m_tight = re.match(r"^\s*(?P<num>[1-3]|i{1,3})\s*(?P<letters>[a-z]{1,4})\b\.?", s, flags=re.I)
if m_tight:
num = m_tight.group("num")
letters = m_tight.group("letters")
canon = _canon_book(letters, num=num)
if canon:
rest = s[m_tight.end():].strip(",;: .")
book = canon
else:
# not recognized—keep whole piece verbatim
return (None, original.strip(), True)
else:
# General book matcher
m = BOOK_RE.match(s)
book = None
rest = s
if m:
num = m.group("num")
raw_book = (m.group("book") or "").strip()
canon = _canon_book(raw_book, num=num or None)
if canon:
book = canon
rest = s[m.end():].strip(",;: .")
else:
# Not a recognized book: if we already have a last_book, treat
# this as CV only; otherwise preserve the original piece.
if last_book:
book = last_book
else:
return (None, original.strip(), True)
else:
            # No obvious book: inherit the last one if we can, else preserve the original
if last_book:
book = last_book
else:
return (None, original.strip(), True)
# Normalize chapter/verse part
rest = rest.replace(" ", "")
rest = re.sub(r":\s+", ":", rest)
if not rest:
cv = None
else:
if C_V_RE.search(rest):
            cv = rest.replace(" ", "").replace("\u2013", "-").replace("\u2014", "-")  # en/em dash -> hyphen
else:
m2 = re.search(r"\d+(?::[\d,\-]+)?", rest)
cv = m2.group(0).replace(" ", "") if m2 else None
return (book, cv, False)
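# Illustrative segment parses (hand-traced sketch):
#   _parse_segment("2co 4:7", None)    -> ("2 Cor.", "4:7", False)
#   _parse_segment("13:4-7", "1 Cor.") -> ("1 Cor.", "13:4-7", False)  # book inherited
#   _parse_segment("foobar 9", None)   -> (None, "foobar 9", True)     # kept verbatim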
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
"""
Normalize a scripture_raw string.
Returns (normalized_text, warnings).
Unknown segments are preserved as-is and reported in warnings.
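
    Example (illustrative, hand-traced):
        normalize_scripture_field("Read John 3:16; 1 Cor 13:4-7")
        -> ("John 3:16; 1 Cor. 13:4-7", [])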
"""
warnings: List[str] = []
if not text:
return ("", warnings)
pieces = [p for p in re.split(r"\s*;\s*", text) if p and p.strip()]
out: List[str] = []
last_book: str | None = None
for piece in pieces:
book, cv, preserve = _parse_segment(piece, last_book)
if preserve:
out.append(piece.strip())
warnings.append(f"Unrecognized segment kept as-is: '{piece.strip()}'")
continue
if not book and not cv:
continue
if book and not cv:
out.append(book)
last_book = book
continue
if not book and cv:
# Shouldn't really happen now; keep as-is
out.append(piece.strip())
warnings.append(f"Missing book for '{piece.strip()}'")
continue
out.append(f"{book} {cv}")
last_book = book
norm = "; ".join(o.strip() for o in out if o.strip())
norm = re.sub(r"\s+", " ", norm).strip(" ;,")
return (norm, warnings)
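

# Minimal usage sketch (assumption: running the module directly for a quick check is
# acceptable); prints the normalized form and any warnings for a few sample inputs.
if __name__ == "__main__":
    samples = [
        "Read John 3:16; 1 Cor 13:4-7",  # cruft word + full and abbreviated books
        "1pe 2:9; 2co 4:7",              # ultra-compact numbered forms
        "Deutronomy 6:4; foobar 9",      # common misspelling + unknown segment
    ]
    for raw in samples:
        norm, warns = normalize_scripture_field(raw)
        print(f"{raw!r} -> {norm!r}")
        for w in warns:
            print("  warning:", w)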