# Source path: Illustrations/web/core/scripture_normalizer.py
# (GitHub viewer chrome removed: line/size counts, Raw/Blame/History links,
# and the "ambiguous Unicode characters" warning — the ambiguous characters
# were the en/em dashes handled in _clean_text.)
# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# --------------------------
# Canonical book abbreviations
# --------------------------
# Keys are lowercase, period-free lookup forms (the shape _canon_key emits);
# values are canonical display abbreviations.  Every key MUST be lowercase:
# lookups are lowercased first, so mixed-case keys are unreachable.  Four such
# dead duplicates ("1 Ch", "1 Chr", "2 Ch", "2 Chr") were removed — their
# lowercase twins below cover the same inputs.
BOOK_CANON: Dict[str, str] = {
    # OT
    "genesis": "Gen.", "gen": "Gen.", "genesisesis": "Gen.",  # presumably an observed import glitch — confirm
    "exodus": "Ex.", "ex": "Ex.",
    "leviticus": "Lev.", "lev": "Lev.",
    "numbers": "Num.", "num": "Num.", "nums": "Num.",
    "deuteronomy": "Deut.", "deut": "Deut.", "deutronomy": "Deut.", "deu": "Deut.",
    "joshua": "Josh.", "josh": "Josh.",
    "judges": "Judg.", "judg": "Judg.",
    "ruth": "Ruth",
    "1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sam": "1 Sam.",
    "2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sam": "2 Sam.",
    "1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.",
    "2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.",
    "1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.",
    "2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.",
    "ezra": "Ezra",
    "nehemiah": "Neh.", "neh": "Neh.",
    "esther": "Esth.", "esth": "Esth.",
    "job": "Job",
    "psalms": "Ps.", "psalm": "Ps.", "ps": "Ps.", "pss": "Ps.",
    "proverbs": "Prov.", "prov": "Prov.",
    "ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.",
    "song of solomon": "Song", "song of songs": "Song", "song": "Song",
    "isaiah": "Isa.", "isa": "Isa.",  # IMPORTANT: direct alias so 'Isa.' never becomes '1 Sam.'
    "jeremiah": "Jer.", "jer": "Jer.",
    "lamentations": "Lam.", "lam": "Lam.",
    "ezekiel": "Ezek.", "ezek": "Ezek.",
    "daniel": "Dan.", "dan": "Dan.",
    "hosea": "Hos.", "hos": "Hos.",
    "joel": "Joel",
    "amos": "Amos",
    "obadiah": "Obad.", "obad": "Obad.",
    "jonah": "Jon.", "jon": "Jon.",
    "micah": "Mic.", "mic": "Mic.",
    "nahum": "Nah.", "nah": "Nah.",
    "habakkuk": "Hab.", "hab": "Hab.",
    "zephaniah": "Zeph.", "zeph": "Zeph.",
    "haggai": "Hag.", "hag": "Hag.",
    "zechariah": "Zech.", "zech": "Zech.",
    "malachi": "Mal.", "mal": "Mal.",
    # NT
    "matthew": "Matt.", "matt": "Matt.", "mt": "Matt.",
    "mark": "Mark", "mk": "Mark", "mrk": "Mark", "mr": "Mark",
    "luke": "Luke", "lk": "Luke",
    "john": "John", "jn": "John", "joh": "John",
    "acts": "Acts",
    "romans": "Rom.", "rom": "Rom.",
    "1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1co": "1 Cor.",
    "2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2co": "2 Cor.",
    "galatians": "Gal.", "gal": "Gal.",
    "ephesians": "Eph.", "eph": "Eph.",
    "philippians": "Phil.", "phil": "Phil.", "philippians 216": "Phil.",  # import glitch seen
    "colossians": "Col.", "col": "Col.",
    "1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.",
    "2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.",
    "1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
    "2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
    "titus": "Titus", "tit": "Titus",
    "philemon": "Philem.", "philem": "Philem.",
    "hebrews": "Heb.", "heb": "Heb.",
    "james": "Jas.", "jas": "Jas.",
    "1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.",
    "2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.",
    "1 john": "1 John", "i john": "1 John",
    "2 john": "2 John", "ii john": "2 John",
    "3 john": "3 John", "iii john": "3 John",
    "jude": "Jude",
    "revelation": "Rev.", "rev": "Rev.",
    # Chronicles catch-alls (moved off the 'revelation' line for readability)
    "1 ch": "1 Chron.", "1 chr": "1 Chron.",
    "2 ch": "2 Chron.", "2 chr": "2 Chron.",
    "ch": "Chron.", "chr": "Chron.",
}
# Short bare tokens that must *never* be treated as “roman numeral + book”
# (prevents 'Isa.' => '1 Sam.' and similar).
# Entries are the lowercase, period-free forms produced by _canon_key;
# _canon_book short-circuits past its numeral heuristics for any of these.
NEVER_PREFIX_NUMERAL = {"isa", "isaiah", "job", "joel", "amos", "nah", "hag", "mal", "rom", "gal", "eph", "tit", "heb", "jas", "jude"}
def _canon_key(s: str) -> str:
s = (s or "").strip().lower()
s = s.replace(".", " ")
s = re.sub(r"\s+", " ", s)
return s
def _direct_lookup(key: str) -> Optional[str]:
    """Resolve *key* against BOOK_CANON: first verbatim, then with all
    internal spaces removed (so 'i samuel' and '1co' style aliases both hit).
    Returns None when neither form is known."""
    for candidate in (key, key.replace(" ", "")):
        hit = BOOK_CANON.get(candidate)
        if hit is not None:
            return hit
    return None
# Split helpers
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
def _clean_text(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = CRUFT_RE.sub("", s)
s = s.replace("", "-").replace("", "-")
s = re.sub(r"\s+", " ", s)
return s.strip(" ;,.\t\r\n ")
BOOK_PREFIX_RE = re.compile(
r"""
^\s*
(?:(?P<num>[1-3]|i{1,3})\s*)? # optional leading number (1/2/3 or i/ii/iii)
(?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2}) # 1-3 words for the book
""",
re.X,
)
C_V_RE = re.compile(r"(?P<ch>\d+)(?::(?P<vs>[\d,\-\s]+))?")
def _canon_book(book_raw: str, num: Optional[str]) -> Optional[str]:
    """
    Decide the canonical book abbreviation.

    Rules:
    1) Always try direct alias matches first (with/without spaces).
    2) If the *book* piece looks like a short token that could be misread as a
       roman numeral + another book (e.g., 'Isa'), *never* do numeral heuristics.
    3) Otherwise, allow '1/2/3 + co/cor/ti/tim/pet/peter/jn/john' style heuristics.

    Returns None when the token cannot be resolved.
    """
    key = _canon_key(book_raw)
    # Step 1: direct lookups (verbatim and space-free forms)
    direct = _direct_lookup(key)
    if direct:
        return direct
    # Step 2 guard: tokens like 'isa' must not be split into 'i' + 'sa'.
    # Step 1 already tried the space-free form, so nothing else can resolve
    # here — just refuse the numeral heuristics.
    if key.replace(" ", "") in NEVER_PREFIX_NUMERAL:
        return None
    # Step 3: numeral + short token combos
    if num:
        n = {"i": "1", "ii": "2", "iii": "3"}.get(num.lower(), num)
        # BUG FIX: callers pass 'num book' as book_raw, so the numeral token
        # must be stripped before matching the numberless short targets below.
        # Previously short was e.g. '1pe', which could never match 'pe' —
        # the whole heuristic table was dead code.
        short = re.sub(r"^(?:[1-3]|i{1,3})\s+", "", key).replace(" ", "")
        # common short targets where the leading number matters
        map_num = {
            "co": "Cor.", "cor": "Cor.",
            "ti": "Tim.", "tim": "Tim.",
            "pe": "Pet.", "pet": "Pet.", "peter": "Pet.",
            "jo": "John", "jn": "John", "joh": "John", "john": "John",
            "sa": "Sam.", "sam": "Sam.", "samuel": "Sam.",
            "ki": "Ki.", "kgs": "Ki.", "kings": "Ki.",
            "chronicles": "Chron.", "chron": "Chron.",
            "thessalonians": "Thess.", "thess": "Thess.",
        }
        if short in map_num:
            return f"{n} {map_num[short]}"
    # Fallback: unresolvable
    return None
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    """
    Parse one semicolon-delimited piece.

    Returns (book_canon, cv, preserve_original_if_unparsed).

    BUG FIX: *last_book* was accepted but never used.  A piece that is purely
    chapter[:verses] (e.g. '4:6' following 'John 3:16') now inherits the
    previous book, as the original in-code comment described; only a *full*
    c:v match inherits, so pieces with extra words are still preserved
    verbatim.
    """
    raw = _clean_text(seg)
    if not raw:
        return (None, None, False)
    m = BOOK_PREFIX_RE.match(raw)
    book_part: Optional[str] = None
    num: Optional[str] = None
    rest = raw
    if m:
        # What the regex thinks is a numeric prefix and a book word blob:
        num = (m.group("num") or "").strip() or None
        book_text = (m.group("book") or "").strip()
        rest = raw[m.end():].strip(" :,;.")
        # Try to resolve the book using robust rules
        book_part = _canon_book(book_text if not num else f"{num} {book_text}", num=num)
        # If the regex split out a bogus 'i' from something like "Isa.",
        # retry the bare token without the numeral.
        if not book_part:
            book_part = _canon_book(book_text, None)
    if not book_part:
        compact = rest.replace(" ", "")
        # A pure chapter[:verses] piece inherits the previous book.
        if last_book and C_V_RE.fullmatch(compact):
            return (last_book, compact, False)
        # Verse-looking text we cannot link to a book: preserve as-is.
        if C_V_RE.search(rest):
            return (None, None, True)
        # Or the whole thing might already be a known book alone:
        whole = _canon_book(raw, None)
        if whole:
            return (whole, None, False)
        return (None, None, True)
    # normalize chapter/verse
    rest = rest.replace(" ", "")
    if not rest:
        return (book_part, None, False)
    if C_V_RE.search(rest):
        return (book_part, rest, False)
    # not a recognizable cv -> keep original piece untouched
    return (None, None, True)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """
    Normalize a whole scripture_raw string.

    Returns (normalized_text, warnings).  Segments are split on semicolons;
    unknown/unparseable chunks are preserved verbatim, and a chapter/verse
    whose book cannot be determined yields a warning instead of output.
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    out: List[str] = []
    last_book: Optional[str] = None
    for piece in re.split(r"\s*;\s*", text):
        if not (piece and piece.strip()):
            continue
        book, cv, preserve = _parse_segment(piece, last_book)
        if preserve:
            # Unparseable: keep the original chunk (lightly cleaned).
            out.append(_clean_text(piece))
        elif book is None:
            if cv:
                warnings.append(f"Missing book for '{piece.strip()}'")
        else:
            out.append(f"{book} {cv}" if cv else book)
            last_book = book

    norm = "; ".join(s for s in (o.strip(" ;,") for o in out) if s)
    norm = re.sub(r"\s+", " ", norm).strip(" ;,")
    return (norm, warnings)