# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
# =========================
# Canonical book abbreviations
# =========================
# House style (edit as you like). Keys are normalized to lower-case without
# periods before lookup, so keys that still contain a "." (e.g. "eph.", "ti.")
# can only be hit by direct dict access, never by the normal lookup path.
# Fix: duplicate literal keys ("1 corinthians", "2 corinthians", "1 timothy",
# "2 timothy" each appeared twice with identical values) have been removed;
# the resulting dict is unchanged at runtime.
BOOK_CANON: Dict[str, str] = {
    # ----- OT -----
    # NOTE: "genesisesis" below looks like a deliberate typo-tolerance entry.
    "genesis": "Gen.", "gen": "Gen.", "ge": "Gen.", "gn": "Gen.", "genesisesis": "Gen.",
    "exodus": "Ex.", "ex": "Ex.", "exo": "Ex.",
    "leviticus": "Lev.", "lev": "Lev.", "le": "Lev.",
    "numbers": "Num.", "num": "Num.", "nu": "Num.", "nums": "Num.",
    "deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "dt": "Deut.", "deutronomy": "Deut.",
    "joshua": "Josh.", "josh": "Josh.", "jos": "Josh.",
    "judges": "Judg.", "judg": "Judg.", "jdg": "Judg.",
    "ruth": "Ruth", "ru": "Ruth",
    "1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sam": "1 Sam.", "1sa": "1 Sam.",
    "2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sam": "2 Sam.", "2sa": "2 Sam.",
    "1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.", "1kings": "1 Ki.", "1ki": "1 Ki.",
    "2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.", "2kings": "2 Ki.", "2ki": "2 Ki.",
    "1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.", "1ch": "1 Chron.", "1chron": "1 Chron.",
    "2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.", "2ch": "2 Chron.", "2chron": "2 Chron.",
    "ezra": "Ezra", "ezr": "Ezra",
    "nehemiah": "Neh.", "neh": "Neh.",
    "esther": "Esth.", "esth": "Esth.", "es": "Esth.",
    "job": "Job",
    "psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.", "psm": "Ps.", "pss": "Ps.",
    "proverbs": "Prov.", "prov": "Prov.", "pr": "Prov.",
    "ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.", "ecc": "Eccl.", "ec": "Eccl.",
    "song of solomon": "Song", "song of songs": "Song", "song": "Song", "so": "Song", "sos": "Song",
    "isaiah": "Isa.", "isa": "Isa.", "is": "Isa.",
    "jeremiah": "Jer.", "jer": "Jer.", "je": "Jer.",
    "lamentations": "Lam.", "lam": "Lam.", "la": "Lam.",
    "ezekiel": "Ezek.", "ezek": "Ezek.", "eze": "Ezek.", "ezk": "Ezek.",
    "daniel": "Dan.", "dan": "Dan.", "da": "Dan.",
    "hosea": "Hos.", "hos": "Hos.", "ho": "Hos.",
    "joel": "Joel", "joe": "Joel", "jl": "Joel",
    "amos": "Amos", "am": "Amos",
    "obadiah": "Obad.", "obad": "Obad.", "ob": "Obad.",
    "jonah": "Jon.", "jon": "Jon.",
    "micah": "Mic.", "mic": "Mic.",
    "nahum": "Nah.", "nah": "Nah.",
    "habakkuk": "Hab.", "hab": "Hab.",
    "zephaniah": "Zeph.", "zeph": "Zeph.", "zep": "Zeph.",
    "haggai": "Hag.", "hag": "Hag.",
    "zechariah": "Zech.", "zech": "Zech.", "zec": "Zech.",
    "malachi": "Mal.", "mal": "Mal.",
    # ----- NT -----
    "matthew": "Matt.", "matt": "Matt.", "mt": "Matt.", "mat": "Matt.",
    "mark": "Mark", "mrk": "Mark", "mk": "Mark", "mr": "Mark",
    "luke": "Luke", "lk": "Luke",
    "john": "John", "jn": "John", "jo": "John", "joh": "John",
    "acts": "Acts", "act": "Acts", "ac": "Acts",
    "romans": "Rom.", "rom": "Rom.", "ro": "Rom.", "rm": "Rom.",
    "1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1cor": "1 Cor.", "1co": "1 Cor.",
    "2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2cor": "2 Cor.", "2co": "2 Cor.",
    "galatians": "Gal.", "gal": "Gal.", "ga": "Gal.",
    "ephesians": "Eph.", "eph": "Eph.", "eph.": "Eph.",
    # NOTE: "philippians216" looks like a pasted-reference artifact kept for tolerance.
    "philippians": "Phil.", "phil": "Phil.", "php": "Phil.", "phi": "Phil.", "philippians216": "Phil.",
    "colossians": "Col.", "col": "Col.",
    "1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.", "1 th": "1 Thess.", "1thess": "1 Thess.",
    "2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.", "2 th": "2 Thess.", "2thess": "2 Thess.",
    "1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
    "2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
    "titus": "Titus", "tit": "Titus", "ti.": "Titus",
    "philemon": "Philem.", "philem": "Philem.", "phm": "Philem.",
    "hebrews": "Heb.", "heb": "Heb.",
    "james": "Jas.", "jas": "Jas.", "jam": "Jas.", "jms": "Jas.",
    "1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.", "1pe": "1 Pet.",
    "2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.", "2pe": "2 Pet.",
    "1 john": "1 John", "i john": "1 John", "1jn": "1 John", "1 jo": "1 John",
    "2 john": "2 John", "ii john": "2 John", "2jn": "2 John", "2 jo": "2 John",
    "3 john": "3 John", "iii john": "3 John", "3jn": "3 John", "3 jo": "3 John",
    "jude": "Jude", "jud": "Jude",
    "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
}
# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
def _variants() -> Dict[str, str]:
    """Build the expanded lookup map: BOOK_CANON plus generated spellings of
    the numbered books ("1 sam", "1sam", "1sa", ...) and common shorthands."""
    expanded = dict(BOOK_CANON)
    numbered_books = (
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("thessalonians", "Thess."),
        ("timothy", "Tim."), ("peter", "Pet."), ("john", "John"),
    )
    for token, arabic in (("1", "1"), ("i", "1"), ("2", "2"),
                          ("ii", "2"), ("3", "3"), ("iii", "3")):
        for full_name, abbr in numbered_books:
            canonical = f"{arabic} {abbr}"
            expanded[f"{token} {full_name}"] = canonical      # "1 samuel"
            expanded[f"{token}{full_name}"] = canonical       # "1samuel"
            # ultra short no-space combos like "1co", "2ki"
            expanded[f"{token}{full_name[:2]}"] = canonical
    # Very common shorthands that users type (applied last, so they win).
    expanded.update({
        "ps": "Ps.",
        "prov": "Prov.",
        "eccles": "Eccl.",
        "deut.": "Deut.",
        "mt": "Matt.",
        "mk": "Mark",
        "lk": "Luke",
        "jn": "John",
        "ti": "Tim.",  # used only with a leading number, handled below
        "co": "Cor.",  # used only with a leading number, handled below
    })
    return expanded
# Expanded lookup table built once at import time: BOOK_CANON plus generated
# numbered-book variants and shorthand spellings.
BOOK_MAP = _variants()
# Words to strip like "Read", "chapter"
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
BOOK_RE = re.compile(
r"""
^\s*
(?:
(?P<num>[1-3]|i{1,3})\s* # optional 1/2/3 or roman
)?
\s*
(?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2}) # book words (1-3 words)
""",
re.X,
)
# chapter/verse piece like "4:6,7-9" or "21"
C_V_RE = re.compile(
r"""
(?:
(?P<ch>\d+)
(?:
:(?P<vs>[\d,\-\u2013\u2014\s]+)
)?
)
""",
re.X,
)
def _clean_text(s: str) -> str:
s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
s = CRUFT_RE.sub("", s)
s = s.replace("..", ".").replace("", "-").replace("", "-")
s = re.sub(r"\s{2,}", " ", s)
return s.strip(" ;,.\t\r\n ")
def _canon_key(raw: str) -> str:
key = raw.lower().strip()
key = key.replace(".", "")
key = re.sub(r"\s+", " ", key)
return key
def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """
    Map a raw book token (optionally carrying a leading 1/2/3 or roman
    numeral) to our canonical abbreviation; return None if unrecognized.

    book_raw: raw book text, which may include the leading number ("1 cor").
    num: the separately parsed leading number token ("1".."3", "i".."iii").

    Fixes over the original:
    - The no-space fallback pattern was malformed ("(?P(rest>...)"), raising
      re.error whenever an unknown key reached it; now "(?P<rest>...)".
    - The "co"/"ti" bias never fired because callers pass the number inside
      book_raw ("1 co"); the leading number is now stripped before comparing,
      and roman numerals are normalized ("i co" -> "1 Cor.", not "i Cor.").
    """
    key = _canon_key(book_raw)
    # If the token was like "co" or "ti" and a number exists, bias to Cor./Tim.
    if num:
        arabic = {"i": "1", "ii": "2", "iii": "3"}.get(num.lower(), num)
        bare = re.sub(r"^(?:[1-3]|i{1,3})\s*", "", key)
        if bare in ("co", "cor"):
            return f"{arabic} Cor."
        if bare in ("ti", "tim"):
            return f"{arabic} Tim."
    # Direct lookup in our expanded map.
    if key in BOOK_MAP:
        return BOOK_MAP[key]
    # Sometimes users omit the space: "1co", "2ki".
    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
    if m:
        # Normalize roman numerals to arabic.
        n = {"i": "1", "ii": "2", "iii": "3"}.get(m.group("n"), m.group("n"))
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
            # If the mapped abbreviation already carries a number, replace it.
            if base[0] in "123":
                return f"{n} {base.split(' ', 1)[1]}"
            return f"{n} {base}"
    # Last chance: try the core canon directly.
    return BOOK_CANON.get(key)
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
    """
    Parse one semicolon-delimited segment of a scripture reference.

    Returns (book_canon, cv_string, preserve_raw):
    - preserve_raw True means cv_string holds the original (stripped) segment,
      to be kept verbatim because no book could be recognized.

    Fix: the original normalized dashes with ``cv.replace("", "-")`` (the dash
    literals were lost to an encoding mishap); replacing the empty string
    inserts a hyphen between every character (e.g. "4:6" -> "-4-:-6-").
    En dash (U+2013) and em dash (U+2014) are now replaced explicitly.
    """
    original = seg.strip()
    s = _clean_text(seg)
    if not s:
        return (None, None, False)
    # Try to detect a leading book token.
    m = BOOK_RE.match(s)
    book = None
    rest = s
    if m:
        num = (m.group("num") or "").strip()
        raw_book = (m.group("book") or "").strip()
        raw_joined = f"{num} {raw_book}".strip()
        canon = _canon_book(raw_joined or raw_book, num=num or None)
        if canon:
            book = canon
            rest = s[m.end():].strip(",;: .")
        else:
            # Not recognized as a book -> keep whole thing verbatim.
            return (None, original, True)
    if not book:
        # Inherit previous recognized book if we have one (e.g. "Gen 1; 2:4").
        if last_book:
            book = last_book
        else:
            # There is no book context — keep segment verbatim to avoid data loss.
            return (None, original, True)
    # Normalize the chapter/verse part.
    rest = re.sub(r"\s+", "", rest)
    rest = re.sub(r":\s*", ":", rest)  # no-op after the line above; kept as a safeguard
    if not rest:
        return (book, None, False)
    if C_V_RE.search(rest):
        cv = rest.replace(" ", "")
    else:
        m2 = re.search(r"\d+(?::[\d,\-]+)?", rest)
        cv = m2.group(0).replace(" ", "") if m2 else None
    if not cv:
        # Nothing parseable after a valid book; just keep the book.
        return (book, None, False)
    # Normalize dashes and commas.
    cv = cv.replace("\u2013", "-").replace("\u2014", "-")
    cv = re.sub(r",\s*", ",", cv)
    return (book, cv, False)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """
    Normalize an entire scripture_raw string.

    Unknown pieces are preserved verbatim; recognized pieces are standardized
    and each segment repeats its book. Returns (normalized_text, warnings).
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    segments = [s for s in re.split(r"\s*;\s*", text) if s and s.strip()]
    normalized: List[str] = []
    current_book: Optional[str] = None

    for segment in segments:
        book, cv, keep_verbatim = _parse_segment(segment, current_book)
        if keep_verbatim:
            # Unrecognized book: keep the raw text; do not update book context.
            normalized.append(cv or segment.strip())
        elif book:
            # Recognized book, with or without a chapter/verse part.
            normalized.append(f"{book} {cv}" if cv else book)
            current_book = book
        elif segment.strip():
            # No book and not flagged for preservation — keep the original
            # text as a final safeguard against data loss.
            normalized.append(segment.strip())

    joined = "; ".join(part.strip() for part in normalized if part.strip())
    return (re.sub(r"\s+", " ", joined).strip(" ;,"), warnings)