# Source path: Illustrations/web/core/scripture_normalizer.py
# (250 lines, 9.8 KiB, Python)
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional the warning can be ignored; otherwise reveal the
# characters in the viewer and review them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple
# --- Book map (common full names + abbreviations -> canonical abbr) ---
# Maps a lower-case book spelling (full name, abbreviation, or known import
# typo) to the canonical abbreviation used in normalized output.
# Some keys carry a trailing period; _canon_book() strips periods before it
# looks a name up, so those keys only matter to code that queries this dict
# directly.
BOOK_CANON = {
    # OT
    "genesis": "Gen.", "gen": "Gen.", "genesisesis": "Gen.", "genesis sis": "Gen.", "ge": "Gen.",
    "exodus": "Ex.", "ex": "Ex.", "exo": "Ex.", "exod": "Ex.", "exodus.": "Ex.",
    "leviticus": "Lev.", "lev": "Lev.", "le": "Lev.",
    "numbers": "Num.", "num": "Num.", "nums": "Num.", "nu": "Num.",
    "deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "de": "Deut.", "deutronomy": "Deut.", "deut.": "Deut.",
    "joshua": "Josh.", "josh": "Josh.", "jos": "Josh.",
    # (a second, duplicate "judg" entry was removed here; the value is unchanged)
    "judges": "Judg.", "judg": "Judg.", "judg.": "Judg.",
    "ruth": "Ruth",
    "1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.", "1sa": "1 Sam.",
    "2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.", "2sa": "2 Sam.",
    "1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.", "1ki": "1 Ki.",
    "2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.", "2ki": "2 Ki.",
    "1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.", "1 ch": "1 Chron.", "1ch": "1 Chron.",
    "2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.", "2 ch": "2 Chron.", "2ch": "2 Chron.",
    "ezra": "Ezra",
    "nehemiah": "Neh.", "neh": "Neh.", "ne": "Neh.",
    "esther": "Esth.", "esth": "Esth.",
    "job": "Job",
    "psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.", "ps.": "Ps.", "pss": "Ps.",
    "proverbs": "Prov.", "prov": "Prov.", "prov.": "Prov.", "pr": "Prov.",
    "ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.", "ecclesiastes.": "Eccl.", "ecc": "Eccl.", "ec": "Eccl.",
    "song of solomon": "Song", "song of songs": "Song", "song": "Song",
    "isaiah": "Isa.", "isa": "Isa.", "is": "Isa.",
    "jeremiah": "Jer.", "jer": "Jer.", "jer.": "Jer.", "je": "Jer.",
    "lamentations": "Lam.", "lam": "Lam.", "la": "Lam.",
    "ezekiel": "Ezek.", "ezek": "Ezek.", "eze": "Ezek.",
    "daniel": "Dan.", "dan": "Dan.", "da": "Dan.",
    "hosea": "Hos.", "hos": "Hos.", "ho": "Hos.",
    "joel": "Joel",
    "amos": "Amos",
    "obadiah": "Obad.", "obad": "Obad.",
    "jonah": "Jon.", "jon": "Jon.",
    "micah": "Mic.", "mic": "Mic.",
    "nahum": "Nah.", "nah": "Nah.", "na": "Nah.",
    "habakkuk": "Hab.", "hab": "Hab.",
    "zephaniah": "Zeph.", "zeph": "Zeph.",
    "haggai": "Hag.", "hag": "Hag.",
    "zechariah": "Zech.", "zech": "Zech.", "zec": "Zech.",
    "malachi": "Mal.", "mal": "Mal.",
    # NT
    "matthew": "Matt.", "matt": "Matt.", "mt": "Matt.", "mt.": "Matt.",
    "mark": "Mark", "mk": "Mark", "mr": "Mark", "mr.": "Mark",
    "luke": "Luke", "lk": "Luke", "lu": "Luke",
    "john": "John", "jn": "John", "joh": "John", "jno": "John", "jo": "John",
    "acts": "Acts", "ac": "Acts",
    "romans": "Rom.", "rom": "Rom.", "ro": "Rom.",
    "1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1 cor.": "1 Cor.", "1 co": "1 Cor.", "1co": "1 Cor.",
    "2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2 cor.": "2 Cor.", "2 co": "2 Cor.", "2co": "2 Cor.",
    # A bare "co" (no leading number) is used in the data for Colossians;
    # keep that mapping here:
    "co": "Col.",
    "galatians": "Gal.", "gal": "Gal.", "ga": "Gal.",
    "ephesians": "Eph.", "eph": "Eph.", "ep": "Eph.", "eph.": "Eph.",
    "philippians": "Phil.", "phil": "Phil.", "php": "Phil.", "philippians 216": "Phil.",  # import glitch
    "colossians": "Col.", "col": "Col.",
    "1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.", "1th": "1 Thess.",
    "2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.", "2th": "2 Thess.",
    "1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.", "1ti": "1 Tim.",
    "2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.", "2ti": "2 Tim.",
    "ti": "Tim.",  # context handles numbered elsewhere
    "titus": "Titus", "tit": "Titus",
    "philemon": "Philem.", "philem": "Philem.", "phm": "Philem.",
    "hebrews": "Heb.", "heb": "Heb.", "he": "Heb.",
    "james": "Jas.", "jas": "Jas.", "jam": "Jas.",
    "1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.", "1pe": "1 Pet.",
    "2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.", "2pe": "2 Pet.",
    "1 john": "1 John", "i john": "1 John", "1 jn": "1 John", "1 jo": "1 John", "1jo": "1 John",
    "2 john": "2 John", "ii john": "2 John", "2 jn": "2 John", "2jo": "2 John",
    "3 john": "3 John", "iii john": "3 John", "3 jn": "3 John", "3jo": "3 John",
    "jude": "Jude",
    "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
}
def _variants() -> Dict[str, str]:
    """
    Build the lookup table used to resolve book names.

    Starts from a copy of BOOK_CANON (which is left untouched) and layers on:
      * "<n> <full name>" entries for the numbered books, where <n> is an
        arabic digit or a lower-case roman numeral ("1 samuel", "ii kings");
      * glued short forms with no space ("1ki", "iich", "2co", "iiijo");
      * a handful of extra shorthands and known misspellings.
    """
    table = dict(BOOK_CANON)

    numbered_books = (
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("thessalonians", "Thess."),
        ("timothy", "Tim."), ("peter", "Pet."), ("john", "John"),
    )
    # very short glued forms used in the data (no space between number and stem)
    glued_stems = (("ki", "Ki."), ("ch", "Chron."), ("co", "Cor."), ("jo", "John"))

    for digit, prefixes in (("1", ("1", "i")), ("2", ("2", "ii"))):
        for prefix in prefixes:
            for name, abbr in numbered_books:
                table[f"{prefix} {name}"] = f"{digit} {abbr}"
            for stem, abbr in glued_stems:
                table[f"{prefix}{stem}"] = f"{digit} {abbr}"

    # Only John has a third numbered book.
    for prefix in ("3", "iii"):
        table[f"{prefix} john"] = "3 John"
        table[f"{prefix}jo"] = "3 John"

    # other common shorthands / oddities
    table.update({
        "ps": "Ps.",
        "prov": "Prov.",
        "pr": "Prov.",
        "eccles": "Eccl.",
        "ecc": "Eccl.",
        "ec": "Eccl.",
        "deut ": "Deut.",
        "deut.": "Deut.",
        "genesissis": "Gen.",
    })
    return table


# Module-level lookup table consulted by _canon_book().
BOOK_MAP = _variants()
# Filler words that carry no reference information ("Read", "see", "chapter").
# A bare "ch" is left alone when it directly follows a book number
# ("1 Ch.", "II Ch."): BOOK_CANON treats "1 ch" / "2 ch" as Chronicles, so
# deleting it there would destroy the book name.
CRUFT_RE = re.compile(
    r"\b(read|see|chap(?:ter)?|(?<![1-3]\s)(?<!\bi\s)(?<!\bii\s)ch)\b\.?",
    re.I,
)

# Book prefix pattern (handles "1 Cor.", "2 Peter", "II Samuel", "Rom.", "Psalms").
# A roman numeral only counts as the book number when whitespace follows it;
# otherwise the leading "i" of "isaiah" / "is" would be taken as a number.
# Arabic digits may be glued to the name ("1co") because no book name starts
# with a digit.
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        (?P<num>[1-3]|i{1,3}(?=\s))     # optional leading 1/2/3 or roman i/ii/iii
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})   # book words (1-3 words)
    \s*
    """,
    re.X | re.I,
)

# Chapter/verse piece like "4:6, 7-9" or "21" (chapter only).
C_V_RE = re.compile(
    r"""
    (?:
        (?P<ch>\d+)
        (?:
            :(?P<vs>[\d,\-\u2013\u2014\s]+)   # verses: lists/ranges, allow en/em dash
        )?
    )
    """,
    re.X,
)
def _clean_text(s: str) -> str:
    """
    Tidy one raw reference segment before it is parsed.

    Steps, in order:
      1. turn non-breaking / thin spaces into plain spaces;
      2. drop filler words matched by CRUFT_RE;
      3. collapse ".." to "." and replace en/em dashes with "-";
      4. collapse runs of spaces and strip surrounding punctuation/whitespace;
      5. put a space between a leading book number and a name glued to it
         ("1co" -> "1 co", "IICor" -> "II Cor").

    Returns the cleaned string (possibly empty).
    """
    for exotic_space in ("\xa0", "\u2009", "\u202f"):
        s = s.replace(exotic_space, " ")
    s = CRUFT_RE.sub("", s)
    # The dashes are written as escapes on purpose: an empty search string in
    # str.replace() inserts the replacement between every character.
    s = s.replace("..", ".").replace("\u2013", "-").replace("\u2014", "-")
    # Removing cruft words leaves double spaces behind; squeeze them.
    s = re.sub(r" {2,}", " ", s).strip(" ;,.\t\r\n")
    # Split a glued leading number from the book name. Digits are unambiguous.
    # Roman numerals are only split when written in capitals and followed by a
    # capitalised word, so "Isaiah" / "Is." are never broken into "I saiah".
    s = re.sub(
        r"^\s*(?P<num>[1-3](?=[A-Za-z])|I{1,3}(?=[A-Z][a-z]))",
        r"\g<num> ",
        s,
    )
    return s.strip()
def _canon_book(book_raw: str) -> str | None:
    """
    Resolve a raw book name to its canonical abbreviation.

    The name is lower-cased, periods are removed and whitespace is collapsed.
    The resulting key is looked up as-is first, so names that merely begin
    with "i" ("isaiah", "is") and spaced roman forms ("ii samuel") are never
    mistaken for a glued number prefix. Only when that fails is a leading
    number (1-3 or i/ii/iii, glued or spaced) split off, converted to an
    arabic digit, and retried as "<n> <name>" and "<n><name>".

    Returns the canonical abbreviation, or None if the name is empty or
    unknown.
    """
    if not book_raw:
        return None
    key = re.sub(r"\s+", " ", book_raw.lower().replace(".", "")).strip()
    if not key:
        return None

    def lookup(candidate: str) -> str | None:
        # BOOK_MAP first, then the canonical map without variants.
        return BOOK_MAP.get(candidate) or BOOK_CANON.get(candidate)

    exact = lookup(key)
    if exact:
        return exact

    # Fallback for numbered books: "1cor", "2 jo", "ii cor", "iijohn", ...
    numbered = re.match(r"(?P<num>[1-3]|i{1,3})\s*(?P<name>[a-z].*)", key)
    if numbered is None:
        return None
    roman_to_arabic = {"i": "1", "ii": "2", "iii": "3"}
    num = roman_to_arabic.get(numbered.group("num"), numbered.group("num"))
    name = numbered.group("name")
    return lookup(f"{num} {name}") or lookup(f"{num}{name}")
def _parse_segment(seg: str, last_book: str | None) -> Tuple[str | None, str | None]:
    """
    Return (book_canon, cv_string) for one semicolon-delimited segment.

    seg is cleaned with _clean_text() and matched against BOOK_RE. If the
    leading words resolve to a known book via _canon_book(), that book is
    used and the remainder of the segment is the chapter/verse text.
    Otherwise the whole cleaned segment is treated as chapter/verse text and
    last_book is reused.

    cv_string is the remainder with all whitespace removed, or None when it
    contains no digit. Returns (None, None) for a segment that is empty after
    cleaning.
    """
    text = _clean_text(seg)
    if not text:
        return (None, None)

    book = None
    rest = text
    m = BOOK_RE.match(text)
    if m:
        parts = ((m.group("num") or "").strip(), (m.group("book") or "").strip())
        raw = " ".join(p for p in parts if p)
        canon = _canon_book(raw)
        if canon:
            book = canon
            rest = text[m.end():].strip(",;: .")
    if not book:
        book = last_book  # inherit prior

    # rest should now hold "4:6,7-9" or "21" etc. Remove every kind of
    # whitespace (not only plain spaces) so "4: 6,\t7" becomes "4:6,7".
    rest = re.sub(r"\s+", "", rest)
    # A chapter/verse string needs at least one digit; anything else is
    # treated as "no reference".
    cv = rest if re.search(r"\d", rest) else None
    return (book, cv)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """
    Normalize a whole scripture_raw string.

    The text is split on semicolons and each non-blank piece is parsed with
    _parse_segment(); a piece without its own book inherits the most recent
    one. Pieces with a chapter/verse part but no book (own or inherited) are
    dropped and reported in the warnings list.

    Returns (normalized_text, warnings), where normalized_text joins the
    "Book ch:vs" entries with "; ". Empty or None input yields ("", []).
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    out: List[str] = []
    last_book: str | None = None
    for piece in re.split(r"\s*;\s*", text):
        if not piece or not piece.strip():
            continue
        book, cv = _parse_segment(piece, last_book)
        if not book:
            if cv:
                warnings.append(f"Missing book for '{piece.strip()}'")
            continue
        if cv:
            # Dashes are written as escapes on purpose: an empty search string
            # in str.replace() inserts "-" between every character.
            cv = cv.replace("\u2013", "-").replace("\u2014", "-")
            out.append(f"{book} {cv}")
        else:
            out.append(book)
        last_book = book

    norm = "; ".join(entry.strip() for entry in out if entry.strip())
    norm = re.sub(r"\s+", " ", norm.strip(" ;,"))
    return (norm, warnings)