Illustrations/web/core/scripture_normalizer.py

247 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple
# --- Book map (common full names + abbreviations -> canonical abbreviation) ---
# Keys are lowercase spellings; values are the house-style abbreviation that
# ends up in the normalized output. Change a value here to change house style.
#
# _canon_book() lowercases its input and removes every "." before looking a
# key up, so keys that themselves contain a "." ("deut.", "jer.", "1 cor.",
# "2 cor.", "eph.") cannot be hit through that function; they only matter to
# code that indexes this dict directly.
BOOK_CANON = {
# OT
# NOTE(review): "genesisesis" looks like a search/replace artifact rather than
# a real spelling -- confirm before relying on it.
"genesis": "Gen.", "gen": "Gen.", "genesisesis": "Gen.",
"exodus": "Ex.", "ex": "Ex.",
"leviticus": "Lev.", "lev": "Lev.",
"numbers": "Num.", "num": "Num.", "nums": "Num.",
"deuteronomy": "Deut.", "deut": "Deut.", "deu": "Deut.", "deutronomy": "Deut.", "deut.": "Deut.",
"joshua": "Josh.", "josh": "Josh.",
"judges": "Judg.", "judg": "Judg.",
"ruth": "Ruth",
# Numbered books: arabic and roman ("i", "ii") prefixes for the full name,
# arabic prefix only for the short forms.
"1 samuel": "1 Sam.", "i samuel": "1 Sam.", "1 sam": "1 Sam.",
"2 samuel": "2 Sam.", "ii samuel": "2 Sam.", "2 sam": "2 Sam.",
"1 kings": "1 Ki.", "i kings": "1 Ki.", "1 ki": "1 Ki.", "1kgs": "1 Ki.", "1 kgs": "1 Ki.",
"2 kings": "2 Ki.", "ii kings": "2 Ki.", "2 ki": "2 Ki.", "2kgs": "2 Ki.", "2 kgs": "2 Ki.",
"1 chronicles": "1 Chron.", "i chronicles": "1 Chron.", "1 chron": "1 Chron.",
"2 chronicles": "2 Chron.", "ii chronicles": "2 Chron.", "2 chron": "2 Chron.",
"ezra": "Ezra",
"nehemiah": "Neh.", "neh": "Neh.",
"esther": "Esth.", "esth": "Esth.",
"job": "Job",
"psalm": "Ps.", "psalms": "Ps.", "ps": "Ps.",
"proverbs": "Prov.", "prov": "Prov.",
"ecclesiastes": "Eccl.", "eccles": "Eccl.", "eccl": "Eccl.",
"song of solomon": "Song", "song of songs": "Song", "song": "Song",
"isaiah": "Isa.", "isa": "Isa.",
"jeremiah": "Jer.", "jer": "Jer.", "jer.": "Jer.",
"lamentations": "Lam.", "lam": "Lam.",
"ezekiel": "Ezek.", "ezek": "Ezek.",
"daniel": "Dan.", "dan": "Dan.",
"hosea": "Hos.", "hos": "Hos.",
"joel": "Joel",
"amos": "Amos",
"obadiah": "Obad.", "obad": "Obad.",
"jonah": "Jon.", "jon": "Jon.",
"micah": "Mic.", "mic": "Mic.",
"nahum": "Nah.", "nah": "Nah.",
"habakkuk": "Hab.", "hab": "Hab.",
"zephaniah": "Zeph.", "zeph": "Zeph.",
"haggai": "Hag.", "hag": "Hag.",
"zechariah": "Zech.", "zech": "Zech.",
"malachi": "Mal.", "mal": "Mal.",
# NT
"matthew": "Matt.", "matt": "Matt.",
"mark": "Mark",
"luke": "Luke",
"john": "John",
"acts": "Acts",
"romans": "Rom.", "rom": "Rom.",
"1 corinthians": "1 Cor.", "i corinthians": "1 Cor.", "1 cor": "1 Cor.", "1 cor.": "1 Cor.",
"2 corinthians": "2 Cor.", "ii corinthians": "2 Cor.", "2 cor": "2 Cor.", "2 cor.": "2 Cor.",
"galatians": "Gal.", "gal": "Gal.",
"ephesians": "Eph.", "eph": "Eph.", "eph.": "Eph.",
# NOTE(review): BOOK_RE's book group accepts only letters and dots, so the
# "philippians 216" key is presumably unreachable via _parse_segment -- confirm.
"philippians": "Phil.", "phil": "Phil.", "philippians 216": "Phil.", # common import glitch
"colossians": "Col.", "col": "Col.",
"1 thessalonians": "1 Thess.", "i thessalonians": "1 Thess.", "1 thess": "1 Thess.",
"2 thessalonians": "2 Thess.", "ii thessalonians": "2 Thess.", "2 thess": "2 Thess.",
"1 timothy": "1 Tim.", "i timothy": "1 Tim.", "1 tim": "1 Tim.",
"2 timothy": "2 Tim.", "ii timothy": "2 Tim.", "2 tim": "2 Tim.",
"titus": "Titus",
"philemon": "Philem.", "philem": "Philem.",
"hebrews": "Heb.", "heb": "Heb.",
"james": "Jas.", "jas": "Jas.",
"1 peter": "1 Pet.", "i peter": "1 Pet.", "1 pet": "1 Pet.",
"2 peter": "2 Pet.", "ii peter": "2 Pet.", "2 pet": "2 Pet.",
"1 john": "1 John", "i john": "1 John",
"2 john": "2 John", "ii john": "2 John",
"3 john": "3 John", "iii john": "3 John",
"jude": "Jude",
"revelation": "Rev.", "rev": "Rev.",
}
# Add "numbered + book" fallbacks such as "1 cor", "ii ki", "i chron".
def _variants() -> Dict[str, str]:
    """Build the lookup table used to canonicalize book names.

    Starts from a copy of ``BOOK_CANON`` and, for every numbered book, registers
    both the full name and the short form (canonical abbreviation, lowercased,
    without its trailing dot) under each arabic *and* roman prefix. For example
    ``"2 corinthians"``, ``"ii corinthians"``, ``"2 cor"`` and ``"ii cor"`` all
    map to ``"2 Cor."``.

    Returns:
        A new dict mapping lowercase book spellings to canonical abbreviations.
        ``BOOK_CANON`` itself is not modified.
    """
    base = dict(BOOK_CANON)
    numbered = [
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("thessalonians", "Thess."),
        ("timothy", "Tim."), ("peter", "Pet."), ("john", "John"),
    ]
    # canonical digit -> accepted spellings of that prefix
    prefixes = {"1": ("1", "i"), "2": ("2", "ii"), "3": ("3", "iii")}
    for digit, spellings in prefixes.items():
        # Only John has a third book.
        books = numbered if digit != "3" else [("john", "John")]
        for name, abbr in books:
            canon = f"{digit} {abbr}"
            short = abbr.rstrip(".").lower()
            for n in spellings:
                base[f"{n} {name}"] = canon
                # setdefault: purely additive, never overrides an explicit
                # BOOK_CANON entry for the same short spelling.
                base.setdefault(f"{n} {short}", canon)
    # very common shorthands
    base["ps"] = "Ps."
    base["prov"] = "Prov."
    base["eccles"] = "Eccl."
    # _canon_book() strips whitespace and dots from its key, so the next two
    # entries only matter to code that indexes the map directly; kept for
    # backward compatibility.
    base["deut "] = "Deut."
    base["deut."] = "Deut."
    return base


BOOK_MAP = _variants()
# Filler words removed before parsing ("Read", "See", "chapter", "chap", "ch"),
# each with one optional trailing period; matched case-insensitively.
CRUFT_RE = re.compile(
    r"\b(read|see|chap(?:ter)?|ch)\b\.?",
    flags=re.IGNORECASE,
)
# Book prefix pattern (handles "1 Cor.", "2 Peter", "Rom.", "Psalms").
# Groups: "num"  - optional leading 1/2/3 or lowercase roman i/ii/iii
#         "book" - one to three words made of letters and dots
# The roman numeral must end at a word boundary; without it, a lowercase book
# such as "isaiah" was split into num="i" + book="saiah" and never resolved.
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        (?P<num>[1-3]|i{1,3}\b)\s*      # optional 1/2/3, or roman i/ii/iii as its own word
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})   # book words (1-3 words)
    \s*
    """,
    re.X,
)
# Chapter/verse fragment: a chapter number ("ch"), optionally followed by ":"
# and a verse expression ("vs") made of digits, commas, hyphens, en/em dashes
# and whitespace -- e.g. "4:6, 7-9", or just "21" for a whole chapter.
# The pattern text is kept exactly as-is; only the flag is spelled out.
C_V_RE = re.compile(
    r"""
(?:
(?P<ch>\d+)
(?:
:(?P<vs>[\d,\-\u2013\u2014\s]+) # verses: lists/ranges, allow en/em dash
)?
)
""",
    flags=re.VERBOSE,
)
def _clean_text(s: str) -> str:
    """Pre-clean one raw reference segment before it is parsed.

    Steps, in order:
      1. Turn no-break, thin and narrow no-break spaces into plain spaces.
      2. Drop filler words matched by ``CRUFT_RE`` ("Read", "chapter", ...).
      3. Collapse ".." to "." and convert en/em dashes to ASCII hyphens.
      4. Collapse whitespace runs to a single space and strip surrounding
         spaces, semicolons, commas and periods.

    Args:
        s: Raw segment text.

    Returns:
        The cleaned segment (possibly empty).
    """
    s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
    s = CRUFT_RE.sub("", s)
    # Dashes are written as escapes on purpose: a literal dash is easy to lose,
    # and str.replace("", "-") would insert "-" between every character.
    s = s.replace("..", ".").replace("\u2013", "-").replace("\u2014", "-")
    # Removing cruft words leaves doubled spaces behind; collapse them.
    s = re.sub(r"\s+", " ", s)
    return s.strip(" ;,.\t\r\n").strip()
def _canon_book(book_raw: str) -> str | None:
    """Map a raw book name to its canonical abbreviation.

    The lookup key is the input lowercased, with every "." removed and all
    whitespace normalised (runs collapsed, ends trimmed).

    Args:
        book_raw: Book text as captured from the reference, e.g. "1 Cor.".

    Returns:
        The canonical abbreviation, or ``None`` when the name is empty or
        unknown.
    """
    if not book_raw:
        return None
    # Remove dots *before* trimming: "Gen ." must become "gen", not "gen ".
    key = " ".join(book_raw.lower().replace(".", "").split())
    canon = BOOK_MAP.get(key)
    if canon is not None:
        return canon
    # Last resort: the base table (normally a subset of BOOK_MAP).
    return BOOK_CANON.get(key)
def _parse_segment(seg: str, last_book: str | None) -> Tuple[str | None, str | None]:
    """Parse one semicolon-delimited segment into ``(book, chapter_verse)``.

    The segment is cleaned with ``_clean_text``; a leading book name is
    recognised with ``BOOK_RE`` + ``_canon_book``. When the segment carries no
    recognisable book, ``last_book`` is inherited so that "John 3:16; 4:1"
    attaches "4:1" to John.

    Args:
        seg: Raw segment text.
        last_book: Canonical book of the previous segment, or ``None``.

    Returns:
        ``(book, cv)`` where either element may be ``None``:
          * ``(None, None)`` - nothing usable (empty, or neither a book nor any
            chapter/verse digits were found);
          * ``(book, None)`` - a book named on its own;
          * ``(book, cv)``   - book (own or inherited) plus chapter/verse text
            with all spaces removed;
          * ``(None, cv)``   - chapter/verse with no book to attach it to.
    """
    s = _clean_text(seg)
    if not s:
        return (None, None)

    book = None
    rest = s
    m = BOOK_RE.match(s)
    if m:
        raw = f"{m.group('num') or ''} {m.group('book') or ''}".strip()
        canon = _canon_book(raw)
        if canon:
            book = canon
            rest = s[m.end():].strip(",;: .")
        # otherwise the match was just a non-book word: keep the whole text

    inherited = book is None
    if inherited:
        book = last_book

    # "4:6, 7-9" -> "4:6,7-9"; also fix "2:\t24"-style gaps after the colon
    rest = rest.replace(" ", "")
    rest = re.sub(r":\s+", ":", rest)

    # Anything containing a chapter number is kept verbatim; text without any
    # digits carries no chapter/verse information.
    cv = rest if rest and C_V_RE.search(rest) else None

    # A segment with neither its own book nor any chapter/verse (e.g. a stray
    # word) must not echo the inherited book as a new bare-book reference.
    if inherited and cv is None:
        return (None, None)
    return (book, cv)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """Normalize a whole ``scripture_raw`` string.

    The text is split on semicolons and each piece is parsed with
    ``_parse_segment``; a piece without its own book inherits the book of the
    previous accepted piece.

    Args:
        text: Raw scripture reference text; may be empty or ``None``.

    Returns:
        ``(normalized_text, warnings)``. ``normalized_text`` is the accepted
        references joined with "; ". ``warnings`` holds one message for every
        piece that had chapter/verse data but no book to attach it to (such
        pieces are skipped).
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    # split on semicolons; keep empty pieces out
    pieces = [p for p in re.split(r"\s*;\s*", text) if p and p.strip()]
    out: List[str] = []
    last_book: str | None = None
    for piece in pieces:
        book, cv = _parse_segment(piece, last_book)
        if not book and not cv:
            continue
        if not book:
            # verses but no book -- cannot link properly; warn and skip
            warnings.append(f"Missing book for '{piece.strip()}'")
            continue
        last_book = book
        if not cv:
            # only a book (e.g., "Acts")
            out.append(book)
            continue
        # Ranges always use an ASCII hyphen. The dashes are spelled as escapes:
        # replacing the empty string would insert "-" between every character.
        cv = cv.replace("\u2013", "-").replace("\u2014", "-")
        out.append(f"{book} {cv}")

    # join with semicolons, then tidy the edges and any doubled whitespace
    norm = "; ".join(o.strip() for o in out if o.strip())
    norm = norm.strip(" ;,")
    norm = re.sub(r"\s+", " ", norm)
    return (norm, warnings)