Illustrations/web/core/scripture_normalizer.py

250 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# core/scripture_normalizer.py
from __future__ import annotations
import re
from typing import Dict, List, Tuple
# --- Book aliases -> canonical abbreviation ---
# Each row pairs one canonical (house-style) abbreviation with every lowercase
# spelling that resolves to it. Change the left-hand value to adopt a different
# house style; extend the right-hand tuple to accept more spellings.
_BOOK_ALIAS_ROWS = (
    # OT
    ("Gen.", ("genesis", "gen", "genesisesis")),
    ("Ex.", ("exodus", "ex")),
    ("Lev.", ("leviticus", "lev")),
    ("Num.", ("numbers", "num", "nums")),
    ("Deut.", ("deuteronomy", "deut", "deu", "deutronomy", "deut.")),
    ("Josh.", ("joshua", "josh")),
    ("Judg.", ("judges", "judg")),
    ("Ruth", ("ruth",)),
    ("1 Sam.", ("1 samuel", "i samuel", "1 sam")),
    ("2 Sam.", ("2 samuel", "ii samuel", "2 sam")),
    ("1 Ki.", ("1 kings", "i kings", "1 ki", "1kgs", "1 kgs")),
    ("2 Ki.", ("2 kings", "ii kings", "2 ki", "2kgs", "2 kgs")),
    ("1 Chron.", ("1 chronicles", "i chronicles", "1 chron")),
    ("2 Chron.", ("2 chronicles", "ii chronicles", "2 chron")),
    ("Ezra", ("ezra",)),
    ("Neh.", ("nehemiah", "neh")),
    ("Esth.", ("esther", "esth")),
    ("Job", ("job",)),
    ("Ps.", ("psalm", "psalms", "ps", "psa")),
    ("Prov.", ("proverbs", "prov")),
    ("Eccl.", ("ecclesiastes", "eccles", "eccl")),
    ("Song", ("song of solomon", "song of songs", "song")),
    ("Isa.", ("isaiah", "isa")),
    ("Jer.", ("jeremiah", "jer", "jer.")),
    ("Lam.", ("lamentations", "lam")),
    ("Ezek.", ("ezekiel", "ezek")),
    ("Dan.", ("daniel", "dan")),
    ("Hos.", ("hosea", "hos")),
    ("Joel", ("joel",)),
    ("Amos", ("amos",)),
    ("Obad.", ("obadiah", "obad")),
    ("Jon.", ("jonah", "jon")),
    ("Mic.", ("micah", "mic")),
    ("Nah.", ("nahum", "nah")),
    ("Hab.", ("habakkuk", "hab")),
    ("Zeph.", ("zephaniah", "zeph")),
    ("Hag.", ("haggai", "hag")),
    ("Zech.", ("zechariah", "zech")),
    ("Mal.", ("malachi", "mal")),
    # NT
    ("Matt.", ("matthew", "matt", "mt")),
    ("Mark", ("mark", "mk")),
    ("Luke", ("luke", "lk")),
    ("John", ("john", "jn")),
    ("Acts", ("acts",)),
    ("Rom.", ("romans", "rom")),
    ("1 Cor.", ("1 corinthians", "i corinthians", "1 cor", "1 cor.")),
    ("2 Cor.", ("2 corinthians", "ii corinthians", "2 cor", "2 cor.")),
    ("Gal.", ("galatians", "gal")),
    ("Eph.", ("ephesians", "eph", "eph.")),
    # the last alias is a common import glitch
    ("Phil.", ("philippians", "phil", "philippians 216")),
    ("Col.", ("colossians", "col")),
    ("1 Thess.", ("1 thessalonians", "i thessalonians", "1 thess", "1 thes")),
    ("2 Thess.", ("2 thessalonians", "ii thessalonians", "2 thess", "2 thes")),
    ("1 Tim.", ("1 timothy", "i timothy", "1 tim", "1 ti")),
    ("2 Tim.", ("2 timothy", "ii timothy", "2 tim", "2 ti")),
    ("Titus", ("titus",)),
    # 'Ti' is usually Timothy in your data
    ("Tim.", ("ti",)),
    ("Philem.", ("philemon", "philem")),
    ("Heb.", ("hebrews", "heb")),
    ("Jas.", ("james", "jas")),
    ("1 Pet.", ("1 peter", "i peter", "1 pet")),
    ("2 Pet.", ("2 peter", "ii peter", "2 pet")),
    ("1 John", ("1 john", "i john")),
    ("2 John", ("2 john", "ii john")),
    ("3 John", ("3 john", "iii john")),
    ("Jude", ("jude",)),
    ("Rev.", ("revelation", "rev")),
    # very short generic stems (no book number attached)
    ("Cor.", ("cor", "co")),
    ("Thess.", ("thess", "thes")),
)

# Flat alias -> canonical lookup, in the same order as the rows above.
BOOK_CANON = {
    alias: canon
    for canon, aliases in _BOOK_ALIAS_ROWS
    for alias in aliases
}
# Generate "numbered book" spellings ("1 cor", "2ki", "iii john", ...) on top
# of the hand-written BOOK_CANON entries.
def _variants() -> Dict[str, str]:
    """Return a copy of BOOK_CANON extended with numbered-book spellings.

    For every stem below, both the spaced ("1 cor") and the fused ("1cor")
    form are registered, using an arabic or a lowercase roman prefix. The
    third-book forms exist for John only. BOOK_CANON itself is not modified.
    """
    stems = (
        ("samuel", "Sam."), ("kings", "Ki."), ("chronicles", "Chron."),
        ("corinthians", "Cor."), ("cor", "Cor."), ("co", "Cor."),
        ("thessalonians", "Thess."), ("thess", "Thess."), ("thes", "Thess."),
        ("timothy", "Tim."), ("ti", "Tim."),
        ("peter", "Pet."), ("john", "John"),
    )
    # canonical digit -> prefixes a writer may use for it
    prefix_groups = (("1", ("1", "i")), ("2", ("2", "ii")))

    table = dict(BOOK_CANON)
    for digit, prefixes in prefix_groups:
        for prefix in prefixes:
            for stem, abbr in stems:
                canon = f"{digit} {abbr}"
                table[f"{prefix} {stem}"] = canon
                table[f"{prefix}{stem}"] = canon

    for prefix in ("3", "iii"):
        table[f"{prefix} john"] = "3 John"
        table[f"{prefix}john"] = "3 John"

    # very common shorthands
    table.update({
        "ps": "Ps.",
        "prov": "Prov.",
        "eccles": "Eccl.",
        "deut ": "Deut.",
        "deut.": "Deut.",
    })
    return table
# Lookup table consulted by _canon_book: everything _variants() produces, with
# the "co" short forms for Corinthians pinned explicitly on top.
BOOK_MAP = {
    **_variants(),
    "1 co": "1 Cor.",
    "2 co": "2 Cor.",
    "1co": "1 Cor.",
    "2co": "2 Cor.",
}
# Filler words to drop before parsing ("Read", "see", "chap."/"chapter", "ch."),
# each with one optional trailing period; matched case-insensitively and only
# as whole words.
CRUFT_RE = re.compile(
    r"\b(read|see|chap(?:ter)?|ch)\b\.?",
    flags=re.IGNORECASE,
)
# Leading book reference: an optional ordinal (1/2/3, or lowercase roman
# i/ii/iii) followed by one to three book words, e.g. “1 Cor.”, “2 Peter”,
# “Rom.”, “Song of Solomon”.
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        # Optional ordinal. A roman numeral only counts when it is a whole
        # word; otherwise the first letter of a lowercase name such as
        # isaiah would be split off and read as the number 1.
        (?P<num>[1-3]|i{1,3}\b)\s*
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})    # book words (1-3 words)
    \s*
    """,
    re.X,
)
# One chapter/verse unit: a chapter number, optionally followed by ":" and a
# verse list made of digits, commas, hyphens, en/em dashes and whitespace,
# e.g. "4:6, 7-9", or just "21" for a whole chapter.
C_V_RE = re.compile(
r"""
(?:
(?P<ch>\d+)
(?:
:(?P<vs>[\d,\-\u2013\u2014\s]+) # verses: lists/ranges, allow en/em dash
)?
)
""",
    flags=re.VERBOSE,
)
def _clean_text(s: str) -> str:
    """Tidy one raw reference fragment before it is parsed.

    Steps, in order: replace no-break / thin / narrow no-break spaces with
    plain spaces, remove the filler words matched by CRUFT_RE, turn ".." into
    ".", replace en and em dashes with "-", collapse runs of spaces, and trim
    separators (space ; , . tab CR LF) from both ends.

    Args:
        s: Raw text of a single segment.

    Returns:
        The cleaned text; may be empty.
    """
    s = s.replace("\xa0", " ").replace("\u2009", " ").replace("\u202f", " ")
    s = CRUFT_RE.sub("", s)
    # The dashes are written as escapes on purpose: a literal en/em dash is
    # easy to lose or confuse in an editor, and replacing an empty string
    # instead would insert "-" between every character.
    s = s.replace("..", ".").replace("\u2013", "-").replace("\u2014", "-")
    # Cruft removal leaves doubled spaces behind ("Read  John"); squeeze them.
    s = re.sub(r" {2,}", " ", s)
    return s.strip(" ;,.\t\r\n").strip()
def _canon_book(book_raw: str) -> str | None:
    """Resolve a raw book name to its canonical abbreviation.

    The lookup key is the lowercased name with periods removed and whitespace
    collapsed and trimmed, so "1 Cor.", "1  cor" and "1 Cor ." all resolve to
    the same entry.

    Args:
        book_raw: Book text as written, optionally with a leading number.

    Returns:
        The canonical abbreviation, or None when the name is empty or unknown.
    """
    if not book_raw:
        return None
    # Remove periods BEFORE normalising whitespace: a detached period
    # ("Cor .") would otherwise leave a trailing space and miss the lookup.
    key = " ".join(book_raw.lower().replace(".", "").split())
    if not key:
        return None
    canon = BOOK_MAP.get(key)
    if canon is None:
        # BOOK_MAP starts as a copy of BOOK_CANON, so this only matters if
        # BOOK_CANON gained entries after BOOK_MAP was built.
        canon = BOOK_CANON.get(key)
    return canon
def _parse_segment(seg: str, last_book: str | None) -> Tuple[str | None, str | None]:
    """
    Return (book_canon, cv_string) for one semicolon-delimited segment.

    The segment is cleaned with _clean_text, then a leading book name is
    looked up via BOOK_RE and _canon_book. Whatever follows the book becomes
    the chapter/verse string: all whitespace is removed and every comma gets
    exactly one following space ("4:6,7-9" -> "4:6, 7-9").

    Args:
        seg: Raw text of one segment.
        last_book: Canonical book of the previous segment, reused when this
            segment carries numbers but no recognizable book ("14:20" after
            "1 Cor. 13:11").

    Returns:
        (book, cv) where either part may be None:
        (book, None) for a book-only reference, (None, cv) when numbers were
        found but no book is known, and (None, None) when the segment holds
        neither a recognizable book nor any number.
    """
    s = _clean_text(seg)
    if not s:
        return (None, None)

    book = None
    rest = s
    m = BOOK_RE.match(s)
    if m:
        numbered = f"{m.group('num') or ''} {m.group('book')}".strip()
        # If the "number + name" reading is unknown, retry with the text
        # exactly as matched: a lowercase name beginning with "i" (e.g.
        # "isaiah") can have its first letter taken for a roman numeral.
        canon = _canon_book(numbered) or _canon_book(m.group(0))
        if canon:
            book = canon
            rest = s[m.end():].strip(",;: .")

    # "2: 24" -> "2:24", "6 , 7" -> "6,7"
    rest = re.sub(r"\s+", "", rest)
    # Anything without a digit is not a chapter/verse reference.
    cv = rest.replace(",", ", ") if C_V_RE.search(rest) else None

    if book is None and cv is not None:
        # Inherit only when there are numbers to attach the book to; a stray
        # word with no numbers must not repeat the previous book on its own.
        book = last_book
    return (book, cv)
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
    """
    Normalize a whole scripture_raw string.

    The text is split on semicolons and each non-blank piece is parsed with
    _parse_segment. A piece without its own book reuses the book of the most
    recent piece that produced output. Pieces that yield nothing are dropped;
    pieces with numbers but no known book are dropped and reported.

    Args:
        text: Raw field value; may be empty or None.

    Returns:
        (normalized_text, warnings): the references joined with "; ", and one
        "Missing book for '...'" message per piece that had no usable book.
    """
    warnings: List[str] = []
    if not text:
        return ("", warnings)

    out: List[str] = []
    last_book: str | None = None
    for piece in re.split(r"\s*;\s*", text):
        if not piece.strip():
            continue
        book, cv = _parse_segment(piece, last_book)
        if not book:
            if cv:
                warnings.append(f"Missing book for '{piece.strip()}'")
            continue
        if cv:
            # En/em dashes -> hyphen. Written as escapes so the characters
            # cannot be lost; replacing an empty string here would put a "-"
            # between every character of the reference.
            cv = cv.replace("\u2013", "-").replace("\u2014", "-")
            cv = re.sub(r"\s+", " ", cv).strip()
        out.append(f"{book} {cv}" if cv else book)
        last_book = book

    norm = "; ".join(o.strip() for o in out if o.strip())
    norm = norm.strip(" ;,")
    norm = re.sub(r"\s+", " ", norm)
    return (norm, warnings)