Update web/core/scripture_normalizer.py

Joshua Laymon 2025-08-14 02:09:33 +00:00
parent 93b772324a
commit fb74fe1ebf


@@ -78,7 +78,6 @@ BOOK_CANON: Dict[str, str] = {
     "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
 }
 
-# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
 def _variants() -> Dict[str, str]:
     base = dict(BOOK_CANON)
     numbered = [
@@ -90,9 +89,7 @@ def _variants() -> Dict[str, str]:
         for name, abbr in numbered:
             base[f"{n} {name}"] = f"{prefix} {abbr}"
             base[f"{n}{name}"] = f"{prefix} {abbr}"
-            # ultra short no-space combos like "1co", "2ki"
             base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
-    # very common shorthands that users type
     base["ps"] = "Ps."
     base["prov"] = "Prov."
     base["eccles"] = "Eccl."
@@ -101,29 +98,26 @@ def _variants() -> Dict[str, str]:
     base["mk"] = "Mark"
     base["lk"] = "Luke"
     base["jn"] = "John"
-    base["ti"] = "Tim."  # used only with a leading number, handled below
-    base["co"] = "Cor."  # used only with a leading number, handled below
+    base["ti"] = "Tim."
+    base["co"] = "Cor."
     return base
 
 BOOK_MAP = _variants()
 
-# Words to strip like "Read", "chapter"
 CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
 
-# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
 BOOK_RE = re.compile(
     r"""
     ^\s*
     (?:
-        (?P<num>[1-3]|i{1,3})\s*  # optional 1/2/3 or roman
+        (?P<num>[1-3]|i{1,3})\s*
     )?
     \s*
-    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})  # book words (1-3 words)
+    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
     """,
     re.X,
 )
 
-# chapter/verse piece like "4:6,7-9" or "21"
 C_V_RE = re.compile(
     r"""
     (?:
@@ -150,48 +144,33 @@ def _canon_key(raw: str) -> str:
     return key
 
 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
-    """
-    Try to map a raw book token (with optional leading number) to our canonical abbr.
-    """
     key = _canon_key(book_raw)
-    # if the token was like "co" or "ti" and a number exists, bias to Cor./Tim.
     if num and key in ("co", "cor"):
         return f"{num} Cor."
     if num and key in ("ti", "tim"):
         return f"{num} Tim."
-    # direct lookup in our expanded map
     if key in BOOK_MAP:
         val = BOOK_MAP[key]
-        # If lookup returned a numbered abbr like "1 Cor." (already good)
         return val
 
-    # sometimes users omit space: "1co", "2ki"
-    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P(rest>[A-Za-z].*))$", key)
+    # FIXED: correct named group syntax here
+    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
     if m:
         n = m.group("n")
         rest = m.group("rest")
         if rest in BOOK_MAP:
             base = BOOK_MAP[rest]
-            # normalize roman to arabic
             n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
-            # if base already includes number, replace it
             if base[0] in "123":
                 return f"{n} {base.split(' ', 1)[1]}"
             return f"{n} {base}"
-    # last chance: try the core canon directly
     return BOOK_CANON.get(key)
 
 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
-    """
-    Parse a semicolon-delimited segment.
-    Returns (book_canon, cv_string, preserve_raw).
-    - If preserve_raw is True, cv_string contains the original (cleaned) segment to keep verbatim.
-    """
     original = seg.strip()
     s = _clean_text(seg)
     if not s:
         return (None, None, False)
-    # Try to detect a leading book token
     m = BOOK_RE.match(s)
     book = None
     rest = s
@@ -204,18 +183,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
         book = canon
         rest = s[m.end():].strip(",;: .")
     else:
-        # Not recognized as a book -> keep whole thing verbatim
        return (None, original, True)
 
     if not book:
-        # Inherit previous recognized book if we have one
         if last_book:
             book = last_book
         else:
-            # There is no book context — keep segment verbatim to avoid data loss
             return (None, original, True)
 
-    # Normalize the chapter/verse part
     rest = re.sub(r"\s+", "", rest)
     rest = re.sub(r":\s*", ":", rest)
 
@@ -229,22 +204,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
     cv = m2.group(0).replace(" ", "") if m2 else None
     if not cv:
-        # Nothing parseable after a valid book; just keep the book
         return (book, None, False)
 
-    # Normalize dashes, commas
     cv = cv.replace("–", "-").replace("—", "-")
     cv = re.sub(r",\s*", ",", cv)
     return (book, cv, False)
 
 
 def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
-    """
-    Normalize an entire scripture_raw string.
-    - Unknown pieces are preserved verbatim.
-    - Known pieces are standardized and each segment repeats the book.
-    Returns (normalized_text, warnings).
-    """
     warnings: List[str] = []
     if not text:
         return ("", warnings)
 
@@ -257,7 +224,6 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
         book, cv, preserve = _parse_segment(piece, last_book)
         if preserve:
             out.append(cv or piece.strip())
-            # do not update last_book when we couldn't recognize a book
             continue
 
         if book and cv:
@@ -266,14 +232,11 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
             continue
 
         if book and not cv:
-            # book only (e.g., "Acts")
             out.append(book)
             last_book = book
             continue
 
-        # If we get here: (no book, no preserve) — nothing useful to add
         if piece.strip():
-            # As a final safeguard, keep original
             out.append(piece.strip())
 
     norm = "; ".join(x.strip() for x in out if x.strip())