Update web/core/scripture_normalizer.py
parent 93b772324a
commit fb74fe1ebf
@@ -78,7 +78,6 @@ BOOK_CANON: Dict[str, str] = {
     "revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
 }

-# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
 def _variants() -> Dict[str, str]:
     base = dict(BOOK_CANON)
     numbered = [
@@ -90,9 +89,7 @@ def _variants() -> Dict[str, str]:
         for name, abbr in numbered:
             base[f"{n} {name}"] = f"{prefix} {abbr}"
             base[f"{n}{name}"] = f"{prefix} {abbr}"
-            # ultra short no-space combos like "1co", "2ki"
             base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
-    # very common shorthands that users type
     base["ps"] = "Ps."
     base["prov"] = "Prov."
     base["eccles"] = "Eccl."
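Note: the `n` and `prefix` names used in this hunk come from an enclosing loop that falls outside the diff context. A minimal sketch of how the expansion presumably works, with a made-up `numbered` list and an assumed outer loop (both are illustrations, not the file's actual contents):

# Hypothetical reconstruction: the real outer loop and the full `numbered`
# list are not visible in this diff.
numbered = [("samuel", "Sam."), ("kings", "Kgs."), ("corinthians", "Cor.")]
base = {}
for n in ("1", "2", "3"):      # assumed source of `n`
    prefix = n                 # assumed source of `prefix`
    for name, abbr in numbered:
        base[f"{n} {name}"] = f"{prefix} {abbr}"     # "1 samuel" -> "1 Sam."
        base[f"{n}{name}"] = f"{prefix} {abbr}"      # "1samuel"  -> "1 Sam."
        base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"  # "1co"      -> "1 Cor."
print(base["2 kings"], base["1co"])  # -> 2 Kgs. 1 Cor.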
@@ -101,29 +98,26 @@ def _variants() -> Dict[str, str]:
     base["mk"] = "Mark"
     base["lk"] = "Luke"
     base["jn"] = "John"
-    base["ti"] = "Tim." # used only with a leading number, handled below
-    base["co"] = "Cor." # used only with a leading number, handled below
+    base["ti"] = "Tim."
+    base["co"] = "Cor."
     return base

 BOOK_MAP = _variants()

-# Words to strip like "Read", "chapter"
 CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)

-# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
 BOOK_RE = re.compile(
     r"""
     ^\s*
     (?:
-        (?P<num>[1-3]|i{1,3})\s* # optional 1/2/3 or roman
+        (?P<num>[1-3]|i{1,3})\s*
     )?
     \s*
-    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2}) # book words (1-3 words)
+    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
     """,
     re.X,
 )

-# chapter/verse piece like "4:6,7-9" or "21"
 C_V_RE = re.compile(
     r"""
     (?:
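Note: as a quick orientation on the two patterns that are fully visible here (C_V_RE is cut off by the hunk), a small standalone check; the sample inputs are invented:

import re

# Copies of the two module-level patterns shown above.
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        (?P<num>[1-3]|i{1,3})\s*
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})
    """,
    re.X,
)

m = BOOK_RE.match("2 Pet. 3:9")
print(m.group("num"), m.group("book"))     # -> 2 Pet.
print(CRUFT_RE.sub("", "Read Rom. 12:1"))  # -> " Rom. 12:1" (cruft word removed)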
@@ -150,48 +144,33 @@ def _canon_key(raw: str) -> str:
     return key

 def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
-    """
-    Try to map a raw book token (with optional leading number) to our canonical abbr.
-    """
     key = _canon_key(book_raw)
-    # if the token was like "co" or "ti" and a number exists, bias to Cor./Tim.
     if num and key in ("co", "cor"):
         return f"{num} Cor."
     if num and key in ("ti", "tim"):
         return f"{num} Tim."
-    # direct lookup in our expanded map
     if key in BOOK_MAP:
         val = BOOK_MAP[key]
-        # If lookup returned a numbered abbr like "1 Cor." (already good)
         return val
-    # sometimes users omit space: "1co", "2ki"
-    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P(rest>[A-Za-z].*))$", key)
+    # FIXED: correct named group syntax here
+    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
     if m:
         n = m.group("n")
         rest = m.group("rest")
         if rest in BOOK_MAP:
             base = BOOK_MAP[rest]
-            # normalize roman to arabic
             n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
-            # if base already includes number, replace it
             if base[0] in "123":
                 return f"{n} {base.split(' ', 1)[1]}"
             return f"{n} {base}"
-    # last chance: try the core canon directly
     return BOOK_CANON.get(key)

 def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
-    """
-    Parse a semicolon-delimited segment.
-    Returns (book_canon, cv_string, preserve_raw).
-    - If preserve_raw is True, cv_string contains the original (cleaned) segment to keep verbatim.
-    """
     original = seg.strip()
     s = _clean_text(seg)
     if not s:
         return (None, None, False)

-    # Try to detect a leading book token
     m = BOOK_RE.match(s)
     book = None
     rest = s
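Note: the named-group fix is the functional change in this hunk; the rest is comment and docstring removal. A standalone check of both patterns (the sample key is invented):

import re

key = "1co"

# New pattern from this commit: splits a number-fused token into its parts.
m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
print(m.group("n"), m.group("rest"))  # -> 1 co

# The removed pattern used "(?P(rest>...)", which is not valid named-group
# syntax, so re rejects it as soon as the pattern is compiled.
try:
    re.match(r"(?P<n>[1-3]|i{1,3})(?P(rest>[A-Za-z].*))$", key)
except re.error as exc:
    print("old pattern rejected:", exc)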
@@ -204,18 +183,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
             book = canon
             rest = s[m.end():].strip(",;: .")
         else:
-            # Not recognized as a book -> keep whole thing verbatim
             return (None, original, True)

     if not book:
-        # Inherit previous recognized book if we have one
         if last_book:
             book = last_book
         else:
-            # There is no book context — keep segment verbatim to avoid data loss
             return (None, original, True)

-    # Normalize the chapter/verse part
     rest = re.sub(r"\s+", "", rest)
     rest = re.sub(r":\s*", ":", rest)
@@ -229,22 +204,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
     cv = m2.group(0).replace(" ", "") if m2 else None

     if not cv:
-        # Nothing parseable after a valid book; just keep the book
         return (book, None, False)

-    # Normalize dashes, commas
     cv = cv.replace("–", "-").replace("—", "-")
     cv = re.sub(r",\s*", ",", cv)

     return (book, cv, False)

 def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
-    """
-    Normalize an entire scripture_raw string.
-    - Unknown pieces are preserved verbatim.
-    - Known pieces are standardized and each segment repeats the book.
-    Returns (normalized_text, warnings).
-    """
     warnings: List[str] = []
     if not text:
         return ("", warnings)
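Note: the chapter/verse cleanup in this hunk is self-contained enough to try in isolation (the sample string is invented):

import re

cv = "4:6, 7–9"                              # en dash, space after the comma
cv = cv.replace("–", "-").replace("—", "-")  # normalize dashes, as in the hunk
cv = re.sub(r",\s*", ",", cv)                # drop spaces after commas
print(cv)  # -> 4:6,7-9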
@@ -257,7 +224,6 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
         book, cv, preserve = _parse_segment(piece, last_book)
         if preserve:
             out.append(cv or piece.strip())
-            # do not update last_book when we couldn't recognize a book
             continue

         if book and cv:
@@ -266,14 +232,11 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
             continue

         if book and not cv:
-            # book only (e.g., "Acts")
             out.append(book)
             last_book = book
             continue

-        # If we get here: (no book, no preserve) — nothing useful to add
         if piece.strip():
-            # As a final safeguard, keep original
             out.append(piece.strip())

     norm = "; ".join(x.strip() for x in out if x.strip())
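Note: for context, a hypothetical end-to-end call. The exact canonical spellings depend on BOOK_CANON and on code outside these hunks, so the output shown is only a plausible result, not a verified one:

# Assumes the package is importable as web.core; output is illustrative only.
from web.core.scripture_normalizer import normalize_scripture_field

norm, warnings = normalize_scripture_field("Read 1co 4:6, 7-9; 21; see Acts")
print(norm)      # plausibly: 1 Cor. 4:6,7-9; 1 Cor. 21; Acts
print(warnings)  # plausibly: []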