Update web/core/scripture_normalizer.py
This commit is contained in:
parent
93b772324a
commit
fb74fe1ebf
@ -78,7 +78,6 @@ BOOK_CANON: Dict[str, str] = {
|
|||||||
"revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
|
"revelation": "Rev.", "rev": "Rev.", "re": "Rev.",
|
||||||
}
|
}
|
||||||
|
|
||||||
# also accept short two-word numbered patterns dynamically (e.g., "1 sam", "2 ki")
|
|
||||||
def _variants() -> Dict[str, str]:
|
def _variants() -> Dict[str, str]:
|
||||||
base = dict(BOOK_CANON)
|
base = dict(BOOK_CANON)
|
||||||
numbered = [
|
numbered = [
|
||||||
@ -90,9 +89,7 @@ def _variants() -> Dict[str, str]:
|
|||||||
for name, abbr in numbered:
|
for name, abbr in numbered:
|
||||||
base[f"{n} {name}"] = f"{prefix} {abbr}"
|
base[f"{n} {name}"] = f"{prefix} {abbr}"
|
||||||
base[f"{n}{name}"] = f"{prefix} {abbr}"
|
base[f"{n}{name}"] = f"{prefix} {abbr}"
|
||||||
# ultra short no-space combos like "1co", "2ki"
|
|
||||||
base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
|
base[f"{n}{name[:2]}"] = f"{prefix} {abbr}"
|
||||||
# very common shorthands that users type
|
|
||||||
base["ps"] = "Ps."
|
base["ps"] = "Ps."
|
||||||
base["prov"] = "Prov."
|
base["prov"] = "Prov."
|
||||||
base["eccles"] = "Eccl."
|
base["eccles"] = "Eccl."
|
||||||
@ -101,29 +98,26 @@ def _variants() -> Dict[str, str]:
|
|||||||
base["mk"] = "Mark"
|
base["mk"] = "Mark"
|
||||||
base["lk"] = "Luke"
|
base["lk"] = "Luke"
|
||||||
base["jn"] = "John"
|
base["jn"] = "John"
|
||||||
base["ti"] = "Tim." # used only with a leading number, handled below
|
base["ti"] = "Tim."
|
||||||
base["co"] = "Cor." # used only with a leading number, handled below
|
base["co"] = "Cor."
|
||||||
return base
|
return base
|
||||||
|
|
||||||
# Expanded lookup table: canonical book names plus numbered and shorthand
# variants generated by _variants() (e.g. "1 sam", "2ki", "prov").
BOOK_MAP = _variants()
|
||||||
|
|
||||||
# Words to strip like "Read", "see", "chapter"/"chap"/"ch" (case-insensitive,
# with an optional trailing dot) before parsing a reference.
CRUFT_RE = re.compile(r"\b(read|see|chap(?:ter)?|ch)\b\.?", re.I)
|
||||||
|
|
||||||
# Book prefix (allows "1Co", "2 Pet.", "Rom", etc.)
BOOK_RE = re.compile(
    r"""
    ^\s*
    (?:
        (?P<num>[1-3]|i{1,3})\s*                      # optional 1/2/3 or roman numeral prefix
    )?
    \s*
    (?P<book>[A-Za-z\.]+(?:\s+[A-Za-z\.]+){0,2})      # book words (1-3 words)
    """,
    re.X,
)
|
||||||
|
|
||||||
# chapter/verse piece like "4:6,7-9" or "21"
|
|
||||||
C_V_RE = re.compile(
|
C_V_RE = re.compile(
|
||||||
r"""
|
r"""
|
||||||
(?:
|
(?:
|
||||||
@ -150,48 +144,33 @@ def _canon_key(raw: str) -> str:
|
|||||||
return key
|
return key
|
||||||
|
|
||||||
def _canon_book(book_raw: str, num: Optional[str] = None) -> Optional[str]:
    """
    Map a raw book token (optionally qualified by a leading number) to its
    canonical abbreviation, e.g. ("cor", "1") -> "1 Cor.".

    Args:
        book_raw: The raw book text as typed by the user.
        num: An optional leading "1"/"2"/"3" (or roman numeral) captured
            separately from the book token.

    Returns:
        The canonical abbreviation string, or None when the token cannot be
        recognized as a book.
    """
    key = _canon_key(book_raw)

    # Ambiguous two-letter tokens: with an explicit leading number, bias
    # "co"/"cor" to Corinthians and "ti"/"tim" to Timothy.
    if num and key in ("co", "cor"):
        return f"{num} Cor."
    if num and key in ("ti", "tim"):
        return f"{num} Tim."

    # Direct lookup in the expanded variants map (already contains numbered
    # abbreviations such as "1 Cor.", so the value is returned as-is).
    if key in BOOK_MAP:
        return BOOK_MAP[key]

    # Users sometimes omit the space: "1co", "2ki", "iii jn".
    # FIX: named groups must use the (?P<name>...) syntax; the broken form
    # (?P(rest>...)) raises re.error at call time.
    m = re.match(r"(?P<n>[1-3]|i{1,3})(?P<rest>[A-Za-z].*)$", key)
    if m:
        n = m.group("n")
        rest = m.group("rest")
        if rest in BOOK_MAP:
            base = BOOK_MAP[rest]
            # Normalize roman numeral prefixes to arabic digits.
            n = {"i": "1", "ii": "2", "iii": "3"}.get(n, n)
            # If the mapped abbreviation already carries a number, replace it
            # with the number the user actually typed.
            if base[0] in "123":
                return f"{n} {base.split(' ', 1)[1]}"
            return f"{n} {base}"

    # Last chance: try the core canon table directly.
    return BOOK_CANON.get(key)
|
||||||
|
|
||||||
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
|
def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], Optional[str], bool]:
|
||||||
"""
|
|
||||||
Parse a semicolon-delimited segment.
|
|
||||||
Returns (book_canon, cv_string, preserve_raw).
|
|
||||||
- If preserve_raw is True, cv_string contains the original (cleaned) segment to keep verbatim.
|
|
||||||
"""
|
|
||||||
original = seg.strip()
|
original = seg.strip()
|
||||||
s = _clean_text(seg)
|
s = _clean_text(seg)
|
||||||
if not s:
|
if not s:
|
||||||
return (None, None, False)
|
return (None, None, False)
|
||||||
|
|
||||||
# Try to detect a leading book token
|
|
||||||
m = BOOK_RE.match(s)
|
m = BOOK_RE.match(s)
|
||||||
book = None
|
book = None
|
||||||
rest = s
|
rest = s
|
||||||
@ -204,18 +183,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
|
|||||||
book = canon
|
book = canon
|
||||||
rest = s[m.end():].strip(",;: .")
|
rest = s[m.end():].strip(",;: .")
|
||||||
else:
|
else:
|
||||||
# Not recognized as a book -> keep whole thing verbatim
|
|
||||||
return (None, original, True)
|
return (None, original, True)
|
||||||
|
|
||||||
if not book:
|
if not book:
|
||||||
# Inherit previous recognized book if we have one
|
|
||||||
if last_book:
|
if last_book:
|
||||||
book = last_book
|
book = last_book
|
||||||
else:
|
else:
|
||||||
# There is no book context — keep segment verbatim to avoid data loss
|
|
||||||
return (None, original, True)
|
return (None, original, True)
|
||||||
|
|
||||||
# Normalize the chapter/verse part
|
|
||||||
rest = re.sub(r"\s+", "", rest)
|
rest = re.sub(r"\s+", "", rest)
|
||||||
rest = re.sub(r":\s*", ":", rest)
|
rest = re.sub(r":\s*", ":", rest)
|
||||||
|
|
||||||
@ -229,22 +204,14 @@ def _parse_segment(seg: str, last_book: Optional[str]) -> Tuple[Optional[str], O
|
|||||||
cv = m2.group(0).replace(" ", "") if m2 else None
|
cv = m2.group(0).replace(" ", "") if m2 else None
|
||||||
|
|
||||||
if not cv:
|
if not cv:
|
||||||
# Nothing parseable after a valid book; just keep the book
|
|
||||||
return (book, None, False)
|
return (book, None, False)
|
||||||
|
|
||||||
# Normalize dashes, commas
|
|
||||||
cv = cv.replace("–", "-").replace("—", "-")
|
cv = cv.replace("–", "-").replace("—", "-")
|
||||||
cv = re.sub(r",\s*", ",", cv)
|
cv = re.sub(r",\s*", ",", cv)
|
||||||
|
|
||||||
return (book, cv, False)
|
return (book, cv, False)
|
||||||
|
|
||||||
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
|
def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
|
||||||
"""
|
|
||||||
Normalize an entire scripture_raw string.
|
|
||||||
- Unknown pieces are preserved verbatim.
|
|
||||||
- Known pieces are standardized and each segment repeats the book.
|
|
||||||
Returns (normalized_text, warnings).
|
|
||||||
"""
|
|
||||||
warnings: List[str] = []
|
warnings: List[str] = []
|
||||||
if not text:
|
if not text:
|
||||||
return ("", warnings)
|
return ("", warnings)
|
||||||
@ -257,7 +224,6 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
|
|||||||
book, cv, preserve = _parse_segment(piece, last_book)
|
book, cv, preserve = _parse_segment(piece, last_book)
|
||||||
if preserve:
|
if preserve:
|
||||||
out.append(cv or piece.strip())
|
out.append(cv or piece.strip())
|
||||||
# do not update last_book when we couldn't recognize a book
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if book and cv:
|
if book and cv:
|
||||||
@ -266,14 +232,11 @@ def normalize_scripture_field(text: str) -> Tuple[str, List[str]]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if book and not cv:
|
if book and not cv:
|
||||||
# book only (e.g., "Acts")
|
|
||||||
out.append(book)
|
out.append(book)
|
||||||
last_book = book
|
last_book = book
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If we get here: (no book, no preserve) — nothing useful to add
|
|
||||||
if piece.strip():
|
if piece.strip():
|
||||||
# As a final safeguard, keep original
|
|
||||||
out.append(piece.strip())
|
out.append(piece.strip())
|
||||||
|
|
||||||
norm = "; ".join(x.strip() for x in out if x.strip())
|
norm = "; ".join(x.strip() for x in out if x.strip())
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user