Update web/core/utils.py

This commit is contained in:
2025-08-13 15:08:26 +00:00
parent d52b89ee3f
commit bf084fc13c
+103 -92
View File
@@ -1,68 +1,69 @@
# core/utils.py
from __future__ import annotations
import csv import csv
import io import io
import re import re
from datetime import datetime from datetime import datetime
from typing import Dict, Any from typing import Any, Dict, List, Optional
from django.db import transaction from django.db import transaction
from core.models import Entry from core.models import Entry
# --- Search helpers restored -------------------------------------------------
from typing import List # =============================================================================
# Search helpers (used by views)
# =============================================================================
def terms(q: str) -> List[str]:
    """
    Break a search query into tokens.

    - A double-quoted phrase stays together as one token: "good shepherd"
    - Everything else is split on whitespace.
    - Empty/None input, and tokens that are blank after stripping, yield
      nothing.
    """
    if not q:
        return []

    # Group 1 is the inside of a "quoted phrase", group 2 a bare token.
    # findall() reports the group that did not take part as "".
    pairs = re.findall(r'"([^"]+)"|(\S+)', q)
    stripped = ((quoted or bare).strip() for quoted, bare in pairs)
    return [token for token in stripped if token]
def has_wildcards(s: Optional[str]) -> bool:
    """Report whether the text contains any of the characters *, ?, % or _.

    Empty or None input is never considered to contain wildcards.
    """
    if not s:
        return False
    for marker in ("*", "?", "%", "_"):
        if marker in s:
            return True
    return False
def wildcard_to_regex(s: Optional[str]) -> str:
    r"""
    Translate a * / ? wildcard expression into a regex fragment (intended for
    Django's iregex lookup).

    Steps:
      - None is treated as the empty string.
      - All regex metacharacters are escaped, then the escaped wildcards are
        turned back into regex: \* -> .* and \? -> .
      - The result is wrapped in '.*' on both sides so it matches anywhere.
      - Runs of adjacent '.*' are collapsed to a single '.*'.

    Only * and ? are translated; % and _ are left as literal characters.
    """
    escaped = re.escape("" if s is None else s)
    translated = escaped.replace(r"\*", ".*").replace(r"\?", ".")
    wrapped = ".*" + translated + ".*"
    # Wrapping (or input like "**") can leave ".*.*"; squeeze those down.
    return re.sub(r"(?:\.\*){2,}", ".*", wrapped)
# -----------------------------------------------------------------------------
# ==============================
# Helpers # =============================================================================
# ============================== # CSV import utilities
# =============================================================================
def _decode_bytes(b: bytes) -> str: def _decode_bytes(b: bytes) -> str:
# Keep BOM-safe decoding # BOM-safe decode
return b.decode("utf-8-sig", errors="replace") return b.decode("utf-8-sig", errors="replace")
@@ -76,27 +77,54 @@ def _sniff_dialect(txt: str):
def _norm_header(h: str) -> str: def _norm_header(h: str) -> str:
""" """
Normalize headers in a forgiving way: Normalize a header name in a forgiving way:
- lower-case - lower-case
- remove all non-alphanumerics - treat underscores as spaces
- collapse spaces/underscores - collapse spaces
- drop non-alphanumerics
""" """
if not h: if not h:
return "" return ""
h = h.strip().lower() h = h.strip().lower().replace("_", " ")
h = h.replace("_", " ")
h = re.sub(r"\s+", " ", h) h = re.sub(r"\s+", " ", h)
# drop everything non-alnum
h = re.sub(r"[^a-z0-9 ]+", "", h) h = re.sub(r"[^a-z0-9 ]+", "", h)
return h.replace(" ", "") return h.replace(" ", "")
def _build_header_map(headers: List[str]) -> Dict[str, str]:
    """
    Build a mapping of original header -> canonical importer key.

    Each header is normalized with _norm_header and looked up in the alias
    table below. Canonical keys are:
        subject, illustration, application, scripture, source,
        talk_title, talk_number, code, date, date_edited
    A header with no alias entry maps to its own normalized form.
    """
    aliases = {
        "subject": "subject",
        "illustration": "illustration",
        "application": "application",
        "scripture": "scripture",
        "source": "source",
        "talktitle": "talk_title",
        "title": "talk_title",
        "talknumber": "talk_number",
        "number": "talk_number",
        "code": "code",
        "date": "date",
        "dateedited": "date_edited",
        "edited": "date_edited",
    }
    normalized = ((header, _norm_header(header)) for header in headers)
    return {header: aliases.get(key, key) for header, key in normalized}
def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str: def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
# Look up using canonical key -> original header """Case/spacing-insensitive value lookup."""
for orig, can in hdr_map.items(): for original, mapped in hdr_map.items():
if can == canon: if mapped == canon:
v = row.get(orig, "") return (row.get(original) or "").strip()
return (v or "").strip()
return "" return ""
@@ -117,9 +145,14 @@ def _parse_date(s: str):
return None return None
# ============================== def _parse_int(s: str) -> Optional[int]:
# Public: import_csv_bytes """Return an int from a string (tolerates commas), else None."""
# ============================== s = (s or "").strip()
if not s:
return None
m = re.match(r"^-?\d+", s.replace(",", ""))
return int(m.group(0)) if m else None
def import_csv_bytes( def import_csv_bytes(
csv_bytes: bytes, csv_bytes: bytes,
@@ -128,16 +161,20 @@ def import_csv_bytes(
# tune these if you changed model field sizes # tune these if you changed model field sizes
max_source=255, max_source=255,
max_code=128, max_code=128,
max_talk_number=128, max_talk_number=128, # only affects clipping BEFORE int parse; int parse handles None
max_talk_title=512, max_talk_title=512,
max_scripture=512, max_scripture=512,
): ):
""" """
Import CSV seed in an idempotent/upsert fashion. Import CSV seed in an idempotent/upsert fashion.
Mapping (case/spacing-insensitive): Expected headers (case/spacing-insensitive):
Subject, Illustration, Application, Scripture, Source, Subject, Illustration, Application, Scripture, Source,
Talk Title, Talk Number, Code, Date, Date Edited Talk Title, Talk Number, Code, Date, Date Edited
Upsert rule:
1) Prefer Code if present (treat as external key).
2) Else fall back to the triple (subject, illustration, application).
""" """
text = _decode_bytes(csv_bytes) text = _decode_bytes(csv_bytes)
dialect = _sniff_dialect(text) dialect = _sniff_dialect(text)
@@ -145,36 +182,15 @@ def import_csv_bytes(
rdr = csv.DictReader(f, dialect=dialect) rdr = csv.DictReader(f, dialect=dialect)
seen_headers = [h.strip() for h in (rdr.fieldnames or [])] seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
header_map = _build_header_map(seen_headers)
# Build header normalization map
# Canonical keys we expect:
# subject illustration application scripture source talktitle talknumber code date dateedited
canon_targets = {
"subject": "subject",
"illustration": "illustration",
"application": "application",
"scripture": "scripture",
"source": "source",
"talktitle": "talk_title",
"title": "talk_title",
"talknumber": "talk_number",
"number": "talk_number",
"code": "code",
"date": "date",
"dateedited": "date_edited",
"edited": "date_edited",
}
header_map = {}
for h in seen_headers:
header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h)) # unknowns still map to their norm
inserted = updated = skipped = 0 inserted = updated = skipped = 0
errors = [] errors: List[str] = []
scripture_parsed = 0 scripture_parsed = 0
with transaction.atomic():
for idx, row in enumerate(rdr, start=2): # data starts at line 2 for idx, row in enumerate(rdr, start=2): # data starts at line 2
try: try:
with transaction.atomic():
subject = _getv(row, header_map, "subject") subject = _getv(row, header_map, "subject")
illustration = _getv(row, header_map, "illustration") illustration = _getv(row, header_map, "illustration")
application = _getv(row, header_map, "application") application = _getv(row, header_map, "application")
@@ -182,71 +198,66 @@ def import_csv_bytes(
scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture) scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
source = _clip(_getv(row, header_map, "source"), max_source) source = _clip(_getv(row, header_map, "source"), max_source)
talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title) talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
# Safe talk number parse (non-numeric -> None)
talk_number_raw = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
talk_number = _parse_int(talk_number_raw)
entry_code = _clip(_getv(row, header_map, "code"), max_code) entry_code = _clip(_getv(row, header_map, "code"), max_code)
date_added = _parse_date(_getv(row, header_map, "date")) date_added = _parse_date(_getv(row, header_map, "date"))
date_edited = _parse_date(_getv(row, header_map, "date_edited")) date_edited = _parse_date(_getv(row, header_map, "date_edited"))
# Decide how to find an existing row: # Find existing
# 1) Prefer Code if present (treat as external key) obj: Optional[Entry] = None
# 2) Else fall back to (subject, illustration, application)
obj = None
if entry_code: if entry_code:
obj = Entry.objects.filter(entry_code=entry_code).first() obj = Entry.objects.filter(entry_code=entry_code).first()
if obj is None: if obj is None:
obj = Entry.objects.filter( obj = Entry.objects.filter(
subject=subject, illustration=illustration, application=application subject=subject,
illustration=illustration,
application=application,
).first() ).first()
created = obj is None created = obj is None
if created: if created:
obj = Entry() obj = Entry()
# Assign fields # Assign
obj.subject = subject obj.subject = subject
obj.illustration = illustration obj.illustration = illustration
obj.application = application obj.application = application
obj.scripture_raw = scripture_raw obj.scripture_raw = scripture_raw
obj.source = source obj.source = source
obj.talk_title = talk_title obj.talk_title = talk_title
obj.talk_number = talk_number obj.talk_number = talk_number # None is fine for IntegerField
obj.entry_code = entry_code obj.entry_code = entry_code
if date_added: if date_added:
obj.date_added = date_added obj.date_added = date_added
if date_edited: if date_edited:
obj.date_edited = date_edited obj.date_edited = date_edited
if dry_run: if not dry_run:
updated += 1 if not created else 0
inserted += 1 if created else 0
else:
obj.save() obj.save()
if created: if created:
inserted += 1 inserted += 1
else: else:
updated += 1 updated += 1
# (Optional) quick scripture counter — we're not parsing here,
# but keep a metric like your previous report
if scripture_raw: if scripture_raw:
scripture_parsed += 1 scripture_parsed += 1
except Exception as e: except Exception as e:
skipped += 1 skipped += 1
# keep error list compact errors.append(f"line {idx}: {type(e).__name__}: {e}")
msg = str(e)
if "value too long for type" in msg and max(msg.count("\n"), 0) == 0:
errors.append("value too long for type character varying(...)")
else:
errors.append(msg)
return { return {
"rows": inserted + updated + skipped, "rows": inserted + updated + skipped,
"inserted": inserted, "inserted": inserted,
"updated": updated, "updated": updated,
"skipped": skipped, "skipped": skipped,
"errors": errors[:200], # cap to avoid huge output "errors": errors[:200], # cap output
"scripture_parsed": scripture_parsed, "scripture_parsed": scripture_parsed,
"scripture_failed": 0, "scripture_failed": 0,
"dialect_delimiter": dialect.delimiter, "dialect_delimiter": dialect.delimiter,