Update web/core/utils.py
This commit is contained in:
+103
-92
@@ -1,68 +1,69 @@
|
|||||||
|
# core/utils.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, Any
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
from core.models import Entry
|
from core.models import Entry
|
||||||
|
|
||||||
# --- Search helpers restored -------------------------------------------------
|
|
||||||
from typing import List
|
# =============================================================================
|
||||||
|
# Search helpers (used by views)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
def terms(q: str) -> List[str]:
    """
    Split a query string into search tokens.

    - Quoted phrases are kept together: "good shepherd" becomes one token.
    - Unquoted text splits on whitespace.
    - Empty, whitespace-only or None input returns [].
    - Stray double quotes (an unbalanced opening quote, or an empty pair
      of quotes) never leak into the returned tokens.
    """
    if not q:
        return []

    # Group 1: contents of a balanced quoted phrase (may be empty).
    # Group 2: a bare run of non-whitespace characters.
    rx = re.compile(r'"([^"]*)"|(\S+)')

    out: List[str] = []
    for m in rx.finditer(q):
        if m.group(1) is not None:
            piece = m.group(1)
        else:
            # An unbalanced quote falls through to the bare-token branch;
            # trim quote characters from the edges so they are not searched
            # for literally.
            piece = m.group(2).strip('"')
        t = piece.strip()
        if t:
            out.append(t)
    return out
|
||||||
|
|
||||||
def has_wildcards(s: Optional[str]) -> bool:
    """
    Return True if *s* contains any wildcard character.

    Recognized characters are the FileMaker-style ``*`` and ``?`` plus the
    SQL-style ``%`` and ``_``. None or an empty string returns False.
    """
    if not s:
        return False
    return any(ch in s for ch in ("*", "?", "%", "_"))
|
||||||
|
|
||||||
def wildcard_to_regex(s: Optional[str]) -> str:
    r"""
    Convert FileMaker-style wildcards to a regex fragment suitable for
    Django's iregex lookup.

    Rules:
    - Regex metacharacters are escaped first, then \* -> .* and \? -> .
    - The result is wrapped in '.*' so it matches anywhere (like icontains).
    - Runs of consecutive '.*' are collapsed into a single '.*'.
    - % and _ are not translated; they match literally.
    - None is treated as an empty string, which yields '.*'.

    No case-insensitivity flag is added here; the caller is expected to
    use iregex so matching is case-insensitive.
    """
    if s is None:
        s = ""

    # Escape everything, then turn the (now escaped) wildcards back into
    # their regex equivalents.
    pat = re.escape(s)
    pat = pat.replace(r"\*", ".*").replace(r"\?", ".")

    # Match anywhere in the field by default.
    pat = f".*{pat}.*"

    # '**' or a leading/trailing '*' would otherwise produce '.*.*'.
    return re.sub(r"(?:\.\*){2,}", ".*", pat)
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# ==============================
|
|
||||||
# Helpers
|
# =============================================================================
|
||||||
# ==============================
|
# CSV import utilities
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
def _decode_bytes(b: bytes) -> str:
|
def _decode_bytes(b: bytes) -> str:
|
||||||
# Keep BOM-safe decoding
|
# BOM-safe decode
|
||||||
return b.decode("utf-8-sig", errors="replace")
|
return b.decode("utf-8-sig", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
@@ -76,27 +77,54 @@ def _sniff_dialect(txt: str):
|
|||||||
|
|
||||||
def _norm_header(h: str) -> str:
|
def _norm_header(h: str) -> str:
|
||||||
"""
|
"""
|
||||||
Normalize headers in a forgiving way:
|
Normalize a header name in a forgiving way:
|
||||||
- lower-case
|
- lower-case
|
||||||
- remove all non-alphanumerics
|
- treat underscores as spaces
|
||||||
- collapse spaces/underscores
|
- collapse spaces
|
||||||
|
- drop non-alphanumerics
|
||||||
"""
|
"""
|
||||||
if not h:
|
if not h:
|
||||||
return ""
|
return ""
|
||||||
h = h.strip().lower()
|
h = h.strip().lower().replace("_", " ")
|
||||||
h = h.replace("_", " ")
|
|
||||||
h = re.sub(r"\s+", " ", h)
|
h = re.sub(r"\s+", " ", h)
|
||||||
# drop everything non-alnum
|
|
||||||
h = re.sub(r"[^a-z0-9 ]+", "", h)
|
h = re.sub(r"[^a-z0-9 ]+", "", h)
|
||||||
return h.replace(" ", "")
|
return h.replace(" ", "")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_header_map(headers: List[str]) -> Dict[str, str]:
    """
    Map each original CSV header to the canonical key the importer expects.

    Canonical keys used internally:
        subject, illustration, application, scripture, source,
        talk_title, talk_number, code, date, date_edited

    Headers are compared after _norm_header() normalization. A header that
    matches no known alias maps to its own normalized name.
    """
    # Normalized header (or alias) -> canonical key.
    canon_targets = {
        "subject": "subject",
        "illustration": "illustration",
        "application": "application",
        "scripture": "scripture",
        "source": "source",
        "talktitle": "talk_title",
        "title": "talk_title",
        "talknumber": "talk_number",
        "number": "talk_number",
        "code": "code",
        "date": "date",
        "dateedited": "date_edited",
        "edited": "date_edited",
    }
    out: Dict[str, str] = {}
    for h in headers:
        norm = _norm_header(h)
        out[h] = canon_targets.get(norm, norm)  # unknowns keep their normalized name
    return out
|
||||||
|
|
||||||
|
|
||||||
def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
|
def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
|
||||||
# Look up using canonical key -> original header
|
"""Case/spacing-insensitive value lookup."""
|
||||||
for orig, can in hdr_map.items():
|
for original, mapped in hdr_map.items():
|
||||||
if can == canon:
|
if mapped == canon:
|
||||||
v = row.get(orig, "")
|
return (row.get(original) or "").strip()
|
||||||
return (v or "").strip()
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
@@ -117,9 +145,14 @@ def _parse_date(s: str):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# ==============================
|
def _parse_int(s: str) -> Optional[int]:
|
||||||
# Public: import_csv_bytes
|
"""Return an int from a string (tolerates commas), else None."""
|
||||||
# ==============================
|
s = (s or "").strip()
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
m = re.match(r"^-?\d+", s.replace(",", ""))
|
||||||
|
return int(m.group(0)) if m else None
|
||||||
|
|
||||||
|
|
||||||
def import_csv_bytes(
|
def import_csv_bytes(
|
||||||
csv_bytes: bytes,
|
csv_bytes: bytes,
|
||||||
@@ -128,16 +161,20 @@ def import_csv_bytes(
|
|||||||
# tune these if you changed model field sizes
|
# tune these if you changed model field sizes
|
||||||
max_source=255,
|
max_source=255,
|
||||||
max_code=128,
|
max_code=128,
|
||||||
max_talk_number=128,
|
max_talk_number=128, # only affects clipping BEFORE int parse; int parse handles None
|
||||||
max_talk_title=512,
|
max_talk_title=512,
|
||||||
max_scripture=512,
|
max_scripture=512,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Import CSV seed in an idempotent/upsert fashion.
|
Import CSV seed in an idempotent/upsert fashion.
|
||||||
|
|
||||||
Mapping (case/spacing-insensitive):
|
Expected headers (case/spacing-insensitive):
|
||||||
Subject, Illustration, Application, Scripture, Source,
|
Subject, Illustration, Application, Scripture, Source,
|
||||||
Talk Title, Talk Number, Code, Date, Date Edited
|
Talk Title, Talk Number, Code, Date, Date Edited
|
||||||
|
|
||||||
|
Upsert rule:
|
||||||
|
1) Prefer Code if present (treat as external key).
|
||||||
|
2) Else fall back to the triple (subject, illustration, application).
|
||||||
"""
|
"""
|
||||||
text = _decode_bytes(csv_bytes)
|
text = _decode_bytes(csv_bytes)
|
||||||
dialect = _sniff_dialect(text)
|
dialect = _sniff_dialect(text)
|
||||||
@@ -145,36 +182,15 @@ def import_csv_bytes(
|
|||||||
rdr = csv.DictReader(f, dialect=dialect)
|
rdr = csv.DictReader(f, dialect=dialect)
|
||||||
|
|
||||||
seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
|
seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
|
||||||
|
header_map = _build_header_map(seen_headers)
|
||||||
# Build header normalization map
|
|
||||||
# Canonical keys we expect:
|
|
||||||
# subject illustration application scripture source talktitle talknumber code date dateedited
|
|
||||||
canon_targets = {
|
|
||||||
"subject": "subject",
|
|
||||||
"illustration": "illustration",
|
|
||||||
"application": "application",
|
|
||||||
"scripture": "scripture",
|
|
||||||
"source": "source",
|
|
||||||
"talktitle": "talk_title",
|
|
||||||
"title": "talk_title",
|
|
||||||
"talknumber": "talk_number",
|
|
||||||
"number": "talk_number",
|
|
||||||
"code": "code",
|
|
||||||
"date": "date",
|
|
||||||
"dateedited": "date_edited",
|
|
||||||
"edited": "date_edited",
|
|
||||||
}
|
|
||||||
header_map = {}
|
|
||||||
for h in seen_headers:
|
|
||||||
header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h)) # unknowns still map to their norm
|
|
||||||
|
|
||||||
inserted = updated = skipped = 0
|
inserted = updated = skipped = 0
|
||||||
errors = []
|
errors: List[str] = []
|
||||||
scripture_parsed = 0
|
scripture_parsed = 0
|
||||||
|
|
||||||
with transaction.atomic():
|
|
||||||
for idx, row in enumerate(rdr, start=2): # data starts at line 2
|
for idx, row in enumerate(rdr, start=2): # data starts at line 2
|
||||||
try:
|
try:
|
||||||
|
with transaction.atomic():
|
||||||
subject = _getv(row, header_map, "subject")
|
subject = _getv(row, header_map, "subject")
|
||||||
illustration = _getv(row, header_map, "illustration")
|
illustration = _getv(row, header_map, "illustration")
|
||||||
application = _getv(row, header_map, "application")
|
application = _getv(row, header_map, "application")
|
||||||
@@ -182,71 +198,66 @@ def import_csv_bytes(
|
|||||||
scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
|
scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
|
||||||
source = _clip(_getv(row, header_map, "source"), max_source)
|
source = _clip(_getv(row, header_map, "source"), max_source)
|
||||||
talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
|
talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
|
||||||
talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
|
|
||||||
|
# Safe talk number parse (non-numeric -> None)
|
||||||
|
talk_number_raw = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
|
||||||
|
talk_number = _parse_int(talk_number_raw)
|
||||||
|
|
||||||
entry_code = _clip(_getv(row, header_map, "code"), max_code)
|
entry_code = _clip(_getv(row, header_map, "code"), max_code)
|
||||||
|
|
||||||
date_added = _parse_date(_getv(row, header_map, "date"))
|
date_added = _parse_date(_getv(row, header_map, "date"))
|
||||||
date_edited = _parse_date(_getv(row, header_map, "date_edited"))
|
date_edited = _parse_date(_getv(row, header_map, "date_edited"))
|
||||||
|
|
||||||
# Decide how to find an existing row:
|
# Find existing
|
||||||
# 1) Prefer Code if present (treat as external key)
|
obj: Optional[Entry] = None
|
||||||
# 2) Else fall back to (subject, illustration, application)
|
|
||||||
obj = None
|
|
||||||
if entry_code:
|
if entry_code:
|
||||||
obj = Entry.objects.filter(entry_code=entry_code).first()
|
obj = Entry.objects.filter(entry_code=entry_code).first()
|
||||||
if obj is None:
|
if obj is None:
|
||||||
obj = Entry.objects.filter(
|
obj = Entry.objects.filter(
|
||||||
subject=subject, illustration=illustration, application=application
|
subject=subject,
|
||||||
|
illustration=illustration,
|
||||||
|
application=application,
|
||||||
).first()
|
).first()
|
||||||
|
|
||||||
created = obj is None
|
created = obj is None
|
||||||
if created:
|
if created:
|
||||||
obj = Entry()
|
obj = Entry()
|
||||||
|
|
||||||
# Assign fields
|
# Assign
|
||||||
obj.subject = subject
|
obj.subject = subject
|
||||||
obj.illustration = illustration
|
obj.illustration = illustration
|
||||||
obj.application = application
|
obj.application = application
|
||||||
obj.scripture_raw = scripture_raw
|
obj.scripture_raw = scripture_raw
|
||||||
obj.source = source
|
obj.source = source
|
||||||
obj.talk_title = talk_title
|
obj.talk_title = talk_title
|
||||||
obj.talk_number = talk_number
|
obj.talk_number = talk_number # None is fine for IntegerField
|
||||||
obj.entry_code = entry_code
|
obj.entry_code = entry_code
|
||||||
if date_added:
|
if date_added:
|
||||||
obj.date_added = date_added
|
obj.date_added = date_added
|
||||||
if date_edited:
|
if date_edited:
|
||||||
obj.date_edited = date_edited
|
obj.date_edited = date_edited
|
||||||
|
|
||||||
if dry_run:
|
if not dry_run:
|
||||||
updated += 1 if not created else 0
|
|
||||||
inserted += 1 if created else 0
|
|
||||||
else:
|
|
||||||
obj.save()
|
obj.save()
|
||||||
|
|
||||||
if created:
|
if created:
|
||||||
inserted += 1
|
inserted += 1
|
||||||
else:
|
else:
|
||||||
updated += 1
|
updated += 1
|
||||||
|
|
||||||
# (Optional) quick scripture counter — we’re not parsing here,
|
|
||||||
# but keep a metric like your previous report
|
|
||||||
if scripture_raw:
|
if scripture_raw:
|
||||||
scripture_parsed += 1
|
scripture_parsed += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
# keep error list compact
|
errors.append(f"line {idx}: {type(e).__name__}: {e}")
|
||||||
msg = str(e)
|
|
||||||
if "value too long for type" in msg and max(msg.count("\n"), 0) == 0:
|
|
||||||
errors.append("value too long for type character varying(...)")
|
|
||||||
else:
|
|
||||||
errors.append(msg)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"rows": inserted + updated + skipped,
|
"rows": inserted + updated + skipped,
|
||||||
"inserted": inserted,
|
"inserted": inserted,
|
||||||
"updated": updated,
|
"updated": updated,
|
||||||
"skipped": skipped,
|
"skipped": skipped,
|
||||||
"errors": errors[:200], # cap to avoid huge output
|
"errors": errors[:200], # cap output
|
||||||
"scripture_parsed": scripture_parsed,
|
"scripture_parsed": scripture_parsed,
|
||||||
"scripture_failed": 0,
|
"scripture_failed": 0,
|
||||||
"dialect_delimiter": dialect.delimiter,
|
"dialect_delimiter": dialect.delimiter,
|
||||||
|
|||||||
Reference in New Issue
Block a user