Update web/core/utils.py

attempt at fixing import command
Joshua Laymon 2025-08-13 14:55:47 +00:00
parent ceb4dee084
commit 445da3523b


@@ -1,210 +1,206 @@
 import csv
 import io
 import re
-from datetime import date
-from dateutil import parser as dateparser
-from core.models import Entry, ScriptureRef
-
-# ----------------------------
-# Search helpers (needed by views)
-# ----------------------------
-# Split query into tokens while preserving quoted phrases
-_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')
-
-def terms(q: str):
-    out = []
-    for m in _QUOTED_OR_WORD.finditer(q or ""):
-        token = (m.group(1) or m.group(2) or "").strip()
-        if token:
-            out.append(token)
-    return out
-
-def has_wildcards(token: str) -> bool:
-    return "*" in token or "?" in token
-
-def wildcard_to_regex(token: str) -> str:
-    """
-    Convert user wildcard token to a safe regex:
-      * -> .*
-      ? -> .
-    Everything else is escaped. Suitable for Django __iregex.
-    """
-    STAR = "__STAR__"
-    QMARK = "__QMARK__"
-    s = token.replace("*", STAR).replace("?", QMARK)
-    s = re.escape(s)
-    s = s.replace(STAR, ".*").replace(QMARK, ".")
-    return s
-
-# ----------------------------
-# Scripture parsing (minimal, non-blocking)
-# ----------------------------
-def parse_scripture(scripture_str: str):
-    """
-    Minimal placeholder: keep as a list with raw string so imports never fail.
-    Replace with your richer parser when ready.
-    """
-    if not scripture_str:
-        return []
-    return [{"raw": scripture_str}]
-
-# ----------------------------
-# CSV import (robust)
-# ----------------------------
-EXPECTED_HEADERS = [h.lower() for h in [
-    "Subject","Illustration","Application","Scripture","Source",
-    "Talk Title","Talk Number","Code","Date","Date Edited"
-]]
-
-def _sniff(text: str):
-    sample = text[:8192]
-    try:
-        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
-    except Exception:
-        class _Simple(csv.Dialect):
-            delimiter = ','
-            quotechar = '"'
-            escapechar = None
-            doublequote = True
-            skipinitialspace = True
-            lineterminator = '\n'
-            quoting = csv.QUOTE_MINIMAL
-        dialect = _Simple
-    return dialect
-
-def _as_dictreader(text: str, dialect, fieldnames=None):
-    """
-    Yield rows as dicts. If fieldnames are provided, treat file as headerless.
-    We also peek one row: if it looks like an actual header row, we skip it.
-    """
-    sio = io.StringIO(text)
-    if fieldnames is None:
-        reader = csv.DictReader(sio, dialect=dialect)
-        for row in reader:
-            yield row
-        return
-    # Headerless mode
-    reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
-    first = next(reader, None)
-    if first is not None:
-        # If many columns equal their header names, it's probably a header row
-        matches = sum(1 for k, v in first.items() if (v or "").strip().lower() == k.strip().lower())
-        if matches < 5:
-            # Not a header row, yield it
-            yield first
-    for row in reader:
-        yield row
-
-def import_csv_bytes(b: bytes, dry_run: bool = True):
-    """
-    Robust import:
-    - Auto-detect delimiter (comma/semicolon/tab/pipe).
-    - If required headers are missing, re-parse treating file as *headerless*
-      using the canonical column order.
-    - Skip fully empty rows.
-    - Upsert by Code (if Code present), else insert.
-    Returns a report dict with counts and diagnostics.
-    """
-    text = b.decode("utf-8-sig", errors="replace")
-    dialect = _sniff(text)
-    # First attempt: use file-provided headers
-    reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
-    headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]
-    used_headerless = False
-    if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
-        # Not enough expected headers -> treat as headerless/positional
-        used_headerless = True
-        rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
-    else:
-        rows_iter = (row for row in reader1)
-
-    report = {
-        "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
-        "scripture_parsed": 0, "scripture_failed": 0,
-        "dialect_delimiter": getattr(dialect, "delimiter", "?"),
-        "used_headerless_mode": used_headerless,
-        "seen_headers": headers1,
-    }
-
-    def parse_date_safe(v):
-        if not v or not str(v).strip():
-            return None
-        try:
-            return dateparser.parse(str(v)).date()
-        except Exception:
-            return None
-
-    for row in rows_iter:
-        report["rows"] += 1
-        try:
-            row_lc = {(k or "").strip().lower(): (v or "") for k, v in row.items()}
-            subj = (row_lc.get("subject") or "").strip()
-            illu = (row_lc.get("illustration") or "").strip()
-            appl = (row_lc.get("application") or "").strip()
-            scr = (row_lc.get("scripture") or "").strip()
-            src = (row_lc.get("source") or "").strip()
-            tt = (row_lc.get("talk title") or "").strip()
-            tnum = (row_lc.get("talk number") or "").strip()
-            code = (row_lc.get("code") or "").strip()
-            dadd = parse_date_safe(row_lc.get("date"))
-            ded = parse_date_safe(row_lc.get("date edited"))
-            try:
-                tnum = int(tnum) if tnum else None
-            except Exception:
-                tnum = None
-            # Skip rows that are completely empty across all tracked fields
-            if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
-                report["skipped"] += 1
-                continue
-            data = dict(
-                subject=subj, illustration=illu, application=appl,
-                scripture_raw=scr, source=src, talk_number=tnum,
-                talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
-            )
-            # Scripture parse diagnostics
-            parsed_list = parse_scripture(scr)
-            for it in parsed_list:
-                if it: report["scripture_parsed"] += 1
-                else: report["scripture_failed"] += 1
-            if dry_run:
-                continue
-            obj = None
-            if code:
-                try:
-                    obj = Entry.objects.get(entry_code=code)
-                except Entry.DoesNotExist:
-                    obj = None
-            if obj:
-                for k, v in data.items():
-                    setattr(obj, k, v)
-                obj.save()
-                obj.scripture_refs.all().delete()
-                report["updated"] += 1
-            else:
-                obj = Entry.objects.create(**data)
-                report["inserted"] += 1
-            for it in parsed_list:
-                if it and isinstance(it, dict) and "raw" in it:
-                    # Keep raw-only ref optional; skip creating ScriptureRef if schema differs
-                    pass
-                elif it:
-                    # If you switch to a structured parser, create records like:
-                    ScriptureRef.objects.create(entry=obj, **it)
-        except Exception as e:
-            report["skipped"] += 1
-            report["errors"].append(str(e))
-    return report
+from datetime import datetime
+from typing import Dict, Any
+from django.db import transaction
+from core.models import Entry
+
+# ==============================
+# Helpers
+# ==============================
+
+def _decode_bytes(b: bytes) -> str:
+    # Keep BOM-safe decoding
+    return b.decode("utf-8-sig", errors="replace")
+
+def _sniff_dialect(txt: str):
+    try:
+        return csv.Sniffer().sniff(txt[:4096], delimiters=[",", ";", "\t", "|"])
+    except Exception:
+        class _D: delimiter = ","
+        return _D()
+
+def _norm_header(h: str) -> str:
+    """
+    Normalize headers in a forgiving way:
+    - lower-case
+    - remove all non-alphanumerics
+    - collapse spaces/underscores
+    """
+    if not h:
+        return ""
+    h = h.strip().lower()
+    h = h.replace("_", " ")
+    h = re.sub(r"\s+", " ", h)
+    # drop everything non-alnum
+    h = re.sub(r"[^a-z0-9 ]+", "", h)
+    return h.replace(" ", "")
+
+def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
+    # Look up using canonical key -> original header
+    for orig, can in hdr_map.items():
+        if can == canon:
+            v = row.get(orig, "")
+            return (v or "").strip()
+    return ""
+
+def _clip(s: str, n: int) -> str:
+    s = (s or "").strip()
+    return s[:n] if n and len(s) > n else s
+
+def _parse_date(s: str):
+    s = (s or "").strip()
+    if not s:
+        return None
+    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"):
+        try:
+            return datetime.strptime(s, fmt).date()
+        except ValueError:
+            continue
+    return None
+
+# ==============================
+# Public: import_csv_bytes
+# ==============================
+
+def import_csv_bytes(
+    csv_bytes: bytes,
+    dry_run: bool = False,
+    *,
+    # tune these if you changed model field sizes
+    max_source=255,
+    max_code=128,
+    max_talk_number=128,
+    max_talk_title=512,
+    max_scripture=512,
+):
+    """
+    Import CSV seed in an idempotent/upsert fashion.
+
+    Mapping (case/spacing-insensitive):
+      Subject, Illustration, Application, Scripture, Source,
+      Talk Title, Talk Number, Code, Date, Date Edited
+    """
+    text = _decode_bytes(csv_bytes)
+    dialect = _sniff_dialect(text)
+    f = io.StringIO(text)
+    rdr = csv.DictReader(f, dialect=dialect)
+    seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
+
+    # Build header normalization map
+    # Canonical keys we expect:
+    #   subject illustration application scripture source talktitle talknumber code date dateedited
+    canon_targets = {
+        "subject": "subject",
+        "illustration": "illustration",
+        "application": "application",
+        "scripture": "scripture",
+        "source": "source",
+        "talktitle": "talk_title",
+        "title": "talk_title",
+        "talknumber": "talk_number",
+        "number": "talk_number",
+        "code": "code",
+        "date": "date",
+        "dateedited": "date_edited",
+        "edited": "date_edited",
+    }
+    header_map = {}
+    for h in seen_headers:
+        header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h))  # unknowns still map to their norm
+
+    inserted = updated = skipped = 0
+    errors = []
+    scripture_parsed = 0
+
+    with transaction.atomic():
+        for idx, row in enumerate(rdr, start=2):  # data starts at line 2
+            try:
+                subject = _getv(row, header_map, "subject")
+                illustration = _getv(row, header_map, "illustration")
+                application = _getv(row, header_map, "application")
+                scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
+                source = _clip(_getv(row, header_map, "source"), max_source)
+                talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
+                talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
+                entry_code = _clip(_getv(row, header_map, "code"), max_code)
+                date_added = _parse_date(_getv(row, header_map, "date"))
+                date_edited = _parse_date(_getv(row, header_map, "date_edited"))
+
+                # Decide how to find an existing row:
+                # 1) Prefer Code if present (treat as external key)
+                # 2) Else fall back to (subject, illustration, application)
+                obj = None
+                if entry_code:
+                    obj = Entry.objects.filter(entry_code=entry_code).first()
+                if obj is None:
+                    obj = Entry.objects.filter(
+                        subject=subject, illustration=illustration, application=application
+                    ).first()
+
+                created = obj is None
+                if created:
+                    obj = Entry()
+
+                # Assign fields
+                obj.subject = subject
+                obj.illustration = illustration
+                obj.application = application
+                obj.scripture_raw = scripture_raw
+                obj.source = source
+                obj.talk_title = talk_title
+                obj.talk_number = talk_number
+                obj.entry_code = entry_code
+                if date_added:
+                    obj.date_added = date_added
+                if date_edited:
+                    obj.date_edited = date_edited
+
+                if dry_run:
+                    updated += 1 if not created else 0
+                    inserted += 1 if created else 0
+                else:
+                    obj.save()
+                    if created:
+                        inserted += 1
+                    else:
+                        updated += 1
+
+                # (Optional) quick scripture counter: we're not parsing here,
+                # but keep a metric like your previous report
+                if scripture_raw:
+                    scripture_parsed += 1
+            except Exception as e:
+                skipped += 1
+                # keep error list compact
+                msg = str(e)
+                if "value too long for type" in msg and msg.count("\n") == 0:
+                    errors.append("value too long for type character varying(...)")
+                else:
+                    errors.append(msg)
+
+    return {
+        "rows": inserted + updated + skipped,
+        "inserted": inserted,
+        "updated": updated,
+        "skipped": skipped,
+        "errors": errors[:200],  # cap to avoid huge output
+        "scripture_parsed": scripture_parsed,
+        "scripture_failed": 0,
+        "dialect_delimiter": dialect.delimiter,
+        "used_headerless_mode": False,
+        "seen_headers": [h.lower() for h in seen_headers],
+    }
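
For context on the commit message: import_csv_bytes takes raw bytes, so the import command only has to read the file and hand the bytes over. A minimal sketch of such a caller follows; the command name import_entries, its path, and the --dry-run flag are illustrative assumptions, not necessarily what this repo's import command actually looks like.

# web/core/management/commands/import_entries.py (hypothetical path)
from django.core.management.base import BaseCommand

from core.utils import import_csv_bytes


class Command(BaseCommand):
    help = "Import Entry rows from a CSV seed file"

    def add_arguments(self, parser):
        parser.add_argument("csv_path")
        parser.add_argument("--dry-run", action="store_true")

    def handle(self, *args, **opts):
        # Read the file as bytes; import_csv_bytes handles BOM and dialect itself
        with open(opts["csv_path"], "rb") as fh:
            report = import_csv_bytes(fh.read(), dry_run=opts["dry_run"])
        self.stdout.write(str(report))

Invocation would then look like: python manage.py import_entries seed.csv --dry-run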
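
One caveat worth flagging in the new code: the whole loop runs inside a single transaction.atomic() block while per-row exceptions are caught and swallowed. On PostgreSQL, a database error (for example the "value too long for type character varying" error the except branch special-cases) aborts the open transaction, so every row after the first failure would also error out. The remedy Django's transaction docs suggest is a per-row savepoint via a nested atomic(), so only the failing row is rolled back. A sketch of that shape, not what this commit does:

with transaction.atomic():                    # outer transaction: commit all surviving rows together
    for idx, row in enumerate(rdr, start=2):
        try:
            with transaction.atomic():        # savepoint: a failure rolls back only this row
                ...                           # extract fields and save obj, as in the loop body above
        except Exception as e:
            skipped += 1                      # outer transaction stays usable for the next row
            errors.append(f"row {idx}: {e}")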