Update web/core/utils.py

attempt at fixing import command
Joshua Laymon 2025-08-13 14:55:47 +00:00
parent ceb4dee084
commit 445da3523b


@@ -1,210 +1,206 @@
Before (removed): web/core/utils.py

import csv
import io
import re
from datetime import date
from dateutil import parser as dateparser

from core.models import Entry, ScriptureRef


# ----------------------------
# Search helpers (needed by views)
# ----------------------------
# Split query into tokens while preserving quoted phrases
_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')


def terms(q: str):
    out = []
    for m in _QUOTED_OR_WORD.finditer(q or ""):
        token = (m.group(1) or m.group(2) or "").strip()
        if token:
            out.append(token)
    return out


def has_wildcards(token: str) -> bool:
    return "*" in token or "?" in token


def wildcard_to_regex(token: str) -> str:
    """
    Convert user wildcard token to a safe regex:
      * -> .*
      ? -> .
    Everything else is escaped. Suitable for Django __iregex.
    """
    STAR = "__STAR__"
    QMARK = "__QMARK__"
    s = token.replace("*", STAR).replace("?", QMARK)
    s = re.escape(s)
    s = s.replace(STAR, ".*").replace(QMARK, ".")
    return s


# ----------------------------
# Scripture parsing (minimal, non-blocking)
# ----------------------------
def parse_scripture(scripture_str: str):
    """
    Minimal placeholder: keep as a list with raw string so imports never fail.
    Replace with your richer parser when ready.
    """
    if not scripture_str:
        return []
    return [{"raw": scripture_str}]


# ----------------------------
# CSV import (robust)
# ----------------------------
EXPECTED_HEADERS = [h.lower() for h in [
    "Subject","Illustration","Application","Scripture","Source",
    "Talk Title","Talk Number","Code","Date","Date Edited"
]]


def _sniff(text: str):
    sample = text[:8192]
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
    except Exception:
        class _Simple(csv.Dialect):
            delimiter = ','
            quotechar = '"'
            escapechar = None
            doublequote = True
            skipinitialspace = True
            lineterminator = '\n'
            quoting = csv.QUOTE_MINIMAL
        dialect = _Simple
    return dialect


def _as_dictreader(text: str, dialect, fieldnames=None):
    """
    Yield rows as dicts. If fieldnames are provided, treat file as headerless.
    We also peek one row: if it looks like an actual header row, we skip it.
    """
    sio = io.StringIO(text)
    if fieldnames is None:
        reader = csv.DictReader(sio, dialect=dialect)
        for row in reader:
            yield row
        return
    # Headerless mode
    reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
    first = next(reader, None)
    if first is not None:
        # If many columns equal their header names, it's probably a header row
        matches = sum(1 for k, v in first.items() if (v or "").strip().lower() == k.strip().lower())
        if matches < 5:
            # Not a header row, yield it
            yield first
    for row in reader:
        yield row


def import_csv_bytes(b: bytes, dry_run: bool = True):
    """
    Robust import:
      - Auto-detect delimiter (comma/semicolon/tab/pipe).
      - If required headers are missing, re-parse treating file as *headerless*
        using the canonical column order.
      - Skip fully empty rows.
      - Upsert by Code (if Code present), else insert.
    Returns a report dict with counts and diagnostics.
    """
    text = b.decode("utf-8-sig", errors="replace")
    dialect = _sniff(text)

    # First attempt: use file-provided headers
    reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
    headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]

    used_headerless = False
    if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
        # Not enough expected headers -> treat as headerless/positional
        used_headerless = True
        rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
    else:
        rows_iter = (row for row in reader1)

    report = {
        "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
        "scripture_parsed": 0, "scripture_failed": 0,
        "dialect_delimiter": getattr(dialect, "delimiter", "?"),
        "used_headerless_mode": used_headerless,
        "seen_headers": headers1,
    }

    def parse_date_safe(v):
        if not v or not str(v).strip():
            return None
        try:
            return dateparser.parse(str(v)).date()
        except Exception:
            return None

    for row in rows_iter:
        report["rows"] += 1
        try:
            row_lc = {(k or "").strip().lower(): (v or "") for k, v in row.items()}
            subj = (row_lc.get("subject") or "").strip()
            illu = (row_lc.get("illustration") or "").strip()
            appl = (row_lc.get("application") or "").strip()
            scr = (row_lc.get("scripture") or "").strip()
            src = (row_lc.get("source") or "").strip()
            tt = (row_lc.get("talk title") or "").strip()
            tnum = (row_lc.get("talk number") or "").strip()
            code = (row_lc.get("code") or "").strip()
            dadd = parse_date_safe(row_lc.get("date"))
            ded = parse_date_safe(row_lc.get("date edited"))
            try:
                tnum = int(tnum) if tnum else None
            except Exception:
                tnum = None
            # Skip rows that are completely empty across all tracked fields
            if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
                report["skipped"] += 1
                continue
            data = dict(
                subject=subj, illustration=illu, application=appl,
                scripture_raw=scr, source=src, talk_number=tnum,
                talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
            )
            # Scripture parse diagnostics
            parsed_list = parse_scripture(scr)
            for it in parsed_list:
                if it: report["scripture_parsed"] += 1
                else: report["scripture_failed"] += 1

            if dry_run:
                continue

            obj = None
            if code:
                try:
                    obj = Entry.objects.get(entry_code=code)
                except Entry.DoesNotExist:
                    obj = None
            if obj:
                for k, v in data.items():
                    setattr(obj, k, v)
                obj.save()
                obj.scripture_refs.all().delete()
                report["updated"] += 1
            else:
                obj = Entry.objects.create(**data)
                report["inserted"] += 1

            for it in parsed_list:
                if it and isinstance(it, dict) and "raw" in it:
                    # Keep raw-only ref optional; skip creating ScriptureRef if schema differs
                    pass
                elif it:
                    # If you switch to a structured parser, create records like:
                    ScriptureRef.objects.create(entry=obj, **it)
        except Exception as e:
            report["skipped"] += 1
            report["errors"].append(str(e))
    return report


After (added): web/core/utils.py

import csv
import io
import re
from datetime import datetime
from typing import Dict, Any

from django.db import transaction

from core.models import Entry


# ==============================
# Helpers
# ==============================

def _decode_bytes(b: bytes) -> str:
    # Keep BOM-safe decoding
    return b.decode("utf-8-sig", errors="replace")


def _sniff_dialect(txt: str):
    try:
        return csv.Sniffer().sniff(txt[:4096], delimiters=[",", ";", "\t", "|"])
    except Exception:
        class _D: delimiter = ","
        return _D()


def _norm_header(h: str) -> str:
    """
    Normalize headers in a forgiving way:
      - lower-case
      - remove all non-alphanumerics
      - collapse spaces/underscores
    """
    if not h:
        return ""
    h = h.strip().lower()
    h = h.replace("_", " ")
    h = re.sub(r"\s+", " ", h)
    # drop everything non-alnum
    h = re.sub(r"[^a-z0-9 ]+", "", h)
    return h.replace(" ", "")


def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
    # Look up using canonical key -> original header
    for orig, can in hdr_map.items():
        if can == canon:
            v = row.get(orig, "")
            return (v or "").strip()
    return ""


def _clip(s: str, n: int) -> str:
    s = (s or "").strip()
    return s[:n] if n and len(s) > n else s


def _parse_date(s: str):
    s = (s or "").strip()
    if not s:
        return None
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"):
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None


# ==============================
# Public: import_csv_bytes
# ==============================

def import_csv_bytes(
    csv_bytes: bytes,
    dry_run: bool = False,
    *,
    # tune these if you changed model field sizes
    max_source=255,
    max_code=128,
    max_talk_number=128,
    max_talk_title=512,
    max_scripture=512,
):
    """
    Import CSV seed in an idempotent/upsert fashion.
    Mapping (case/spacing-insensitive):
      Subject, Illustration, Application, Scripture, Source,
      Talk Title, Talk Number, Code, Date, Date Edited
    """
    text = _decode_bytes(csv_bytes)
    dialect = _sniff_dialect(text)

    f = io.StringIO(text)
    rdr = csv.DictReader(f, dialect=dialect)
    seen_headers = [h.strip() for h in (rdr.fieldnames or [])]

    # Build header normalization map
    # Canonical keys we expect:
    #   subject illustration application scripture source talktitle talknumber code date dateedited
    canon_targets = {
        "subject": "subject",
        "illustration": "illustration",
        "application": "application",
        "scripture": "scripture",
        "source": "source",
        "talktitle": "talk_title",
        "title": "talk_title",
        "talknumber": "talk_number",
        "number": "talk_number",
        "code": "code",
        "date": "date",
        "dateedited": "date_edited",
        "edited": "date_edited",
    }
    header_map = {}
    for h in seen_headers:
        header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h))  # unknowns still map to their norm

    inserted = updated = skipped = 0
    errors = []
    scripture_parsed = 0

    with transaction.atomic():
        for idx, row in enumerate(rdr, start=2):  # data starts at line 2
            try:
                subject = _getv(row, header_map, "subject")
                illustration = _getv(row, header_map, "illustration")
                application = _getv(row, header_map, "application")
                scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
                source = _clip(_getv(row, header_map, "source"), max_source)
                talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
                talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
                entry_code = _clip(_getv(row, header_map, "code"), max_code)
                date_added = _parse_date(_getv(row, header_map, "date"))
                date_edited = _parse_date(_getv(row, header_map, "date_edited"))

                # Decide how to find an existing row:
                #   1) Prefer Code if present (treat as external key)
                #   2) Else fall back to (subject, illustration, application)
                obj = None
                if entry_code:
                    obj = Entry.objects.filter(entry_code=entry_code).first()
                if obj is None:
                    obj = Entry.objects.filter(
                        subject=subject, illustration=illustration, application=application
                    ).first()

                created = obj is None
                if created:
                    obj = Entry()

                # Assign fields
                obj.subject = subject
                obj.illustration = illustration
                obj.application = application
                obj.scripture_raw = scripture_raw
                obj.source = source
                obj.talk_title = talk_title
                obj.talk_number = talk_number
                obj.entry_code = entry_code
                if date_added:
                    obj.date_added = date_added
                if date_edited:
                    obj.date_edited = date_edited

                if dry_run:
                    updated += 1 if not created else 0
                    inserted += 1 if created else 0
                else:
                    obj.save()
                    if created:
                        inserted += 1
                    else:
                        updated += 1

                # (Optional) quick scripture counter — we're not parsing here,
                # but keep a metric like your previous report
                if scripture_raw:
                    scripture_parsed += 1

            except Exception as e:
                skipped += 1
                # keep error list compact
                msg = str(e)
                if "value too long for type" in msg and max(msg.count("\n"), 0) == 0:
                    errors.append("value too long for type character varying(...)")
                else:
                    errors.append(msg)

    return {
        "rows": inserted + updated + skipped,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
        "errors": errors[:200],  # cap to avoid huge output
        "scripture_parsed": scripture_parsed,
        "scripture_failed": 0,
        "dialect_delimiter": dialect.delimiter,
        "used_headerless_mode": False,
        "seen_headers": [h.lower() for h in seen_headers],
    }
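
A quick sanity check of the header normalization above: variants such as "Talk Title", "talk_title", or "Date Edited" should all collapse to the same canonical key that canon_targets maps onto a model field. A minimal sketch, assuming core.utils is importable from a Django shell:

from core.utils import _norm_header

# "Talk Title" and "talk_title" both collapse to "talktitle",
# which canon_targets maps to the model field name "talk_title".
assert _norm_header("Talk Title") == "talktitle"
assert _norm_header("talk_title") == "talktitle"
assert _norm_header("Date Edited") == "dateedited"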
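
Since the commit message refers to the import command, a management command wrapping import_csv_bytes might look like the sketch below. The command name (import_entries), its options, and the file path are assumptions for illustration; the actual command is not shown in this diff.

# web/core/management/commands/import_entries.py (hypothetical name and path)
from django.core.management.base import BaseCommand, CommandError

from core.utils import import_csv_bytes


class Command(BaseCommand):
    help = "Import entries from a CSV file (illustrative sketch)."

    def add_arguments(self, parser):
        parser.add_argument("csv_path", help="Path to the CSV file to import")
        parser.add_argument("--dry-run", action="store_true",
                            help="Count inserts/updates without saving anything")

    def handle(self, *args, **options):
        try:
            with open(options["csv_path"], "rb") as fh:
                data = fh.read()
        except OSError as e:
            raise CommandError(f"Could not read {options['csv_path']}: {e}")

        report = import_csv_bytes(data, dry_run=options["dry_run"])

        self.stdout.write(
            f"rows={report['rows']} inserted={report['inserted']} "
            f"updated={report['updated']} skipped={report['skipped']}"
        )
        for err in report["errors"][:10]:
            self.stderr.write(err)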