Update web/core/utils.py

attempt at fixing import command
Joshua Laymon 2025-08-13 14:55:47 +00:00
parent ceb4dee084
commit 445da3523b

@@ -1,210 +1,206 @@
import csv
import io
import re
from datetime import datetime
from typing import Any, Dict

from django.db import transaction

from core.models import Entry

# ----------------------------
# Search helpers (needed by views)
# ----------------------------

# Split query into tokens while preserving quoted phrases.
_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')


def terms(q: str):
    out = []
    for m in _QUOTED_OR_WORD.finditer(q or ""):
        token = (m.group(1) or m.group(2) or "").strip()
        if token:
            out.append(token)
    return out
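
# Illustrative behavior of terms(), derived from the regex above:
#     terms('red sea "burning bush" exodus')
#     -> ['red', 'sea', 'burning bush', 'exodus']
# Quoted phrases survive as single tokens; bare words split on whitespace.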

# ==============================
# Helpers
# ==============================

def has_wildcards(token: str) -> bool:
    return "*" in token or "?" in token


def _decode_bytes(b: bytes) -> str:
    # Keep BOM-safe decoding.
    return b.decode("utf-8-sig", errors="replace")


def wildcard_to_regex(token: str) -> str:
    """
    Convert a user wildcard token to a safe regex:
        * -> .*
        ? -> .
    Everything else is escaped. Suitable for Django __iregex.
    """
    STAR = "__STAR__"
    QMARK = "__QMARK__"
    s = token.replace("*", STAR).replace("?", QMARK)
    s = re.escape(s)
    s = s.replace(STAR, ".*").replace(QMARK, ".")
    return s
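
# Hypothetical sketch of how a view could combine these helpers; the field
# name "illustration" is an assumption for illustration, not dictated here:
#
#     from django.db.models import Q
#
#     def entry_filter(query: str):
#         q = Q()
#         for tok in terms(query):
#             if has_wildcards(tok):
#                 q &= Q(illustration__iregex=wildcard_to_regex(tok))
#             else:
#                 q &= Q(illustration__icontains=tok)
#         return Entry.objects.filter(q)
#
#     wildcard_to_regex("mose*")  -> 'mose.*'  (matches "moses", "mosey", ...)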

# ----------------------------
# Scripture parsing (minimal, non-blocking)
# ----------------------------

def parse_scripture(scripture_str: str):
    """
    Minimal placeholder: return a single-item list holding the raw string so
    imports never fail. Replace with a richer parser when ready.
    """
    if not scripture_str:
        return []
    return [{"raw": scripture_str}]
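
# Placeholder behavior, verifiable from the body above:
#     parse_scripture("Gen 1:1-3")  -> [{"raw": "Gen 1:1-3"}]
#     parse_scripture("")           -> []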

# ----------------------------
# CSV import (robust)
# ----------------------------

def _sniff_dialect(txt: str):
    # Auto-detect the delimiter (comma/semicolon/tab/pipe); fall back to a
    # comma-delimited stub if the sniffer cannot decide.
    try:
        return csv.Sniffer().sniff(txt[:4096], delimiters=",;\t|")
    except Exception:
        class _D:
            delimiter = ","
        return _D()
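
# Illustrative checks (assuming csv.Sniffer picks the dominant separator):
#     _sniff_dialect("a;b;c\n1;2;3\n").delimiter      -> ';'
#     _sniff_dialect("no separators here").delimiter  -> ','  (fallback stub)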

def _norm_header(h: str) -> str:
    """
    Normalize headers in a forgiving way:
      - lower-case
      - collapse spaces/underscores
      - remove all remaining non-alphanumerics
    """
    if not h:
        return ""
    h = h.strip().lower()
    h = h.replace("_", " ")
    h = re.sub(r"\s+", " ", h)
    # Drop everything non-alphanumeric (spaces survive until the final join).
    h = re.sub(r"[^a-z0-9 ]+", "", h)
    return h.replace(" ", "")
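
# All of these collapse to the same canonical key:
#     _norm_header("Talk Title")   -> 'talktitle'
#     _norm_header("talk_title ")  -> 'talktitle'
#     _norm_header("TALK-TITLE")   -> 'talktitle'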

def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
    # Look up a value via canonical key -> original header.
    for orig, can in hdr_map.items():
        if can == canon:
            v = row.get(orig, "")
            return (v or "").strip()
    return ""


def _clip(s: str, n: int) -> str:
    s = (s or "").strip()
    return s[:n] if n and len(s) > n else s


def _parse_date(s: str):
    s = (s or "").strip()
    if not s:
        return None
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"):
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            continue
    return None
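
# Formats accepted by _parse_date (anything else yields None):
#     _parse_date("2025-08-13")   -> datetime.date(2025, 8, 13)
#     _parse_date("8/13/25")      -> datetime.date(2025, 8, 13)
#     _parse_date("13 Aug 2025")  -> None  (not in the format list)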

# ==============================
# Public: import_csv_bytes
# ==============================

def import_csv_bytes(
    csv_bytes: bytes,
    dry_run: bool = False,
    *,
    # Tune these if you changed the model field sizes.
    max_source=255,
    max_code=128,
    max_talk_number=128,
    max_talk_title=512,
    max_scripture=512,
):
    """
    Import a CSV seed in an idempotent/upsert fashion.

    Mapping (case/spacing-insensitive):
        Subject, Illustration, Application, Scripture, Source,
        Talk Title, Talk Number, Code, Date, Date Edited

    Rows are matched by Code when present, else by the
    (subject, illustration, application) triple. Returns a report dict
    with counts and diagnostics.
    """
    text = _decode_bytes(csv_bytes)
    dialect = _sniff_dialect(text)

    f = io.StringIO(text)
    rdr = csv.DictReader(f, dialect=dialect)
    seen_headers = [h.strip() for h in (rdr.fieldnames or [])]

    # Build the header normalization map. Canonical keys we expect:
    #   subject illustration application scripture source
    #   talktitle talknumber code date dateedited
    canon_targets = {
        "subject": "subject",
        "illustration": "illustration",
        "application": "application",
        "scripture": "scripture",
        "source": "source",
        "talktitle": "talk_title",
        "title": "talk_title",
        "talknumber": "talk_number",
        "number": "talk_number",
        "code": "code",
        "date": "date",
        "dateedited": "date_edited",
        "edited": "date_edited",
    }
    header_map = {}
    for h in seen_headers:
        # Unknown headers still map to their normalized form.
        header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h))
    inserted = updated = skipped = 0
    errors = []
    scripture_parsed = 0

    with transaction.atomic():
        for idx, row in enumerate(rdr, start=2):  # data starts at line 2
            try:
                # Per-row savepoint: without it, a database error on one row
                # aborts the outer transaction and poisons every row after it.
                with transaction.atomic():
                    subject = _getv(row, header_map, "subject")
                    illustration = _getv(row, header_map, "illustration")
                    application = _getv(row, header_map, "application")
                    scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
                    source = _clip(_getv(row, header_map, "source"), max_source)
                    talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
                    talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
                    entry_code = _clip(_getv(row, header_map, "code"), max_code)
                    date_added = _parse_date(_getv(row, header_map, "date"))
                    date_edited = _parse_date(_getv(row, header_map, "date_edited"))

                    # Skip rows that are completely empty across all tracked fields.
                    if not any([subject, illustration, application, scripture_raw,
                                source, talk_title, talk_number, entry_code,
                                date_added, date_edited]):
                        skipped += 1
                        continue

                    # Decide how to find an existing row:
                    #   1) Prefer Code if present (treat it as an external key).
                    #   2) Else fall back to (subject, illustration, application).
                    obj = None
                    if entry_code:
                        obj = Entry.objects.filter(entry_code=entry_code).first()
                    if obj is None:
                        obj = Entry.objects.filter(
                            subject=subject,
                            illustration=illustration,
                            application=application,
                        ).first()

                    created = obj is None
                    if created:
                        obj = Entry()

                    # Assign fields.
                    obj.subject = subject
                    obj.illustration = illustration
                    obj.application = application
                    obj.scripture_raw = scripture_raw
                    obj.source = source
                    obj.talk_title = talk_title
                    obj.talk_number = talk_number
                    obj.entry_code = entry_code
                    if date_added:
                        obj.date_added = date_added
                    if date_edited:
                        obj.date_edited = date_edited

                    if dry_run:
                        # Count what would happen, but write nothing.
                        inserted += 1 if created else 0
                        updated += 1 if not created else 0
                    else:
                        obj.save()
                        if created:
                            inserted += 1
                        else:
                            updated += 1

                    # (Optional) quick scripture counter; we are not parsing
                    # references here, just keeping a metric like the old report.
                    if scripture_raw:
                        scripture_parsed += 1
            except Exception as e:
                skipped += 1
                # Keep the error list compact: collapse Postgres length errors,
                # which would otherwise repeat verbatim for every oversized row.
                msg = str(e)
                if "value too long for type" in msg and msg.count("\n") == 0:
                    msg = "value too long for type character varying(...)"
                errors.append(f"line {idx}: {msg}")
    return {
        "rows": inserted + updated + skipped,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
        "errors": errors[:200],  # cap to avoid huge output
        "scripture_parsed": scripture_parsed,
        "scripture_failed": 0,  # no structured parsing in this version
        "dialect_delimiter": dialect.delimiter,
        "used_headerless_mode": False,  # retained for report-shape compatibility
        "seen_headers": [h.lower() for h in seen_headers],
    }
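
# Hypothetical call site (sketch only; the file name and surrounding command
# are assumptions, not part of this module):
#
#     with open("seed.csv", "rb") as fh:
#         report = import_csv_bytes(fh.read(), dry_run=True)
#     print(report["inserted"], report["updated"], report["skipped"])
#
# Run once with dry_run=True to validate counts and surface errors, then
# again with dry_run=False to persist. The outer transaction.atomic() rolls
# back the whole batch if an error escapes the per-row handler.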