Update web/core/utils.py
Attempt at fixing the import command.
This commit is contained in:
parent ceb4dee084
commit 445da3523b
```diff
@@ -1,210 +1,206 @@
 import csv
 import io
 import re
-from datetime import date
-from dateutil import parser as dateparser
+from datetime import datetime
+from typing import Dict, Any
 
-from core.models import Entry, ScriptureRef
+from django.db import transaction
+
+from core.models import Entry
 
 
-# ----------------------------
-# Search helpers (needed by views)
-# ----------------------------
-# Split query into tokens while preserving quoted phrases
-_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')
-
-
-def terms(q: str):
-    out = []
-    for m in _QUOTED_OR_WORD.finditer(q or ""):
-        token = (m.group(1) or m.group(2) or "").strip()
-        if token:
-            out.append(token)
-    return out
-
-
-def has_wildcards(token: str) -> bool:
-    return "*" in token or "?" in token
-
-
-def wildcard_to_regex(token: str) -> str:
-    """
-    Convert user wildcard token to a safe regex:
-      * -> .*
-      ? -> .
-    Everything else is escaped. Suitable for Django __iregex.
-    """
-    STAR = "__STAR__"
-    QMARK = "__QMARK__"
-    s = token.replace("*", STAR).replace("?", QMARK)
-    s = re.escape(s)
-    s = s.replace(STAR, ".*").replace(QMARK, ".")
-    return s
+# ==============================
+# Helpers
+# ==============================
+
+def _decode_bytes(b: bytes) -> str:
+    # Keep BOM-safe decoding
+    return b.decode("utf-8-sig", errors="replace")
```
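The new `_decode_bytes` keeps the `utf-8-sig` codec from the old import path. Worth spelling out why: spreadsheet exports often prepend a UTF-8 byte-order mark, and without `utf-8-sig` it ends up glued to the first header name. A quick illustration, separate from the commit itself:

```python
raw = b"\xef\xbb\xbfSubject,Illustration\r\n"                # CSV bytes with a UTF-8 BOM
assert raw.decode("utf-8-sig") == "Subject,Illustration\r\n"  # BOM stripped
assert raw.decode("utf-8").startswith("\ufeff")               # plain utf-8 keeps it
```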
```diff
-# ----------------------------
-# Scripture parsing (minimal, non-blocking)
-# ----------------------------
-def parse_scripture(scripture_str: str):
-    """
-    Minimal placeholder: keep as a list with raw string so imports never fail.
-    Replace with your richer parser when ready.
-    """
-    if not scripture_str:
-        return []
-    return [{"raw": scripture_str}]
-
-
-# ----------------------------
-# CSV import (robust)
-# ----------------------------
-EXPECTED_HEADERS = [h.lower() for h in [
-    "Subject","Illustration","Application","Scripture","Source",
-    "Talk Title","Talk Number","Code","Date","Date Edited"
-]]
-
-def _sniff(text: str):
-    sample = text[:8192]
-    try:
-        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
-    except Exception:
-        class _Simple(csv.Dialect):
-            delimiter = ','
-            quotechar = '"'
-            escapechar = None
-            doublequote = True
-            skipinitialspace = True
-            lineterminator = '\n'
-            quoting = csv.QUOTE_MINIMAL
-        dialect = _Simple
-    return dialect
+def _sniff_dialect(txt: str):
+    try:
+        return csv.Sniffer().sniff(txt[:4096], delimiters=[",", ";", "\t", "|"])
+    except Exception:
+        class _D: delimiter = ","
+        return _D()
```
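The fallback `_D` stub only defines `delimiter`; when an object is passed as a dialect, the `csv` module reads the attributes it finds and uses its built-in defaults for the rest, so this is enough to get a plain comma dialect. The `Sniffer` raises `csv.Error` when it cannot settle on a delimiter (an empty upload, for instance), which is what triggers the fallback. A small check, assuming the helper above:

```python
d = _sniff_dialect("Subject;Illustration\nfaith;the sower\n")
print(d.delimiter)   # ";" - sniffed from the sample
d = _sniff_dialect("")
print(d.delimiter)   # "," - Sniffer raised, fallback stub returned
```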
```diff
-def _as_dictreader(text: str, dialect, fieldnames=None):
-    """
-    Yield rows as dicts. If fieldnames are provided, treat file as headerless.
-    We also peek one row: if it looks like an actual header row, we skip it.
-    """
-    sio = io.StringIO(text)
-    if fieldnames is None:
-        reader = csv.DictReader(sio, dialect=dialect)
-        for row in reader:
-            yield row
-        return
-    # Headerless mode
-    reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
-    first = next(reader, None)
-    if first is not None:
-        # If many columns equal their header names, it's probably a header row
-        matches = sum(1 for k, v in first.items() if (v or "").strip().lower() == k.strip().lower())
-        if matches < 5:
-            # Not a header row, yield it
-            yield first
-    for row in reader:
-        yield row
+def _norm_header(h: str) -> str:
+    """
+    Normalize headers in a forgiving way:
+    - lower-case
+    - remove all non-alphanumerics
+    - collapse spaces/underscores
+    """
+    if not h:
+        return ""
+    h = h.strip().lower()
+    h = h.replace("_", " ")
+    h = re.sub(r"\s+", " ", h)
+    # drop everything non-alnum
+    h = re.sub(r"[^a-z0-9 ]+", "", h)
+    return h.replace(" ", "")
```
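How the normalizer folds the header variants the importer is likely to meet; these follow directly from the function above:

```python
assert _norm_header("Talk Title") == "talktitle"
assert _norm_header("date_edited") == "dateedited"
assert _norm_header("  Talk  Number ") == "talknumber"
assert _norm_header("Scripture:") == "scripture"
```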
```diff
-def import_csv_bytes(b: bytes, dry_run: bool = True):
-    """
-    Robust import:
-    - Auto-detect delimiter (comma/semicolon/tab/pipe).
-    - If required headers are missing, re-parse treating file as *headerless*
-      using the canonical column order.
-    - Skip fully empty rows.
-    - Upsert by Code (if Code present), else insert.
-    Returns a report dict with counts and diagnostics.
-    """
-    text = b.decode("utf-8-sig", errors="replace")
-    dialect = _sniff(text)
-
-    # First attempt: use file-provided headers
-    reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
-    headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]
+def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
+    # Look up using canonical key -> original header
+    for orig, can in hdr_map.items():
+        if can == canon:
+            v = row.get(orig, "")
+            return (v or "").strip()
+    return ""
```
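`_getv` scans the whole header map once per field, per row. Harmless at seed-file scale, but if it ever shows up in a profile, the map can be inverted once up front. A hypothetical variant, not part of the commit:

```python
# Invert header_map once (canonical name -> original header).
inverse = {}
for orig, can in header_map.items():
    inverse.setdefault(can, orig)   # first matching column wins, as in _getv

def _getv_fast(row, canon: str) -> str:
    orig = inverse.get(canon)
    return ((row.get(orig) if orig else "") or "").strip()
```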
```diff
-    used_headerless = False
-    if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
-        # Not enough expected headers -> treat as headerless/positional
-        used_headerless = True
-        rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
-    else:
-        rows_iter = (row for row in reader1)
-
-    report = {
-        "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
-        "scripture_parsed": 0, "scripture_failed": 0,
-        "dialect_delimiter": getattr(dialect, "delimiter", "?"),
-        "used_headerless_mode": used_headerless,
-        "seen_headers": headers1,
-    }
-
-    def parse_date_safe(v):
-        if not v or not str(v).strip():
-            return None
-        try:
-            return dateparser.parse(str(v)).date()
-        except Exception:
-            return None
+def _clip(s: str, n: int) -> str:
+    s = (s or "").strip()
+    return s[:n] if n and len(s) > n else s
+
+
+def _parse_date(s: str):
+    s = (s or "").strip()
+    if not s:
+        return None
+    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"):
+        try:
+            return datetime.strptime(s, fmt).date()
+        except ValueError:
+            continue
+    return None
```
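The switch from `dateutil.parser` to an explicit `strptime` whitelist narrows what counts as a date: listed formats parse, everything else quietly becomes `None` instead of being guessed at. Illustrating the difference:

```python
from datetime import date

assert _parse_date("2024-01-31") == date(2024, 1, 31)   # in the whitelist
assert _parse_date("01/31/2024") == date(2024, 1, 31)   # in the whitelist
assert _parse_date("31 Jan 2024") is None               # dateutil used to accept this
```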
```diff
-    for row in rows_iter:
-        report["rows"] += 1
-        try:
-            row_lc = {(k or "").strip().lower(): (v or "") for k, v in row.items()}
-
-            subj = (row_lc.get("subject") or "").strip()
-            illu = (row_lc.get("illustration") or "").strip()
-            appl = (row_lc.get("application") or "").strip()
-            scr = (row_lc.get("scripture") or "").strip()
-            src = (row_lc.get("source") or "").strip()
-            tt = (row_lc.get("talk title") or "").strip()
-            tnum = (row_lc.get("talk number") or "").strip()
-            code = (row_lc.get("code") or "").strip()
-            dadd = parse_date_safe(row_lc.get("date"))
-            ded = parse_date_safe(row_lc.get("date edited"))
-
-            try:
-                tnum = int(tnum) if tnum else None
-            except Exception:
-                tnum = None
-
-            # Skip rows that are completely empty across all tracked fields
-            if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
-                report["skipped"] += 1
-                continue
-
-            data = dict(
-                subject=subj, illustration=illu, application=appl,
-                scripture_raw=scr, source=src, talk_number=tnum,
-                talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
-            )
-
-            # Scripture parse diagnostics
-            parsed_list = parse_scripture(scr)
-            for it in parsed_list:
-                if it: report["scripture_parsed"] += 1
-                else: report["scripture_failed"] += 1
+# ==============================
+# Public: import_csv_bytes
+# ==============================
+
+def import_csv_bytes(
+    csv_bytes: bytes,
+    dry_run: bool = False,
+    *,
+    # tune these if you changed model field sizes
+    max_source=255,
+    max_code=128,
+    max_talk_number=128,
+    max_talk_title=512,
+    max_scripture=512,
+):
+    """
+    Import CSV seed in an idempotent/upsert fashion.
+
+    Mapping (case/spacing-insensitive):
+      Subject, Illustration, Application, Scripture, Source,
+      Talk Title, Talk Number, Code, Date, Date Edited
+    """
```
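Note the signature change: the old function defaulted to `dry_run=True`, the new one defaults to `False`, and the size caps are keyword-only so truncation limits are explicit at the call site. For example, assuming `data` holds the raw CSV bytes:

```python
report = import_csv_bytes(data, dry_run=True, max_source=500)
```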
```diff
+    text = _decode_bytes(csv_bytes)
+    dialect = _sniff_dialect(text)
+    f = io.StringIO(text)
+    rdr = csv.DictReader(f, dialect=dialect)
+
+    seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
+
+    # Build header normalization map
+    # Canonical keys we expect:
+    #   subject illustration application scripture source talktitle talknumber code date dateedited
+    canon_targets = {
+        "subject": "subject",
+        "illustration": "illustration",
+        "application": "application",
+        "scripture": "scripture",
+        "source": "source",
+        "talktitle": "talk_title",
+        "title": "talk_title",
+        "talknumber": "talk_number",
+        "number": "talk_number",
+        "code": "code",
+        "date": "date",
+        "dateedited": "date_edited",
+        "edited": "date_edited",
+    }
+    header_map = {}
+    for h in seen_headers:
+        header_map[h] = canon_targets.get(_norm_header(h), _norm_header(h))  # unknowns still map to their norm
```
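So for a hypothetical header row of `Talk Title,Date Edited,Notes`, the loop above would build:

```python
# header_map == {
#     "Talk Title": "talk_title",    # known canonical target
#     "Date Edited": "date_edited",  # known canonical target
#     "Notes": "notes",              # unknown: maps to its own normalized form
# }
```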
```diff
+    inserted = updated = skipped = 0
+    errors = []
+    scripture_parsed = 0
+
+    with transaction.atomic():
+        for idx, row in enumerate(rdr, start=2):  # data starts at line 2
+            try:
+                subject = _getv(row, header_map, "subject")
+                illustration = _getv(row, header_map, "illustration")
+                application = _getv(row, header_map, "application")
+
+                scripture_raw = _clip(_getv(row, header_map, "scripture"), max_scripture)
+                source = _clip(_getv(row, header_map, "source"), max_source)
+                talk_title = _clip(_getv(row, header_map, "talk_title"), max_talk_title)
+                talk_number = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
+                entry_code = _clip(_getv(row, header_map, "code"), max_code)
+
+                date_added = _parse_date(_getv(row, header_map, "date"))
+                date_edited = _parse_date(_getv(row, header_map, "date_edited"))
+
+                # Decide how to find an existing row:
+                # 1) Prefer Code if present (treat as external key)
+                # 2) Else fall back to (subject, illustration, application)
+                obj = None
+                if entry_code:
+                    obj = Entry.objects.filter(entry_code=entry_code).first()
+                if obj is None:
+                    obj = Entry.objects.filter(
+                        subject=subject, illustration=illustration, application=application
+                    ).first()
+
+                created = obj is None
+                if created:
+                    obj = Entry()
+
+                # Assign fields
+                obj.subject = subject
+                obj.illustration = illustration
+                obj.application = application
+                obj.scripture_raw = scripture_raw
+                obj.source = source
+                obj.talk_title = talk_title
+                obj.talk_number = talk_number
+                obj.entry_code = entry_code
+                if date_added:
+                    obj.date_added = date_added
+                if date_edited:
+                    obj.date_edited = date_edited
```
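The two-step lookup (Code first, then the subject/illustration/application triple) is what keeps re-running the import idempotent even when Code is blank. If every row carried a unique, non-empty Code, a hypothetical alternative would be Django's `update_or_create`; the blank-Code fallback is why the commit does it by hand instead:

```python
# Hypothetical: only valid if entry_code is declared unique and never empty.
obj, created = Entry.objects.update_or_create(
    entry_code=entry_code,
    defaults={
        "subject": subject,
        "illustration": illustration,
        "application": application,
        "scripture_raw": scripture_raw,
        "source": source,
        "talk_title": talk_title,
        "talk_number": talk_number,
    },
)
```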
```diff
-            if dry_run:
-                continue
-
-            obj = None
-            if code:
-                try:
-                    obj = Entry.objects.get(entry_code=code)
-                except Entry.DoesNotExist:
-                    obj = None
-
-            if obj:
-                for k, v in data.items():
-                    setattr(obj, k, v)
-                obj.save()
-                obj.scripture_refs.all().delete()
-                report["updated"] += 1
-            else:
-                obj = Entry.objects.create(**data)
-                report["inserted"] += 1
-
-            for it in parsed_list:
-                if it and isinstance(it, dict) and "raw" in it:
-                    # Keep raw-only ref optional; skip creating ScriptureRef if schema differs
-                    pass
-                elif it:
-                    # If you switch to a structured parser, create records like:
-                    ScriptureRef.objects.create(entry=obj, **it)
-
-        except Exception as e:
-            report["skipped"] += 1
-            report["errors"].append(str(e))
-
-    return report
+                if dry_run:
+                    updated += 1 if not created else 0
+                    inserted += 1 if created else 0
+                else:
+                    obj.save()
+                    if created:
+                        inserted += 1
+                    else:
+                        updated += 1
+
+                # (Optional) quick scripture counter - we're not parsing here,
+                # but keep a metric like your previous report
+                if scripture_raw:
+                    scripture_parsed += 1
+
+            except Exception as e:
+                skipped += 1
+                # keep error list compact
+                msg = str(e)
+                if "value too long for type" in msg and max(msg.count("\n"), 0) == 0:
+                    errors.append("value too long for type character varying(...)")
+                else:
+                    errors.append(msg)
+
+    return {
+        "rows": inserted + updated + skipped,
+        "inserted": inserted,
+        "updated": updated,
+        "skipped": skipped,
+        "errors": errors[:200],  # cap to avoid huge output
+        "scripture_parsed": scripture_parsed,
+        "scripture_failed": 0,
+        "dialect_delimiter": dialect.delimiter,
+        "used_headerless_mode": False,
+        "seen_headers": [h.lower() for h in seen_headers],
+    }
```
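End to end, a caller gets the same report shape as before (with `used_headerless_mode` now always `False`, since the positional fallback was dropped). A minimal dry-run invocation, assuming a seed file on disk (hypothetical caller, e.g. a management command):

```python
with open("seed.csv", "rb") as fh:
    report = import_csv_bytes(fh.read(), dry_run=True)

print(f"{report['rows']} rows: {report['inserted']} new, "
      f"{report['updated']} updated, {report['skipped']} skipped")
if report["errors"]:
    print("first error:", report["errors"][0])
```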