Update web/core/utils.py

Joshua Laymon 2025-08-22 11:18:09 +00:00
parent f64d41313e
commit a1b174adc3


@@ -11,6 +11,42 @@ from django.db import transaction, IntegrityError, DataError, DatabaseError
from .models import Entry
EXPECTED_HEADERS: List[str] = [
    "Subject", "Illustration", "Application", "Scripture", "Source",
    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
]

# Map CSV header labels -> Entry model field names
HEADER_MAP: Dict[str, str] = {
    "Subject": "subject",
    "Illustration": "illustration",
    "Application": "application",
    "Scripture": "scripture_raw",
    "Source": "source",
    "Talk Title": "talk_title",
    "Talk Number": "talk_number",
    "Code": "entry_code",
    "Date": "date_added",
    "Date Edited": "date_edited",
}

# Accept both the pretty labels *and* the actual model field names
# (lets you import older dumps or hand-made files)
ACCEPTABLE_HEADERS: Dict[str, str] = {
    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},
    # direct model names also OK
    "subject": "subject",
    "illustration": "illustration",
    "application": "application",
    "scripture_raw": "scripture_raw",
    "source": "source",
    "talk_title": "talk_title",
    "talk_number": "talk_number",
    "entry_code": "entry_code",
    "date_added": "date_added",
    "date_edited": "date_edited",
}
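
# Illustrative check (editor's sketch, not part of this commit): with the
# mapping above, a human-readable label and its raw model field name both
# resolve to the same Entry field.
assert ACCEPTABLE_HEADERS["talk title"] == "talk_title"
assert ACCEPTABLE_HEADERS["talk_title"] == "talk_title"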
# ============================
# Search helpers (used by views)
@@ -215,149 +251,6 @@ def _coerce_int(val: str):
    return None
def import_csv_bytes(b: bytes, dry_run: bool = False, commit_every: int = 500) -> Dict[str, object]:
    """
    Robust CSV import. Commits each row in its own transaction so that one bad
    row does not poison the entire import (avoids TransactionManagementError cascades).
    Returns a report dict with counts and first-line error messages.
    """
    text = _decode_bytes(b)
    dialect = _sniff_dialect(text)
    delimiter = getattr(dialect, "delimiter", ",")

    # --- headers ---
    f = io.StringIO(text)
    reader = csv.reader(f, dialect=dialect)
    try:
        raw_headers = next(reader)
    except StopIteration:
        return {
            "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
            "scripture_parsed": 0, "scripture_failed": 0,
            "dialect_delimiter": delimiter, "used_headerless_mode": False,
            "seen_headers": []
        }

    headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient(
        ",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
    )
    header_map = _build_header_map(headers)

    # Pair raw lines so we can repair rows mis-split by csv
    raw_lines = text.splitlines()[1:]  # skip header
    dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect)
    next(dict_reader, None)  # skip header

    total = inserted = updated = skipped = 0
    errors: List[str] = []
    scripture_ok = scripture_bad = 0

    # Import loop (row-by-row atomic)
    for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2):
        total += 1
        # Repair if DictReader got the wrong shape (inconsistent quotes in source)
        if len(row) != EXPECTED_COLS or None in row:
            cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS)
            row = dict(zip(headers, cells))

        # Extract canonical fields
        subject = _getv(row, header_map, "subject").strip()
        illustration = _getv(row, header_map, "illustration").strip()
        application = _getv(row, header_map, "application").strip()
        scripture = _getv(row, header_map, "scripture").strip()
        source = _getv(row, header_map, "source").strip()
        talk_title = _getv(row, header_map, "talk title").strip()
        talk_number = _coerce_int(_getv(row, header_map, "talk number"))
        entry_code = _getv(row, header_map, "code").strip()
        date_added = _parse_date(_getv(row, header_map, "date"))
        date_edited = _parse_date(_getv(row, header_map, "date edited"))

        # Skip rows with no meaningful text
        if not (subject or illustration or application):
            skipped += 1
            continue

        # Clip to DB lengths
        subject = _clip("subject", subject)
        illustration = _clip("illustration", illustration)
        application = _clip("application", application)
        scripture = _clip("scripture_raw", scripture)
        source = _clip("source", source)
        talk_title = _clip("talk_title", talk_title)
        entry_code = _clip("entry_code", entry_code)

        scripture_ok += 1 if scripture else 0
        scripture_bad += 0 if scripture else 1

        # Upsert key: prefer entry_code; else (subject + illustration)
        lookup: Dict[str, object] = {}
        if entry_code:
            lookup["entry_code"] = entry_code
        else:
            lookup["subject"] = subject
            lookup["illustration"] = illustration

        if dry_run:
            exists = Entry.objects.filter(**lookup).exists()
            inserted += 0 if exists else 1
            updated += 1 if exists else 0
            continue

        try:
            # Isolate each row so a failure rolls back only that row
            with transaction.atomic():
                obj = Entry.objects.filter(**lookup).first()
                created = False
                if not obj:
                    obj = Entry(**lookup)
                    created = True
                obj.subject = subject
                obj.illustration = illustration
                obj.application = application
                obj.scripture_raw = scripture
                obj.source = source
                obj.talk_title = talk_title
                obj.talk_number = talk_number
                if entry_code:
                    obj.entry_code = entry_code
                if date_added:
                    obj.date_added = date_added
                if date_edited:
                    obj.date_edited = date_edited
                obj.save()
            inserted += 1 if created else 0
            updated += 0 if created else 1
        except (IntegrityError, DataError, DatabaseError, ValueError) as e:
            msg = str(e).splitlines()[0]
            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
            skipped += 1
            # continue to next row

    return {
        "rows": total,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
        "errors": errors,
        "scripture_parsed": scripture_ok,
        "scripture_failed": scripture_bad,
        "dialect_delimiter": delimiter,
        "used_headerless_mode": False,
        "seen_headers": headers,
    }
EXPECTED_HEADERS = [
    "Subject","Illustration","Application","Scripture","Source",
    "Talk Title","Talk Number","Code","Date","Date Edited"
]
def _to_int_or_none(s: str) -> Optional[int]:
    s = (s or "").strip()
    if not s:
@@ -378,96 +271,224 @@ def _to_date_or_none(s: str) -> Optional[datetime.date]:
        pass
    return None  # let caller decide if this is acceptable
def import_csv_bytes(data: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
    """
    Robust CSV importer for Entries.
    - data: raw bytes of the uploaded file
    - dry_run: when True, do not write to DB; return preview + errors
    - batch_size: bulk_create chunk size
    Returns: dict(report=..., rows=preview_rows, errors=[...])
    """
    text = io.TextIOWrapper(io.BytesIO(data), encoding="utf-8-sig", newline="")
    reader = csv.reader(text)

    # Read header row
    try:
        header = next(reader)
    except StopIteration:
        return {"report": "Empty file.", "rows": [], "errors": ["File is empty."]}

    # Loose header check: either exact match, or map by index if close
    header_norm = [h.strip() for h in header]
    if header_norm != EXPECTED_HEADERS:
        return {
            "report": "Header mismatch.",
            "rows": [],
            "errors": [
                "Expected header: " + ", ".join(EXPECTED_HEADERS),
                "Found header: " + ", ".join(header_norm),
            ],
        }

    to_create: List[Entry] = []
    errors: List[str] = []
    preview: List[Tuple[int, Dict[str, Any]]] = []  # first 100 rows for the UI
    rownum = 1

    def make_entry(row: List[str]) -> Optional[Entry]:
        # force length to 10, padding if needed
        padded = row + [""] * (10 - len(row))
        subj, ill, app, scr, src, talk_title, talk_num, code, d_added, d_edited = padded[:10]
        e = Entry(
            subject=(subj or "").strip(),
            illustration=(ill or "").strip(),
            application=(app or "").strip(),
            scripture_raw=(scr or "").strip(),
            source=(src or "").strip(),
            talk_title=(talk_title or "").strip(),
            talk_number=_to_int_or_none(talk_num),
            entry_code=(code or "").strip(),
            date_added=_to_date_or_none(d_added),
            date_edited=_to_date_or_none(d_edited),
        )
        return e

    created_total = 0
    with (transaction.atomic() if not dry_run else _noop_context()):
        for row in reader:
            rownum += 1
            try:
                e = make_entry(row)
                # (optional) add required-field checks; e.g., at least one of illustration/application
                if not ((e.illustration and e.illustration.strip()) or (e.application and e.application.strip())):
                    errors.append(f"Row {rownum}: missing Illustration and Application")
                    continue
                to_create.append(e)
                if len(preview) < 100:
                    preview.append((rownum, {
                        "Subject": e.subject, "Illustration": e.illustration[:120],
                        "Application": e.application[:120], "Scripture": e.scripture_raw,
                        "Source": e.source, "Talk Title": e.talk_title,
                        "Talk Number": e.talk_number, "Code": e.entry_code,
                        "Date": e.date_added, "Date Edited": e.date_edited,
                    }))
                if not dry_run and len(to_create) >= batch_size:
                    Entry.objects.bulk_create(to_create, batch_size=batch_size)
                    created_total += len(to_create)
                    to_create.clear()
            except Exception as ex:
                errors.append(f"Row {rownum}: {ex}")

    if not dry_run and to_create:
        Entry.objects.bulk_create(to_create, batch_size=batch_size)
        created_total += len(to_create)
        to_create.clear()

    report = f"{'Would import' if dry_run else 'Imported'} {created_total if not dry_run else len(preview)}+ rows."
    return {"report": report, "rows": preview, "errors": errors}

def _clean_header_token(s: Any) -> str:
    """
    Make a header token safe/normalized:
    - None -> ""
    - trim spaces
    - strip surrounding single/double quotes
    - drop weird prefixes like r:"Talk Title" or r.'Talk Title'
    - lowercase for matching
    """
    s = "" if s is None else str(s)
    s = s.strip()
    # strip surrounding quotes
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        s = s[1:-1]
    # drop r: or r. prefix some CSV tools add
    if s[:2].lower() in ("r:", "r."):
        s = s[2:].lstrip()
    return s.strip().lower()
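
# Illustrative checks (editor's sketch, not part of this commit): how a few
# messy header tokens normalize under _clean_header_token.
assert _clean_header_token('"Talk Title"') == "talk title"
assert _clean_header_token("r:Subject") == "subject"
assert _clean_header_token(None) == ""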
_DATE_FORMATS = (
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%m/%d/%y",
    "%d-%b-%Y",  # 05-Sep-2024
    "%Y/%m/%d",
)

def _parse_date(val: str) -> Optional[datetime.date]:
    if not val:
        return None
    txt = str(val).strip()
    # Accept ISO-like with time: 2024-01-02T00:00:00
    if "T" in txt:
        try:
            return datetime.fromisoformat(txt).date()
        except Exception:
            pass
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(txt, fmt).date()
        except Exception:
            continue
    # as a last resort, try only year-month-day pieces
    try:
        parts = [int(p) for p in txt.replace("/", "-").split("-")]
        if len(parts) >= 3:
            return datetime(parts[0], parts[1], parts[2]).date()
    except Exception:
        pass
    return None
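
# Illustrative checks (editor's sketch, not part of this commit): the listed
# formats all normalize to the same date object.
assert _parse_date("2024-09-05") == _parse_date("09/05/2024") == _parse_date("05-Sep-2024")
assert _parse_date("") is None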
def _to_int_or_none(v: Any) -> Optional[int]:
    if v is None:
        return None
    s = str(v).strip()
    if s == "":
        return None
    try:
        return int(float(s))  # tolerate "123.0"
    except Exception:
        return None
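
# Illustrative checks (editor's sketch, not part of this commit):
assert _to_int_or_none("123.0") == 123
assert _to_int_or_none("") is None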
def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
    """
    Robust CSV importer for Entry.
    - Accepts your human-readable header (Subject, Illustration, ...)
      and/or direct model field names.
    - Normalizes odd headers like r."Talk Title".
    - Handles BOM & dialect sniffing.
    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
    """
    report: Dict[str, Any] = {
        "ok": False,
        "created": 0,
        "updated": 0,
        "skipped": 0,
        "errors": [],    # list[str]
        "preview": [],   # first ~10 rows that would be imported
        "total_rows": 0,
        "header": [],
    }

    # --- decode safely (remove BOM, keep unknowns) ---
    text = data.decode("utf-8-sig", errors="replace")

    # --- sniff dialect; fall back to excel ---
    try:
        sample = "\n".join(text.splitlines()[:10])
        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
    except Exception:
        dialect = csv.excel

    rdr = csv.reader(io.StringIO(text), dialect)
    try:
        raw_header = next(rdr, [])
    except Exception as e:
        report["errors"].append(f"Failed reading header: {e}")
        return report

    # Clean & map header
    cleaned = [_clean_header_token(h) for h in raw_header]
    mapped: List[str] = []
    unknowns: List[str] = []
    for token in cleaned:
        target = ACCEPTABLE_HEADERS.get(token)
        if target:
            mapped.append(target)
        else:
            unknowns.append(token or "(empty)")

    # If none of the header tokens are recognized but the column count matches
    # the expected header, assume there is *no* header row; inject the expected
    # header so downstream works.
    has_header = True
    if unknowns:
        # Heuristic: if the number of columns equals EXPECTED_HEADERS and *none*
        # of the cleaned tokens map, it's probably a data row (no header)
        matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS)
        if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS):
            # inject expected header and re-run
            has_header = False
            mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS]
            # rebuild a reader with the expected header injected
            sio = io.StringIO(text)
            rdr_tmp = csv.reader(sio, dialect)
            rows = list(rdr_tmp)
            rows.insert(0, EXPECTED_HEADERS)  # inject pretty header for report
            rdr = iter(rows)   # consume from this list iterator
            next(rdr, None)    # skip our injected header
        else:
            # keep going but warn in the report
            report["errors"].append(
                "Some header columns were not recognized: "
                + ", ".join(unknowns)
                + " (continuing with best-effort mapping)"
            )

    report["header"] = mapped

    # Read rows
    rows = list(rdr)
    report["total_rows"] = len(rows)

    # Build row dicts
    def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
        """
        Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
        but does not save to DB.
        """
        if len(row) < len(mapped):
            return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
        values: Dict[str, Any] = {}
        for i, field in enumerate(mapped):
            raw_val = row[i] if i < len(row) else ""
            # Coerce types for specific fields
            if field in ("date_added", "date_edited"):
                values[field] = _parse_date(raw_val)
            elif field == "talk_number":
                values[field] = _to_int_or_none(raw_val)
            else:
                values[field] = (raw_val or "").strip()
        # Create (unsaved) Entry instance for preview/validation
        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
        return e, values, None

    # Preview first few
    for i, row in enumerate(rows[:10], start=1):
        e, values, err = row_to_obj(i, row)
        report["preview"].append({
            "row": i,
            "values": values if values else {},
            "error": err,
        })

    if dry_run:
        # Dry run: don't write, just validate basic structure
        bad = [p for p in report["preview"] if p["error"]]
        if bad:
            report["errors"].extend(p["error"] for p in bad if p["error"])
        report["ok"] = len(report["errors"]) == 0
        return report

    # Real import (create new rows).
    # If you want update/merge behavior, add a key strategy here.
    created = 0
    updated = 0
    skipped = 0
    errors: List[str] = []

    with transaction.atomic():
        for idx, row in enumerate(rows, start=1):
            e, values, err = row_to_obj(idx, row)
            if err:
                errors.append(err)
                skipped += 1
                continue
            try:
                # Simple create-only behavior:
                Entry.objects.create(**values)
                created += 1
            except Exception as ex:
                errors.append(f"Row {idx}: failed to save ({ex})")
                skipped += 1

    report.update({
        "ok": len(errors) == 0,
        "created": created,
        "updated": updated,
        "skipped": skipped,
        "errors": errors,
    })
    return report
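
# Illustrative usage (editor's sketch, not part of this commit): a dry-run
# import of a small in-memory CSV using the pretty header labels.
_sample = (
    "Subject,Illustration,Application,Scripture,Source,"
    "Talk Title,Talk Number,Code,Date,Date Edited\n"
    "Patience,Farmer waiting for rain,Keep enduring,Jas 5:7,Example source,"
    "Exercise Patience,12,P-001,2024-01-02,2024-02-03\n"
).encode("utf-8")
_report = import_csv_bytes(_sample, dry_run=True)
# _report["ok"] is True; _report["preview"][0]["values"]["talk_number"] == 12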
# small context manager used above
class _noop_context:
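# Editor's note: the class body is cut off in this view. A typical no-op
# context manager of this shape (assumed, not the author's exact code) is:
#     class _noop_context:
#         def __enter__(self):
#             return self
#         def __exit__(self, exc_type, exc, tb):
#             return False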