Update web/core/utils.py

Joshua Laymon 2025-08-22 13:57:58 +00:00
parent cc6461ced2
commit 17c9a89848


@@ -336,158 +336,181 @@ def _to_int_or_none(v: Any) -> Optional[int]:
    return None


import csv
import io
from datetime import datetime
from typing import Optional, List, Dict, Any

from django.db import transaction

from .models import Entry


# Canonical header order expected from the CSV (and shown in the UI)
EXPECTED_HEADERS = [
    "Subject", "Illustration", "Application", "Scripture", "Source",
    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
]


def _clean_header_cell(s: str) -> str:
    if s is None:
        return ""
    s = str(s).strip()
    # Handle odd prefixes like r:"Talk Title"
    low = s.lower()
    if low.startswith("r:") or low.startswith("r="):
        s = s[2:].lstrip()
    # Strip wrapping quotes
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
        s = s[1:-1]
    return s.strip()


def _parse_int(x: str) -> Optional[int]:
    x = (x or "").strip()
    if not x:
        return None
    try:
        return int(x)
    except Exception:
        return None


def _parse_date(x: str):
    """
    Returns a date object or None.
    Tries several common formats, then ISO.
    """
    x = (x or "").strip()
    if not x:
        return None
    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(x, fmt).date()
        except Exception:
            pass
    try:
        return datetime.fromisoformat(x).date()
    except Exception:
        return None


def import_csv_bytes(content: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
    """
    Parse the uploaded CSV (bytes), optionally write to the DB.

    Returns a report dict the templates expect:
        {
            "total": <int>,
            "created": <int>,
            "updated": 0,
            "skipped": <int>,
            "errors": [ ... ],
            "preview": [ [cell, ...], ... up to 10 rows ],
            "columns": EXPECTED_HEADERS,
        }

    Notes:
    - This implementation always CREATES new rows (no dedupe).
      If you want upserts later, we can key on entry_code or (talk_number, entry_code).
    """
    report = {
        "total": 0,
        "created": 0,
        "updated": 0,
        "skipped": 0,
        "errors": [],
        "preview": [],
        "columns": EXPECTED_HEADERS[:],
    }

    # Decode once (BOM-safe), sniff the dialect, fall back to excel
    text = content.decode("utf-8-sig", errors="replace")
    try:
        first_line = text.splitlines()[0] if text else ""
        dialect = csv.Sniffer().sniff(first_line) if first_line else csv.excel
    except Exception:
        dialect = csv.excel

    rows = list(csv.reader(io.StringIO(text), dialect))
    if not rows:
        return report  # empty file

    # Header handling (tolerant)
    first = rows[0]
    norm_first = [_clean_header_cell(c).lower() for c in first]
    expected_norm = [h.lower() for h in EXPECTED_HEADERS]
    header_ok = (norm_first == expected_norm)

    if header_ok:
        data_rows = rows[1:]
    else:
        # If the first row isn't a header but the column count matches, treat it as data
        if len(first) == len(EXPECTED_HEADERS):
            data_rows = rows  # treat all rows as data; we'll use the EXPECTED order
        else:
            # Try common alternate delimiters to recover
            for delim in (";", "\t"):
                rows2 = list(csv.reader(io.StringIO(text), delimiter=delim))
                if rows2 and len(rows2[0]) == len(EXPECTED_HEADERS):
                    rows = rows2
                    first = rows[0]
                    norm_first = [_clean_header_cell(c).lower() for c in first]
                    header_ok = (norm_first == expected_norm)
                    data_rows = rows[1:] if header_ok else rows
                    break
            else:
                # Could not reconcile columns; report the mismatch and bail out
                report["errors"].append(
                    f"Column mismatch: saw {len(first)} columns but expected {len(EXPECTED_HEADERS)}."
                )
                return report
report["header"] = mapped
# Read rows
rows = list(rdr)
report["total_rows"] = len(rows)
# Build row dicts
def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
"""
Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
but does not save to DB.
"""
if len(row) < len(mapped):
return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
values: Dict[str, Any] = {}
for i, field in enumerate(mapped):
raw_val = row[i] if i < len(row) else ""
# Coerce types for specific fields
if field in ("date_added", "date_edited"):
values[field] = _parse_date(raw_val)
elif field == "talk_number":
values[field] = _to_int_or_none(raw_val)
else:
values[field] = (raw_val or "").strip()
# Create (unsaved) Entry instance for preview/validation
e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
return e, values, None
# Preview first few
for i, row in enumerate(rows[:10], start=1):
e, values, err = row_to_obj(i, row)
report["preview"].append({
"row": i,
"values": values if values else {},
"error": err,
})
if dry_run:
# Dry run: dont write, just validate basic structure
bad = [p for p in report["preview"] if p["error"]]
if bad:
report["errors"].extend(p["error"] for p in bad if p["error"])
report["ok"] = len(report["errors"]) == 0
return report
# Real import (create new rows).
# If you want update/merge behavior, add a key strategy here.
created = 0
updated = 0
skipped = 0
errors: List[str] = []
with transaction.atomic():
for idx, row in enumerate(rows, start=1):
e, values, err = row_to_obj(idx, row)
if err:
errors.append(err)
skipped += 1
# Normalize rows length (pad/trim) and build preview (first 10)
normalized_rows: List[List[str]] = []
for r in data_rows:
if not r or all((c or "").strip() == "" for c in r):
continue
if len(r) < len(EXPECTED_HEADERS):
r = r + [""] * (len(EXPECTED_HEADERS) - len(r))
elif len(r) > len(EXPECTED_HEADERS):
r = r[:len(EXPECTED_HEADERS)]
normalized_rows.append(r)
report["total"] = len(normalized_rows)
report["preview"] = normalized_rows[:10] # show first 10 rows exactly as seen

    if dry_run or report["total"] == 0:
        return report  # preview only

    # Create entries in batches (transactional)
    to_create: List[Entry] = []
    for r in normalized_rows:
        try:
            obj = Entry(
                subject=(r[0] or "").strip(),
                illustration=(r[1] or "").strip(),
                application=(r[2] or "").strip(),
                scripture_raw=(r[3] or "").strip(),
                source=(r[4] or "").strip(),
                talk_title=(r[5] or "").strip(),
                talk_number=_parse_int(r[6]),
                entry_code=(r[7] or "").strip(),
                date_added=_parse_date(r[8]),
                date_edited=_parse_date(r[9]),
            )
            to_create.append(obj)
        except Exception as e:
            report["skipped"] += 1
            report["errors"].append(f"Row skipped due to error: {e}")

        # Flush a full batch inside its own transaction
        if len(to_create) >= batch_size:
            with transaction.atomic():
                Entry.objects.bulk_create(to_create, batch_size=batch_size)
            report["created"] += len(to_create)
            to_create.clear()

    # Flush whatever remains after the loop
    if to_create:
        with transaction.atomic():
            Entry.objects.bulk_create(to_create, batch_size=batch_size)
        report["created"] += len(to_create)
        to_create.clear()

    return report

# small context manager used above
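
As a quick reference for the call contract above, here is a dry-run sketch with an in-memory CSV; the sample row is invented purely for illustration and is not part of this commit.

# Sketch only: exercising the dry-run path with an in-memory CSV (no DB writes).
sample = (
    "Subject,Illustration,Application,Scripture,Source,"
    "Talk Title,Talk Number,Code,Date,Date Edited\n"
    "Faith,Anchor illustration,Hold fast,Heb 6:19,Personal notes,"
    "Sure Hope,12,A-12,2025-08-01,2025-08-02\n"
).encode("utf-8")

report = import_csv_bytes(sample, dry_run=True)
# report["total"] == 1, report["preview"] holds the one normalized row,
# and report["created"] stays 0 because nothing is written on a dry run.
# A second call with dry_run=False performs the batched bulk_create.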
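
The docstring leaves upserts as a later option, keyed on entry_code or (talk_number, entry_code). Below is a minimal sketch of that variant, assuming entry_code is a usable unique key; _upsert_rows is a hypothetical helper, and the field names simply mirror the Entry(...) call above.

# Sketch only: create-or-update keyed on entry_code (not part of this commit).
def _upsert_rows(normalized_rows: List[List[str]], report: Dict[str, Any]) -> None:
    for r in normalized_rows:
        code = (r[7] or "").strip()
        if not code:
            report["skipped"] += 1
            continue  # no stable key -> skip rather than guess
        defaults = {
            "subject": (r[0] or "").strip(),
            "illustration": (r[1] or "").strip(),
            "application": (r[2] or "").strip(),
            "scripture_raw": (r[3] or "").strip(),
            "source": (r[4] or "").strip(),
            "talk_title": (r[5] or "").strip(),
            "talk_number": _parse_int(r[6]),
            "date_added": _parse_date(r[8]),
            "date_edited": _parse_date(r[9]),
        }
        with transaction.atomic():
            _, created = Entry.objects.update_or_create(entry_code=code, defaults=defaults)
        report["created" if created else "updated"] += 1

Per-row update_or_create trades the throughput of bulk_create for idempotent re-imports; it could be batched the same way if volume matters.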