Update web/core/utils.py
parent f64d41313e
commit a1b174adc3
@@ -11,6 +11,42 @@ from django.db import transaction, IntegrityError, DataError, DatabaseError
 from .models import Entry
 
+EXPECTED_HEADERS: List[str] = [
+    "Subject", "Illustration", "Application", "Scripture", "Source",
+    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
+]
+
+# Map CSV header labels -> Entry model field names
+HEADER_MAP: Dict[str, str] = {
+    "Subject": "subject",
+    "Illustration": "illustration",
+    "Application": "application",
+    "Scripture": "scripture_raw",
+    "Source": "source",
+    "Talk Title": "talk_title",
+    "Talk Number": "talk_number",
+    "Code": "entry_code",
+    "Date": "date_added",
+    "Date Edited": "date_edited",
+}
+
+# Accept both the pretty labels *and* the actual model field names
+# (lets you import older dumps or hand-made files)
+ACCEPTABLE_HEADERS: Dict[str, str] = {
+    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},
+    # direct model names also OK
+    "subject": "subject",
+    "illustration": "illustration",
+    "application": "application",
+    "scripture_raw": "scripture_raw",
+    "source": "source",
+    "talk_title": "talk_title",
+    "talk_number": "talk_number",
+    "entry_code": "entry_code",
+    "date_added": "date_added",
+    "date_edited": "date_edited",
+}
+
+
 # ============================
 # Search helpers (used by views)
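
How the two tables above compose, as a quick sketch for review (not part of the commit; dicts trimmed to two fields):

HEADER_MAP = {"Subject": "subject", "Talk Title": "talk_title"}  # trimmed
EXPECTED_HEADERS = list(HEADER_MAP)
ACCEPTABLE_HEADERS = {
    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},  # pretty labels, lowered
    "subject": "subject", "talk_title": "talk_title",        # direct field names
}
assert ACCEPTABLE_HEADERS["talk title"] == "talk_title"  # from "Talk Title"
assert ACCEPTABLE_HEADERS["talk_title"] == "talk_title"  # raw model name
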
@@ -215,149 +251,6 @@ def _coerce_int(val: str):
     return None
 
 
-def import_csv_bytes(b: bytes, dry_run: bool = False, commit_every: int = 500) -> Dict[str, object]:
-    """
-    Robust CSV import. Commits each row in its own transaction so that one bad
-    row does not poison the entire import (avoids TransactionManagementError cascades).
-
-    Returns a report dict with counts and first-line error messages.
-    """
-    text = _decode_bytes(b)
-    dialect = _sniff_dialect(text)
-    delimiter = getattr(dialect, "delimiter", ",")
-
-    # --- headers ---
-    f = io.StringIO(text)
-    reader = csv.reader(f, dialect=dialect)
-    try:
-        raw_headers = next(reader)
-    except StopIteration:
-        return {
-            "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
-            "scripture_parsed": 0, "scripture_failed": 0,
-            "dialect_delimiter": delimiter, "used_headerless_mode": False,
-            "seen_headers": []
-        }
-
-    headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient(
-        ",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
-    )
-    header_map = _build_header_map(headers)
-
-    # Pair raw lines so we can repair rows mis-split by csv
-    raw_lines = text.splitlines()[1:]  # skip header
-
-    dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect)
-    next(dict_reader, None)  # skip header
-
-    total = inserted = updated = skipped = 0
-    errors: List[str] = []
-    scripture_ok = scripture_bad = 0
-
-    # Import loop (row-by-row atomic)
-    for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2):
-        total += 1
-
-        # Repair if DictReader got the wrong shape (inconsistent quotes in source)
-        if len(row) != EXPECTED_COLS or None in row:
-            cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS)
-            row = dict(zip(headers, cells))
-
-        # Extract canonical fields
-        subject = _getv(row, header_map, "subject").strip()
-        illustration = _getv(row, header_map, "illustration").strip()
-        application = _getv(row, header_map, "application").strip()
-        scripture = _getv(row, header_map, "scripture").strip()
-        source = _getv(row, header_map, "source").strip()
-        talk_title = _getv(row, header_map, "talk title").strip()
-        talk_number = _coerce_int(_getv(row, header_map, "talk number"))
-        entry_code = _getv(row, header_map, "code").strip()
-        date_added = _parse_date(_getv(row, header_map, "date"))
-        date_edited = _parse_date(_getv(row, header_map, "date edited"))
-
-        # Skip rows with no meaningful text
-        if not (subject or illustration or application):
-            skipped += 1
-            continue
-
-        # Clip to DB lengths
-        subject = _clip("subject", subject)
-        illustration = _clip("illustration", illustration)
-        application = _clip("application", application)
-        scripture = _clip("scripture_raw", scripture)
-        source = _clip("source", source)
-        talk_title = _clip("talk_title", talk_title)
-        entry_code = _clip("entry_code", entry_code)
-
-        scripture_ok += 1 if scripture else 0
-        scripture_bad += 0 if scripture else 1
-
-        # Upsert key: prefer entry_code; else (subject + illustration)
-        lookup: Dict[str, object] = {}
-        if entry_code:
-            lookup["entry_code"] = entry_code
-        else:
-            lookup["subject"] = subject
-            lookup["illustration"] = illustration
-
-        if dry_run:
-            exists = Entry.objects.filter(**lookup).exists()
-            inserted += 0 if exists else 1
-            updated += 1 if exists else 0
-            continue
-
-        try:
-            # Isolate each row so a failure rolls back only that row
-            with transaction.atomic():
-                obj = Entry.objects.filter(**lookup).first()
-                created = False
-                if not obj:
-                    obj = Entry(**lookup)
-                    created = True
-
-                obj.subject = subject
-                obj.illustration = illustration
-                obj.application = application
-                obj.scripture_raw = scripture
-                obj.source = source
-                obj.talk_title = talk_title
-                obj.talk_number = talk_number
-                if entry_code:
-                    obj.entry_code = entry_code
-                if date_added:
-                    obj.date_added = date_added
-                if date_edited:
-                    obj.date_edited = date_edited
-
-                obj.save()
-
-            inserted += 1 if created else 0
-            updated += 0 if created else 1
-
-        except (IntegrityError, DataError, DatabaseError, ValueError) as e:
-            msg = str(e).splitlines()[0]
-            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
-            skipped += 1
-            # continue to next row
-
-    return {
-        "rows": total,
-        "inserted": inserted,
-        "updated": updated,
-        "skipped": skipped,
-        "errors": errors,
-        "scripture_parsed": scripture_ok,
-        "scripture_failed": scripture_bad,
-        "dialect_delimiter": delimiter,
-        "used_headerless_mode": False,
-        "seen_headers": headers,
-    }
-
-EXPECTED_HEADERS = [
-    "Subject","Illustration","Application","Scripture","Source",
-    "Talk Title","Talk Number","Code","Date","Date Edited"
-]
-
 def _to_int_or_none(s: str) -> Optional[int]:
     s = (s or "").strip()
     if not s:
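
The importer deleted above leaned on per-row transactions, and that pattern is worth keeping in mind for bulk imports in Django. A minimal sketch of the idiom (save_rows is hypothetical; Entry and a unique entry_code key are assumptions taken from this file):

from django.db import IntegrityError, transaction
from .models import Entry

def save_rows(rows):
    """Per-row savepoints: a bad row rolls back alone, so the outer
    transaction never hits the TransactionManagementError cascade."""
    errors = []
    for lineno, values in enumerate(rows, start=2):  # line 1 is the header
        try:
            with transaction.atomic():  # a savepoint when already inside a transaction
                Entry.objects.update_or_create(
                    entry_code=values["entry_code"], defaults=values,
                )
        except IntegrityError as e:
            errors.append(f"line {lineno}: {e}")  # record and continue
    return errors
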
@@ -378,96 +271,224 @@ def _to_date_or_none(s: str) -> Optional[datetime.date]:
             pass
     return None  # let caller decide if this is acceptable
 
-def import_csv_bytes(data: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
+def _clean_header_token(s: Any) -> str:
     """
-    Robust CSV importer for Entries.
-    - data: raw bytes of the uploaded file
-    - dry_run: when True, do not write to DB; return preview + errors
-    - batch_size: bulk_create chunk size
-    Returns: dict(report=..., rows=preview_rows, errors=[...])
+    Make a header token safe/normalized:
+    - None -> ""
+    - trim spaces
+    - strip surrounding single/double quotes
+    - drop weird prefixes like r:"Talk Title" or r.'Talk Title'
+    - lowercase for matching
     """
-    text = io.TextIOWrapper(io.BytesIO(data), encoding="utf-8-sig", newline="")
-    reader = csv.reader(text)
+    s = "" if s is None else str(s)
+    s = s.strip()
+    # strip surrounding quotes
+    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
+        s = s[1:-1]
+    # drop r: or r. prefix some CSV tools add
+    if s[:2].lower() in ("r:", "r."):
+        s = s[2:].lstrip()
+    return s.strip().lower()
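
A few concrete inputs and what the normalizer turns them into (illustrative, not in the commit):

# Assuming _clean_header_token as defined above:
assert _clean_header_token(None) == ""
assert _clean_header_token('  "Talk Title"  ') == "talk title"
assert _clean_header_token("r:Talk Title") == "talk title"
# Note the order: quotes are stripped before the prefix, so the combined
# form r:"Talk Title" comes out as '"talk title"' and will not match.
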
 
-    # Read header row
+
+_DATE_FORMATS = (
+    "%Y-%m-%d",
+    "%m/%d/%Y",
+    "%m/%d/%y",
+    "%d-%b-%Y",  # 05-Sep-2024
+    "%Y/%m/%d",
+)
+
+
+def _parse_date(val: str) -> Optional[datetime.date]:
+    if not val:
+        return None
+    txt = str(val).strip()
+    # Accept ISO-like with time: 2024-01-02T00:00:00
+    if "T" in txt:
+        try:
+            return datetime.fromisoformat(txt).date()
+        except Exception:
+            pass
+    for fmt in _DATE_FORMATS:
+        try:
+            return datetime.strptime(txt, fmt).date()
+        except Exception:
+            continue
+    # as a last resort, try only year-month-day pieces
     try:
-        header = next(reader)
-    except StopIteration:
-        return {"report": "Empty file.", "rows": [], "errors": ["File is empty."]}
+        parts = [int(p) for p in txt.replace("/", "-").split("-")]
+        if len(parts) >= 3:
+            return datetime(parts[0], parts[1], parts[2]).date()
+    except Exception:
+        pass
+    return None
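
Roughly what the fallback chain accepts, as a sketch (assuming _parse_date as defined above):

from datetime import date
assert _parse_date("2024-01-02T00:00:00") == date(2024, 1, 2)  # ISO with time
assert _parse_date("09/05/2024") == date(2024, 9, 5)           # "%m/%d/%Y"
assert _parse_date("05-Sep-2024") == date(2024, 9, 5)          # "%d-%b-%Y"
assert _parse_date("nonsense") is None                         # every branch failed
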
 
-    # Loose header check: either exact match, or map by index if close
-    header_norm = [h.strip() for h in header]
-    if header_norm != EXPECTED_HEADERS:
-        return {
-            "report": "Header mismatch.",
-            "rows": [],
-            "errors": [
-                "Expected header: " + ", ".join(EXPECTED_HEADERS),
-                "Found header: " + ", ".join(header_norm),
-            ],
-        }
-
-    to_create: List[Entry] = []
+def _to_int_or_none(v: Any) -> Optional[int]:
+    if v is None:
+        return None
+    s = str(v).strip()
+    if s == "":
+        return None
+    try:
+        return int(float(s))  # tolerate "123.0"
+    except Exception:
+        return None
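
The coercion in practice (illustrative, assuming _to_int_or_none as defined above):

assert _to_int_or_none(" 42 ") == 42
assert _to_int_or_none("123.0") == 123   # int(float(...)) tolerates spreadsheet floats
assert _to_int_or_none("") is None
assert _to_int_or_none("12a") is None    # unparseable -> None, not an exception
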
 
 
+def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
+    """
+    Robust CSV importer for Entry.
+
+    - Accepts your human-readable header (Subject, Illustration, ...)
+      and/or direct model field names.
+    - Normalizes odd headers like r."Talk Title".
+    - Handles BOM & dialect sniffing.
+    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
+    """
+    report: Dict[str, Any] = {
+        "ok": False,
+        "created": 0,
+        "updated": 0,
+        "skipped": 0,
+        "errors": [],   # list[str]
+        "preview": [],  # first ~10 rows that would be imported
+        "total_rows": 0,
+        "header": [],
+    }
+
+    # --- decode safely (remove BOM, keep unknowns) ---
+    text = data.decode("utf-8-sig", errors="replace")
+
+    # --- sniff dialect; fall back to excel ---
+    try:
+        sample = "\n".join(text.splitlines()[:10])
+        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
+    except Exception:
+        dialect = csv.excel
+
+    rdr = csv.reader(io.StringIO(text), dialect)
+
+    try:
+        raw_header = next(rdr, [])
+    except Exception as e:
+        report["errors"].append(f"Failed reading header: {e}")
+        return report
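
The decode-and-sniff combination handles BOM-prefixed Excel exports and odd delimiters; a standalone sketch of the same approach using only the stdlib (open_csv is a hypothetical helper, not in the commit):

import csv, io

def open_csv(data: bytes):
    """Decode with BOM stripping, sniff the delimiter, fall back to excel."""
    text = data.decode("utf-8-sig", errors="replace")
    sample = "\n".join(text.splitlines()[:10])
    try:
        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
    except csv.Error:
        dialect = csv.excel  # Sniffer gives up on e.g. single-column files
    return csv.reader(io.StringIO(text), dialect)

rows = list(open_csv(b"\xef\xbb\xbfSubject;Illustration\na;b\n"))
assert rows == [["Subject", "Illustration"], ["a", "b"]]
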
# Clean & map header
|
||||
cleaned = [_clean_header_token(h) for h in raw_header]
|
||||
mapped: List[str] = []
|
||||
unknowns: List[str] = []
|
||||
for token in cleaned:
|
||||
target = ACCEPTABLE_HEADERS.get(token)
|
||||
if target:
|
||||
mapped.append(target)
|
||||
else:
|
||||
unknowns.append(token or "(empty)")
|
||||
|
||||
# If header doesn't match expected width but row count does, assume *no* header;
|
||||
# inject expected header so downstream works.
|
||||
has_header = True
|
||||
if unknowns:
|
||||
# Heuristic: if the number of columns equals EXPECTED_HEADERS and *none*
|
||||
# of the cleaned tokens map, it's probably a data row (no header)
|
||||
matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS)
|
||||
if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS):
|
||||
# inject expected header and re-run
|
||||
has_header = False
|
||||
mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS]
|
||||
# rebuild a reader with the expected header injected
|
||||
sio = io.StringIO(text)
|
||||
rdr_tmp = csv.reader(sio, dialect)
|
||||
rows = list(rdr_tmp)
|
||||
rows.insert(0, EXPECTED_HEADERS) # inject pretty header for report
|
||||
rdr = iter(rows) # consume from this list iterator
|
||||
next(rdr, None) # skip our injected header
|
||||
else:
|
||||
# keep going but warn in the report
|
||||
report["errors"].append(
|
||||
"Some header columns were not recognized: "
|
||||
+ ", ".join(unknowns)
|
||||
+ " (continuing with best-effort mapping)"
|
||||
)
|
||||
|
||||
report["header"] = mapped
|
||||
|
||||
# Read rows
|
||||
rows = list(rdr)
|
||||
report["total_rows"] = len(rows)
|
||||
|
||||
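
The no-header heuristic in isolation (a sketch; tables trimmed to two columns):

EXPECTED = ["Subject", "Illustration"]
ACCEPTABLE = {"subject": "subject", "illustration": "illustration"}

def looks_headerless(first_row):
    cleaned = [c.strip().lower() for c in first_row]
    matches = sum(1 for t in cleaned if t in ACCEPTABLE)
    return matches == 0 and len(cleaned) == len(EXPECTED)

assert looks_headerless(["On patience", "Pine trees grow slowly"])  # data row
assert not looks_headerless(["Subject", "Illustration"])            # real header
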
+    # Build row dicts
+    def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
+        """
+        Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
+        but does not save to DB.
+        """
+        if len(row) < len(mapped):
+            return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
+        values: Dict[str, Any] = {}
+        for i, field in enumerate(mapped):
+            raw_val = row[i] if i < len(row) else ""
+            # Coerce types for specific fields
+            if field in ("date_added", "date_edited"):
+                values[field] = _parse_date(raw_val)
+            elif field == "talk_number":
+                values[field] = _to_int_or_none(raw_val)
+            else:
+                values[field] = (raw_val or "").strip()
+
+        # Create (unsaved) Entry instance for preview/validation
+        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
+        return e, values, None
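
The per-field dispatch row_to_obj applies, reduced to a standalone sketch (coerce is hypothetical; assumes _parse_date and _to_int_or_none as defined above):

def coerce(field: str, raw: str):
    if field in ("date_added", "date_edited"):
        return _parse_date(raw)      # date or None
    if field == "talk_number":
        return _to_int_or_none(raw)  # int or None
    return (raw or "").strip()       # everything else stays text

assert coerce("talk_number", "12.0") == 12
assert coerce("subject", "  Patience ") == "Patience"
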
+    # Preview first few
+    for i, row in enumerate(rows[:10], start=1):
+        e, values, err = row_to_obj(i, row)
+        report["preview"].append({
+            "row": i,
+            "values": values if values else {},
+            "error": err,
+        })
+
+    if dry_run:
+        # Dry run: don't write, just validate basic structure
+        bad = [p for p in report["preview"] if p["error"]]
+        if bad:
+            report["errors"].extend(p["error"] for p in bad if p["error"])
+        report["ok"] = len(report["errors"]) == 0
+        return report
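
For callers, the dry-run return value has this shape (field names from the code above; the values here are made up):

report = {
    "ok": True, "created": 0, "updated": 0, "skipped": 0,
    "errors": [],
    "preview": [{"row": 1, "values": {"subject": "Patience"}, "error": None}],
    "total_rows": 1,
    "header": ["subject", "illustration", "application"],
}
assert report["ok"] and not report["errors"]
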
+    # Real import (create new rows).
+    # If you want update/merge behavior, add a key strategy here.
+    created = 0
+    updated = 0
+    skipped = 0
+    errors: List[str] = []
-    preview: List[Tuple[int, Dict[str, Any]]] = []  # first 100 rows for the UI
-    rownum = 1
 
-    def make_entry(row: List[str]) -> Optional[Entry]:
-        # force length to 10, padding if needed
-        padded = row + [""] * (10 - len(row))
-        subj, ill, app, scr, src, talk_title, talk_num, code, d_added, d_edited = padded[:10]
+    with transaction.atomic():
+        for idx, row in enumerate(rows, start=1):
+            e, values, err = row_to_obj(idx, row)
+            if err:
+                errors.append(err)
+                skipped += 1
+                continue
 
-        e = Entry(
-            subject=(subj or "").strip(),
-            illustration=(ill or "").strip(),
-            application=(app or "").strip(),
-            scripture_raw=(scr or "").strip(),
-            source=(src or "").strip(),
-            talk_title=(talk_title or "").strip(),
-            talk_number=_to_int_or_none(talk_num),
-            entry_code=(code or "").strip(),
-            date_added=_to_date_or_none(d_added),
-            date_edited=_to_date_or_none(d_edited),
-        )
-        return e
-
-    created_total = 0
-    with (transaction.atomic() if not dry_run else _noop_context()):
-        for row in reader:
-            rownum += 1
             try:
-                e = make_entry(row)
-                # (optional) add required-field checks; e.g., at least one of illustration/application
-                if not ((e.illustration and e.illustration.strip()) or (e.application and e.application.strip())):
-                    errors.append(f"Row {rownum}: missing Illustration and Application")
-                    continue
-
-                to_create.append(e)
-
-                if len(preview) < 100:
-                    preview.append((rownum, {
-                        "Subject": e.subject, "Illustration": e.illustration[:120],
-                        "Application": e.application[:120], "Scripture": e.scripture_raw,
-                        "Source": e.source, "Talk Title": e.talk_title,
-                        "Talk Number": e.talk_number, "Code": e.entry_code,
-                        "Date": e.date_added, "Date Edited": e.date_edited,
-                    }))
-
-                if not dry_run and len(to_create) >= batch_size:
-                    Entry.objects.bulk_create(to_create, batch_size=batch_size)
-                    created_total += len(to_create)
-                    to_create.clear()
-
+                # Simple create-only behavior:
+                Entry.objects.create(**values)
+                created += 1
             except Exception as ex:
-                errors.append(f"Row {rownum}: {ex}")
+                errors.append(f"Row {idx}: failed to save ({ex})")
+                skipped += 1
 
-    if not dry_run and to_create:
-        Entry.objects.bulk_create(to_create, batch_size=batch_size)
-        created_total += len(to_create)
-        to_create.clear()
-
-    report = f"{'Would import' if dry_run else 'Imported'} {created_total if not dry_run else len(preview)}+ rows."
-    return {"report": report, "rows": preview, "errors": errors}
+    report.update({
+        "ok": len(errors) == 0,
+        "created": created,
+        "updated": updated,
+        "skipped": skipped,
+        "errors": errors,
+    })
+    return report
 
-# small context manager used above
-class _noop_context:
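
The deleted branch's bulk_create batching is still the faster path if create-only imports grow large; a minimal sketch of that idiom (bulk_import is hypothetical; Entry import assumed app-relative as in this module):

from .models import Entry

def bulk_import(entries, batch_size=1000):
    """Accumulate unsaved Entry instances and flush in chunks."""
    buf, created = [], 0
    for e in entries:
        buf.append(e)
        if len(buf) >= batch_size:
            Entry.objects.bulk_create(buf, batch_size=batch_size)
            created += len(buf)
            buf.clear()
    if buf:  # final partial batch
        Entry.objects.bulk_create(buf, batch_size=batch_size)
        created += len(buf)
    return created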