diff --git a/web/core/utils.py b/web/core/utils.py
index d021332..1f92b8c 100644
--- a/web/core/utils.py
+++ b/web/core/utils.py
@@ -336,158 +336,181 @@ def _to_int_or_none(v: Any) -> Optional[int]:
     return None
 
 
-def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
-    """
-    Robust CSV importer for Entry.
+import csv
+import io
+from datetime import datetime
+from typing import Optional, List, Dict, Any
 
-    - Accepts your human-readable header (Subject, Illustration, ...)
-      and/or direct model field names.
-    - Normalizes odd headers like r."Talk Title".
-    - Handles BOM & dialect sniffing.
-    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
+from django.db import transaction
+
+from .models import Entry
+
+# Canonical header order expected from the CSV (and shown in the UI)
+EXPECTED_HEADERS = [
+    "Subject", "Illustration", "Application", "Scripture", "Source",
+    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
+]
+
+def _clean_header_cell(s: str) -> str:
+    if s is None:
+        return ""
+    s = str(s).strip()
+    # Handle odd prefixes like r:"Talk Title"
+    low = s.lower()
+    if low.startswith("r:") or low.startswith("r="):
+        s = s[2:].lstrip()
+    # Strip wrapping quotes
+    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
+        s = s[1:-1]
+    return s.strip()
+
+def _parse_int(x: str) -> Optional[int]:
+    x = (x or "").strip()
+    if not x:
+        return None
+    try:
+        return int(x)
+    except Exception:
+        return None
+
+def _parse_date(x: str):
     """
-    report: Dict[str, Any] = {
-        "ok": False,
+    Returns a date object or None.
+    Tries several common formats, then ISO.
+    """
+    x = (x or "").strip()
+    if not x:
+        return None
+    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"):
+        try:
+            return datetime.strptime(x, fmt).date()
+        except Exception:
+            pass
+    try:
+        return datetime.fromisoformat(x).date()
+    except Exception:
+        return None
+
+def import_csv_bytes(content: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
+    """
+    Parse the uploaded CSV (bytes), optionally write to DB.
+    Returns a report dict the templates expect:
+
+    {
+        "total": <int>,
+        "created": <int>,
+        "updated": 0,
+        "skipped": <int>,
+        "errors": [ ... ],
+        "preview": [ [cell, ...], ... up to 10 rows ],
+        "columns": EXPECTED_HEADERS,
+    }
+
+    Notes:
+    - This implementation always CREATES new rows (no dedupe).
+      If you want upserts later, we can key on entry_code or (talk_number, entry_code).
+ """ + report = { + "total": 0, "created": 0, "updated": 0, "skipped": 0, - "errors": [], # list[str] - "preview": [], # first ~10 rows that would be imported - "total_rows": 0, - "header": [], + "errors": [], + "preview": [], + "columns": EXPECTED_HEADERS[:], } - # --- decode safely (remove BOM, keep unknowns) --- - text = data.decode("utf-8-sig", errors="replace") - - # --- sniff dialect; fall back to excel --- + # Decode once (BOM-safe), sniff dialect, fall back to excel + text = content.decode("utf-8-sig", errors="replace") try: - sample = "\n".join(text.splitlines()[:10]) - dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel + first_line = text.splitlines()[0] if text else "" + dialect = csv.Sniffer().sniff(first_line) if first_line else csv.excel except Exception: dialect = csv.excel - rdr = csv.reader(io.StringIO(text), dialect) + rows = list(csv.reader(io.StringIO(text), dialect)) + if not rows: + return report # empty file - try: - raw_header = next(rdr, []) - except Exception as e: - report["errors"].append(f"Failed reading header: {e}") - return report + # Header handling (tolerant) + first = rows[0] + norm_first = [_clean_header_cell(c).lower() for c in first] + expected_norm = [h.lower() for h in EXPECTED_HEADERS] + header_ok = (norm_first == expected_norm) - # Clean & map header - cleaned = [_clean_header_token(h) for h in raw_header] - mapped: List[str] = [] - unknowns: List[str] = [] - for token in cleaned: - target = ACCEPTABLE_HEADERS.get(token) - if target: - mapped.append(target) + if header_ok: + data_rows = rows[1:] + else: + # If first row isn't a match but the column count matches, treat it as data + if len(first) == len(EXPECTED_HEADERS): + data_rows = rows # treat all rows as data; we'll use EXPECTED order else: - unknowns.append(token or "(empty)") - - # If header doesn't match expected width but row count does, assume *no* header; - # inject expected header so downstream works. - has_header = True - if unknowns: - # Heuristic: if the number of columns equals EXPECTED_HEADERS and *none* - # of the cleaned tokens map, it's probably a data row (no header) - matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS) - if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS): - # inject expected header and re-run - has_header = False - mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS] - # rebuild a reader with the expected header injected - sio = io.StringIO(text) - rdr_tmp = csv.reader(sio, dialect) - rows = list(rdr_tmp) - rows.insert(0, EXPECTED_HEADERS) # inject pretty header for report - rdr = iter(rows) # consume from this list iterator - next(rdr, None) # skip our injected header - else: - # keep going but warn in the report - report["errors"].append( - "Some header columns were not recognized: " - + ", ".join(unknowns) - + " (continuing with best-effort mapping)" - ) - - report["header"] = mapped - - # Read rows - rows = list(rdr) - report["total_rows"] = len(rows) - - # Build row dicts - def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]: - """ - Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None) - but does not save to DB. - """ - if len(row) < len(mapped): - return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}." 
-        values: Dict[str, Any] = {}
-        for i, field in enumerate(mapped):
-            raw_val = row[i] if i < len(row) else ""
-            # Coerce types for specific fields
-            if field in ("date_added", "date_edited"):
-                values[field] = _parse_date(raw_val)
-            elif field == "talk_number":
-                values[field] = _to_int_or_none(raw_val)
+            # Try common alternate delimiters to recover
+            for delim in (";", "\t"):
+                rows2 = list(csv.reader(io.StringIO(text), delimiter=delim))
+                if rows2 and len(rows2[0]) == len(EXPECTED_HEADERS):
+                    rows = rows2
+                    first = rows[0]
+                    norm_first = [_clean_header_cell(c).lower() for c in first]
+                    header_ok = (norm_first == expected_norm)
+                    data_rows = rows[1:] if header_ok else rows
+                    break
             else:
-                values[field] = (raw_val or "").strip()
+                # Could not reconcile columns
+                report["errors"].append(
+                    f"Column mismatch: saw {len(first)} but expected {len(EXPECTED_HEADERS)}."
+                )
+                return report
 
-        # Create (unsaved) Entry instance for preview/validation
-        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
-        return e, values, None
+    # Normalize rows length (pad/trim) and build preview (first 10)
+    normalized_rows: List[List[str]] = []
+    for r in data_rows:
+        if not r or all((c or "").strip() == "" for c in r):
+            continue
+        if len(r) < len(EXPECTED_HEADERS):
+            r = r + [""] * (len(EXPECTED_HEADERS) - len(r))
+        elif len(r) > len(EXPECTED_HEADERS):
+            r = r[:len(EXPECTED_HEADERS)]
+        normalized_rows.append(r)
 
-    # Preview first few
-    for i, row in enumerate(rows[:10], start=1):
-        e, values, err = row_to_obj(i, row)
-        report["preview"].append({
-            "row": i,
-            "values": values if values else {},
-            "error": err,
-        })
+    report["total"] = len(normalized_rows)
+    report["preview"] = normalized_rows[:10]  # show first 10 rows exactly as seen
+    if dry_run or report["total"] == 0:
+        return report  # preview only
 
-    if dry_run:
-        # Dry run: don’t write, just validate basic structure
-        bad = [p for p in report["preview"] if p["error"]]
-        if bad:
-            report["errors"].extend(p["error"] for p in bad if p["error"])
-        report["ok"] = len(report["errors"]) == 0
-        return report
+    # Create entries in batches (transactional)
+    to_create: List[Entry] = []
+    for r in normalized_rows:
+        try:
+            obj = Entry(
+                subject=(r[0] or "").strip(),
+                illustration=(r[1] or "").strip(),
+                application=(r[2] or "").strip(),
+                scripture_raw=(r[3] or "").strip(),
+                source=(r[4] or "").strip(),
+                talk_title=(r[5] or "").strip(),
+                talk_number=_parse_int(r[6]),
+                entry_code=(r[7] or "").strip(),
+                date_added=_parse_date(r[8]),
+                date_edited=_parse_date(r[9]),
+            )
+            to_create.append(obj)
+        except Exception as e:
+            report["skipped"] += 1
+            report["errors"].append(f"Row skipped due to error: {e}")
 
-    # Real import (create new rows).
-    # If you want update/merge behavior, add a key strategy here.
-    created = 0
-    updated = 0
-    skipped = 0
-    errors: List[str] = []
+        if len(to_create) >= batch_size:
+            with transaction.atomic():
+                Entry.objects.bulk_create(to_create, batch_size=batch_size)
+            report["created"] += len(to_create)
+            to_create.clear()
 
-    with transaction.atomic():
-        for idx, row in enumerate(rows, start=1):
-            e, values, err = row_to_obj(idx, row)
-            if err:
-                errors.append(err)
-                skipped += 1
-                continue
+    if to_create:
+        with transaction.atomic():
+            Entry.objects.bulk_create(to_create, batch_size=batch_size)
+        report["created"] += len(to_create)
+        to_create.clear()
 
-            try:
-                # Simple create-only behavior:
-                Entry.objects.create(**values)
-                created += 1
-            except Exception as ex:
-                errors.append(f"Row {idx}: failed to save ({ex})")
-                skipped += 1
-
-    report.update({
-        "ok": len(errors) == 0,
-        "created": created,
-        "updated": updated,
-        "skipped": skipped,
-        "errors": errors,
-    })
     return report
 
 
 # small context manager used above
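
Follow-up note (not part of the patch): the docstring above mentions keying upserts on entry_code or (talk_number, entry_code). Below is a minimal sketch of that variant, assuming entry_code is stable enough to act as a natural key; it reuses the module's Entry, _parse_int, and _parse_date, the _upsert_rows helper name is hypothetical, and batching/transaction handling is omitted.

    # Sketch only: swap the bulk_create path for an upsert keyed on entry_code.
    # Assumption: entry_code is a usable natural key; rows with a blank code are still created.
    def _upsert_rows(normalized_rows: List[List[str]], report: Dict[str, Any]) -> None:
        for r in normalized_rows:
            fields = {
                "subject": (r[0] or "").strip(),
                "illustration": (r[1] or "").strip(),
                "application": (r[2] or "").strip(),
                "scripture_raw": (r[3] or "").strip(),
                "source": (r[4] or "").strip(),
                "talk_title": (r[5] or "").strip(),
                "talk_number": _parse_int(r[6]),
                "date_added": _parse_date(r[8]),
                "date_edited": _parse_date(r[9]),
            }
            code = (r[7] or "").strip()
            if not code:
                # No key to match on: fall back to plain creation.
                Entry.objects.create(entry_code=code, **fields)
                report["created"] += 1
                continue
            # update_or_create returns (obj, created_flag).
            _, created = Entry.objects.update_or_create(entry_code=code, defaults=fields)
            report["created" if created else "updated"] += 1

This trades the speed of bulk_create (one query per row instead of one per batch) for idempotent re-imports; the dry_run preview path would be unchanged.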