Update web/core/utils.py

Joshua Laymon 2025-08-22 13:57:58 +00:00
parent cc6461ced2
commit 17c9a89848


@@ -336,158 +336,181 @@ def _to_int_or_none(v: Any) -> Optional[int]:
    return None


import csv
import io
from datetime import datetime
from typing import Optional, List, Dict, Any

from django.db import transaction

from .models import Entry


# Canonical header order expected from the CSV (and shown in the UI)
EXPECTED_HEADERS = [
    "Subject", "Illustration", "Application", "Scripture", "Source",
    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
]


def _clean_header_cell(s: str) -> str:
    if s is None:
        return ""
    s = str(s).strip()
    # Handle odd prefixes like r:"Talk Title"
    low = s.lower()
    if low.startswith("r:") or low.startswith("r="):
        s = s[2:].lstrip()
    # Strip wrapping quotes
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
        s = s[1:-1]
    return s.strip()


def _parse_int(x: str) -> Optional[int]:
    x = (x or "").strip()
    if not x:
        return None
    try:
        return int(x)
    except Exception:
        return None


def _parse_date(x: str):
    """
    Returns a date object or None.
    Tries several common formats, then ISO.
    """
    x = (x or "").strip()
    if not x:
        return None
    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(x, fmt).date()
        except Exception:
            pass
    try:
        return datetime.fromisoformat(x).date()
    except Exception:
        return None


def import_csv_bytes(content: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
    """
    Parse the uploaded CSV (bytes), optionally write to the DB.

    Returns a report dict the templates expect:
        {
            "total": <int>,
            "created": <int>,
            "updated": 0,
            "skipped": <int>,
            "errors": [ ... ],
            "preview": [ [cell, ...], ... up to 10 rows ],
            "columns": EXPECTED_HEADERS,
        }

    Notes:
    - This implementation always CREATES new rows (no dedupe).
      If you want upserts later, we can key on entry_code or (talk_number, entry_code).
    """
    report = {
        "total": 0,
        "created": 0,
        "updated": 0,
        "skipped": 0,
        "errors": [],
        "preview": [],
        "columns": EXPECTED_HEADERS[:],
    }

    # Decode once (BOM-safe), sniff the dialect, fall back to excel
    text = content.decode("utf-8-sig", errors="replace")
    try:
        first_line = text.splitlines()[0] if text else ""
        dialect = csv.Sniffer().sniff(first_line) if first_line else csv.excel
    except Exception:
        dialect = csv.excel

    rows = list(csv.reader(io.StringIO(text), dialect))
    if not rows:
        return report  # empty file

    # Header handling (tolerant)
    first = rows[0]
    norm_first = [_clean_header_cell(c).lower() for c in first]
    expected_norm = [h.lower() for h in EXPECTED_HEADERS]
    header_ok = (norm_first == expected_norm)

    if header_ok:
        data_rows = rows[1:]
    else:
        # If the first row isn't a header but the column count matches, treat it as data
        if len(first) == len(EXPECTED_HEADERS):
            data_rows = rows  # treat all rows as data; we'll use the EXPECTED order
        else:
            # Try common alternate delimiters to recover
            for delim in (";", "\t"):
                rows2 = list(csv.reader(io.StringIO(text), delimiter=delim))
                if rows2 and len(rows2[0]) == len(EXPECTED_HEADERS):
                    rows = rows2
                    first = rows[0]
                    norm_first = [_clean_header_cell(c).lower() for c in first]
                    header_ok = (norm_first == expected_norm)
                    data_rows = rows[1:] if header_ok else rows
                    break
            else:
                # Could not reconcile columns; report the mismatch and bail out
                report["errors"].append(
                    f"Column mismatch: saw {len(first)} columns but expected {len(EXPECTED_HEADERS)}."
                )
                return report
report["header"] = mapped
# Read rows
rows = list(rdr)
report["total_rows"] = len(rows)
# Build row dicts
def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
"""
Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
but does not save to DB.
"""
if len(row) < len(mapped):
return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
values: Dict[str, Any] = {}
for i, field in enumerate(mapped):
raw_val = row[i] if i < len(row) else ""
# Coerce types for specific fields
if field in ("date_added", "date_edited"):
values[field] = _parse_date(raw_val)
elif field == "talk_number":
values[field] = _to_int_or_none(raw_val)
else:
values[field] = (raw_val or "").strip()
# Create (unsaved) Entry instance for preview/validation
e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
return e, values, None
# Preview first few
for i, row in enumerate(rows[:10], start=1):
e, values, err = row_to_obj(i, row)
report["preview"].append({
"row": i,
"values": values if values else {},
"error": err,
})
if dry_run:
# Dry run: dont write, just validate basic structure
bad = [p for p in report["preview"] if p["error"]]
if bad:
report["errors"].extend(p["error"] for p in bad if p["error"])
report["ok"] = len(report["errors"]) == 0
return report
# Real import (create new rows).
# If you want update/merge behavior, add a key strategy here.
created = 0
updated = 0
skipped = 0
errors: List[str] = []
with transaction.atomic():
for idx, row in enumerate(rows, start=1):
e, values, err = row_to_obj(idx, row)
if err:
errors.append(err)
skipped += 1
# Normalize rows length (pad/trim) and build preview (first 10)
normalized_rows: List[List[str]] = []
for r in data_rows:
if not r or all((c or "").strip() == "" for c in r):
continue
if len(r) < len(EXPECTED_HEADERS):
r = r + [""] * (len(EXPECTED_HEADERS) - len(r))
elif len(r) > len(EXPECTED_HEADERS):
r = r[:len(EXPECTED_HEADERS)]
normalized_rows.append(r)
report["total"] = len(normalized_rows)
report["preview"] = normalized_rows[:10] # show first 10 rows exactly as seen

    if dry_run or report["total"] == 0:
        return report  # preview only

    # Create entries in batches (transactional)
    to_create: List[Entry] = []
    for r in normalized_rows:
        try:
            obj = Entry(
                subject=(r[0] or "").strip(),
                illustration=(r[1] or "").strip(),
                application=(r[2] or "").strip(),
                scripture_raw=(r[3] or "").strip(),
                source=(r[4] or "").strip(),
                talk_title=(r[5] or "").strip(),
                talk_number=_parse_int(r[6]),
                entry_code=(r[7] or "").strip(),
                date_added=_parse_date(r[8]),
                date_edited=_parse_date(r[9]),
            )
            to_create.append(obj)
        except Exception as e:
            report["skipped"] += 1
            report["errors"].append(f"Row skipped due to error: {e}")

        # Flush a full batch inside its own transaction
        if len(to_create) >= batch_size:
            with transaction.atomic():
                Entry.objects.bulk_create(to_create, batch_size=batch_size)
            report["created"] += len(to_create)
            to_create.clear()

    # Flush whatever remains after the loop
    if to_create:
        with transaction.atomic():
            Entry.objects.bulk_create(to_create, batch_size=batch_size)
        report["created"] += len(to_create)
        to_create.clear()

    return report

# small context manager used above
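
As a quick reference for the call contract above, here is a dry-run sketch with an in-memory CSV; the sample row is invented purely for illustration and is not part of this commit.

# Sketch only: exercising the dry-run path with an in-memory CSV (no DB writes).
sample = (
    "Subject,Illustration,Application,Scripture,Source,"
    "Talk Title,Talk Number,Code,Date,Date Edited\n"
    "Faith,Anchor illustration,Hold fast,Heb 6:19,Personal notes,"
    "Sure Hope,12,A-12,2025-08-01,2025-08-02\n"
).encode("utf-8")

report = import_csv_bytes(sample, dry_run=True)
# report["total"] == 1, report["preview"] holds the one normalized row,
# and report["created"] stays 0 because nothing is written on a dry run.
# A second call with dry_run=False performs the batched bulk_create.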
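
The docstring leaves upserts as a later option, keyed on entry_code or (talk_number, entry_code). Below is a minimal sketch of that variant, assuming entry_code is a usable unique key; _upsert_rows is a hypothetical helper, and the field names simply mirror the Entry(...) call above.

# Sketch only: create-or-update keyed on entry_code (not part of this commit).
def _upsert_rows(normalized_rows: List[List[str]], report: Dict[str, Any]) -> None:
    for r in normalized_rows:
        code = (r[7] or "").strip()
        if not code:
            report["skipped"] += 1
            continue  # no stable key -> skip rather than guess
        defaults = {
            "subject": (r[0] or "").strip(),
            "illustration": (r[1] or "").strip(),
            "application": (r[2] or "").strip(),
            "scripture_raw": (r[3] or "").strip(),
            "source": (r[4] or "").strip(),
            "talk_title": (r[5] or "").strip(),
            "talk_number": _parse_int(r[6]),
            "date_added": _parse_date(r[8]),
            "date_edited": _parse_date(r[9]),
        }
        with transaction.atomic():
            _, created = Entry.objects.update_or_create(entry_code=code, defaults=defaults)
        report["created" if created else "updated"] += 1

Per-row update_or_create trades the throughput of bulk_create for idempotent re-imports; it could be batched the same way if volume matters.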