Update web/core/utils.py
parent f64d41313e
commit a1b174adc3
@@ -11,6 +11,42 @@ from django.db import transaction, IntegrityError, DataError, DatabaseError
 from .models import Entry
 
+EXPECTED_HEADERS: List[str] = [
+    "Subject", "Illustration", "Application", "Scripture", "Source",
+    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
+]
+
+# Map CSV header labels -> Entry model field names
+HEADER_MAP: Dict[str, str] = {
+    "Subject": "subject",
+    "Illustration": "illustration",
+    "Application": "application",
+    "Scripture": "scripture_raw",
+    "Source": "source",
+    "Talk Title": "talk_title",
+    "Talk Number": "talk_number",
+    "Code": "entry_code",
+    "Date": "date_added",
+    "Date Edited": "date_edited",
+}
+
+# Accept both the pretty labels *and* the actual model field names
+# (lets you import older dumps or hand-made files)
+ACCEPTABLE_HEADERS: Dict[str, str] = {
+    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},
+    # direct model names also OK
+    "subject": "subject",
+    "illustration": "illustration",
+    "application": "application",
+    "scripture_raw": "scripture_raw",
+    "source": "source",
+    "talk_title": "talk_title",
+    "talk_number": "talk_number",
+    "entry_code": "entry_code",
+    "date_added": "date_added",
+    "date_edited": "date_edited",
+}
+
+
 # ============================
 # Search helpers (used by views)
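
How the two tables above compose, as a quick sketch for review (not part of the commit; dicts trimmed to two fields):

HEADER_MAP = {"Subject": "subject", "Talk Title": "talk_title"}  # trimmed
EXPECTED_HEADERS = list(HEADER_MAP)
ACCEPTABLE_HEADERS = {
    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},  # pretty labels, lowered
    "subject": "subject", "talk_title": "talk_title",        # direct field names
}
assert ACCEPTABLE_HEADERS["talk title"] == "talk_title"  # from "Talk Title"
assert ACCEPTABLE_HEADERS["talk_title"] == "talk_title"  # raw model name
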
@@ -215,149 +251,6 @@ def _coerce_int(val: str):
     return None
 
 
-def import_csv_bytes(b: bytes, dry_run: bool = False, commit_every: int = 500) -> Dict[str, object]:
-    """
-    Robust CSV import. Commits each row in its own transaction so that one bad
-    row does not poison the entire import (avoids TransactionManagementError cascades).
-
-    Returns a report dict with counts and first-line error messages.
-    """
-    text = _decode_bytes(b)
-    dialect = _sniff_dialect(text)
-    delimiter = getattr(dialect, "delimiter", ",")
-
-    # --- headers ---
-    f = io.StringIO(text)
-    reader = csv.reader(f, dialect=dialect)
-    try:
-        raw_headers = next(reader)
-    except StopIteration:
-        return {
-            "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
-            "scripture_parsed": 0, "scripture_failed": 0,
-            "dialect_delimiter": delimiter, "used_headerless_mode": False,
-            "seen_headers": []
-        }
-
-    headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient(
-        ",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
-    )
-    header_map = _build_header_map(headers)
-
-    # Pair raw lines so we can repair rows mis-split by csv
-    raw_lines = text.splitlines()[1:]  # skip header
-
-    dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect)
-    next(dict_reader, None)  # skip header
-
-    total = inserted = updated = skipped = 0
-    errors: List[str] = []
-    scripture_ok = scripture_bad = 0
-
-    # Import loop (row-by-row atomic)
-    for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2):
-        total += 1
-
-        # Repair if DictReader got the wrong shape (inconsistent quotes in source)
-        if len(row) != EXPECTED_COLS or None in row:
-            cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS)
-            row = dict(zip(headers, cells))
-
-        # Extract canonical fields
-        subject = _getv(row, header_map, "subject").strip()
-        illustration = _getv(row, header_map, "illustration").strip()
-        application = _getv(row, header_map, "application").strip()
-        scripture = _getv(row, header_map, "scripture").strip()
-        source = _getv(row, header_map, "source").strip()
-        talk_title = _getv(row, header_map, "talk title").strip()
-        talk_number = _coerce_int(_getv(row, header_map, "talk number"))
-        entry_code = _getv(row, header_map, "code").strip()
-        date_added = _parse_date(_getv(row, header_map, "date"))
-        date_edited = _parse_date(_getv(row, header_map, "date edited"))
-
-        # Skip rows with no meaningful text
-        if not (subject or illustration or application):
-            skipped += 1
-            continue
-
-        # Clip to DB lengths
-        subject = _clip("subject", subject)
-        illustration = _clip("illustration", illustration)
-        application = _clip("application", application)
-        scripture = _clip("scripture_raw", scripture)
-        source = _clip("source", source)
-        talk_title = _clip("talk_title", talk_title)
-        entry_code = _clip("entry_code", entry_code)
-
-        scripture_ok += 1 if scripture else 0
-        scripture_bad += 0 if scripture else 1
-
-        # Upsert key: prefer entry_code; else (subject + illustration)
-        lookup: Dict[str, object] = {}
-        if entry_code:
-            lookup["entry_code"] = entry_code
-        else:
-            lookup["subject"] = subject
-            lookup["illustration"] = illustration
-
-        if dry_run:
-            exists = Entry.objects.filter(**lookup).exists()
-            inserted += 0 if exists else 1
-            updated += 1 if exists else 0
-            continue
-
-        try:
-            # Isolate each row so a failure rolls back only that row
-            with transaction.atomic():
-                obj = Entry.objects.filter(**lookup).first()
-                created = False
-                if not obj:
-                    obj = Entry(**lookup)
-                    created = True
-
-                obj.subject = subject
-                obj.illustration = illustration
-                obj.application = application
-                obj.scripture_raw = scripture
-                obj.source = source
-                obj.talk_title = talk_title
-                obj.talk_number = talk_number
-                if entry_code:
-                    obj.entry_code = entry_code
-                if date_added:
-                    obj.date_added = date_added
-                if date_edited:
-                    obj.date_edited = date_edited
-
-                obj.save()
-
-            inserted += 1 if created else 0
-            updated += 0 if created else 1
-
-        except (IntegrityError, DataError, DatabaseError, ValueError) as e:
-            msg = str(e).splitlines()[0]
-            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
-            skipped += 1
-            # continue to next row
-
-    return {
-        "rows": total,
-        "inserted": inserted,
-        "updated": updated,
-        "skipped": skipped,
-        "errors": errors,
-        "scripture_parsed": scripture_ok,
-        "scripture_failed": scripture_bad,
-        "dialect_delimiter": delimiter,
-        "used_headerless_mode": False,
-        "seen_headers": headers,
-    }
-
-EXPECTED_HEADERS = [
-    "Subject","Illustration","Application","Scripture","Source",
-    "Talk Title","Talk Number","Code","Date","Date Edited"
-]
-
 def _to_int_or_none(s: str) -> Optional[int]:
     s = (s or "").strip()
     if not s:
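
The importer deleted above leaned on per-row transactions, and that pattern is worth keeping in mind for bulk imports in Django. A minimal sketch of the idiom (save_rows is hypothetical; Entry and a unique entry_code key are assumptions taken from this file):

from django.db import IntegrityError, transaction
from .models import Entry

def save_rows(rows):
    """Per-row savepoints: a bad row rolls back alone, so the outer
    transaction never hits the TransactionManagementError cascade."""
    errors = []
    for lineno, values in enumerate(rows, start=2):  # line 1 is the header
        try:
            with transaction.atomic():  # a savepoint when already inside a transaction
                Entry.objects.update_or_create(
                    entry_code=values["entry_code"], defaults=values,
                )
        except IntegrityError as e:
            errors.append(f"line {lineno}: {e}")  # record and continue
    return errors
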
@@ -378,96 +271,224 @@ def _to_date_or_none(s: str) -> Optional[datetime.date]:
             pass
     return None  # let caller decide if this is acceptable
 
-def import_csv_bytes(data: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
+def _clean_header_token(s: Any) -> str:
     """
-    Robust CSV importer for Entries.
-    - data: raw bytes of the uploaded file
-    - dry_run: when True, do not write to DB; return preview + errors
-    - batch_size: bulk_create chunk size
-    Returns: dict(report=..., rows=preview_rows, errors=[...])
+    Make a header token safe/normalized:
+    - None -> ""
+    - trim spaces
+    - strip surrounding single/double quotes
+    - drop weird prefixes like r:"Talk Title" or r.'Talk Title'
+    - lowercase for matching
     """
-    text = io.TextIOWrapper(io.BytesIO(data), encoding="utf-8-sig", newline="")
-    reader = csv.reader(text)
+    s = "" if s is None else str(s)
+    s = s.strip()
+    # strip surrounding quotes
+    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
+        s = s[1:-1]
+    # drop r: or r. prefix some CSV tools add
+    if s[:2].lower() in ("r:", "r."):
+        s = s[2:].lstrip()
+    return s.strip().lower()
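
A few concrete inputs and what the normalizer turns them into (illustrative, not in the commit):

# Assuming _clean_header_token as defined above:
assert _clean_header_token(None) == ""
assert _clean_header_token('  "Talk Title"  ') == "talk title"
assert _clean_header_token("r:Talk Title") == "talk title"
# Note the order: quotes are stripped before the prefix, so the combined
# form r:"Talk Title" comes out as '"talk title"' and will not match.
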
 
-    # Read header row
+
+_DATE_FORMATS = (
+    "%Y-%m-%d",
+    "%m/%d/%Y",
+    "%m/%d/%y",
+    "%d-%b-%Y",  # 05-Sep-2024
+    "%Y/%m/%d",
+)
+
+
+def _parse_date(val: str) -> Optional[datetime.date]:
+    if not val:
+        return None
+    txt = str(val).strip()
+    # Accept ISO-like with time: 2024-01-02T00:00:00
+    if "T" in txt:
+        try:
+            return datetime.fromisoformat(txt).date()
+        except Exception:
+            pass
+    for fmt in _DATE_FORMATS:
+        try:
+            return datetime.strptime(txt, fmt).date()
+        except Exception:
+            continue
+    # as a last resort, try only year-month-day pieces
     try:
-        header = next(reader)
-    except StopIteration:
-        return {"report": "Empty file.", "rows": [], "errors": ["File is empty."]}
+        parts = [int(p) for p in txt.replace("/", "-").split("-")]
+        if len(parts) >= 3:
+            return datetime(parts[0], parts[1], parts[2]).date()
+    except Exception:
+        pass
+    return None
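
Roughly what the fallback chain accepts, as a sketch (assuming _parse_date as defined above):

from datetime import date
assert _parse_date("2024-01-02T00:00:00") == date(2024, 1, 2)  # ISO with time
assert _parse_date("09/05/2024") == date(2024, 9, 5)           # "%m/%d/%Y"
assert _parse_date("05-Sep-2024") == date(2024, 9, 5)          # "%d-%b-%Y"
assert _parse_date("nonsense") is None                         # every branch failed
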
 
-    # Loose header check: either exact match, or map by index if close
-    header_norm = [h.strip() for h in header]
-    if header_norm != EXPECTED_HEADERS:
-        return {
-            "report": "Header mismatch.",
-            "rows": [],
-            "errors": [
-                "Expected header: " + ", ".join(EXPECTED_HEADERS),
-                "Found header: " + ", ".join(header_norm),
-            ],
-        }
-
-    to_create: List[Entry] = []
+def _to_int_or_none(v: Any) -> Optional[int]:
+    if v is None:
+        return None
+    s = str(v).strip()
+    if s == "":
+        return None
+    try:
+        return int(float(s))  # tolerate "123.0"
+    except Exception:
+        return None
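
The coercion in practice (illustrative, assuming _to_int_or_none as defined above):

assert _to_int_or_none(" 42 ") == 42
assert _to_int_or_none("123.0") == 123   # int(float(...)) tolerates spreadsheet floats
assert _to_int_or_none("") is None
assert _to_int_or_none("12a") is None    # unparseable -> None, not an exception
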
 
 
+def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
+    """
+    Robust CSV importer for Entry.
+
+    - Accepts your human-readable header (Subject, Illustration, ...)
+      and/or direct model field names.
+    - Normalizes odd headers like r."Talk Title".
+    - Handles BOM & dialect sniffing.
+    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
+    """
+    report: Dict[str, Any] = {
+        "ok": False,
+        "created": 0,
+        "updated": 0,
+        "skipped": 0,
+        "errors": [],   # list[str]
+        "preview": [],  # first ~10 rows that would be imported
+        "total_rows": 0,
+        "header": [],
+    }
+
+    # --- decode safely (remove BOM, keep unknowns) ---
+    text = data.decode("utf-8-sig", errors="replace")
+
+    # --- sniff dialect; fall back to excel ---
+    try:
+        sample = "\n".join(text.splitlines()[:10])
+        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
+    except Exception:
+        dialect = csv.excel
+
+    rdr = csv.reader(io.StringIO(text), dialect)
+
+    try:
+        raw_header = next(rdr, [])
+    except Exception as e:
+        report["errors"].append(f"Failed reading header: {e}")
+        return report
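
The decode-and-sniff combination handles BOM-prefixed Excel exports and odd delimiters; a standalone sketch of the same approach using only the stdlib (open_csv is a hypothetical helper, not in the commit):

import csv, io

def open_csv(data: bytes):
    """Decode with BOM stripping, sniff the delimiter, fall back to excel."""
    text = data.decode("utf-8-sig", errors="replace")
    sample = "\n".join(text.splitlines()[:10])
    try:
        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
    except csv.Error:
        dialect = csv.excel  # Sniffer gives up on e.g. single-column files
    return csv.reader(io.StringIO(text), dialect)

rows = list(open_csv(b"\xef\xbb\xbfSubject;Illustration\na;b\n"))
assert rows == [["Subject", "Illustration"], ["a", "b"]]
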
# Clean & map header
|
||||
cleaned = [_clean_header_token(h) for h in raw_header]
|
||||
mapped: List[str] = []
|
||||
unknowns: List[str] = []
|
||||
for token in cleaned:
|
||||
target = ACCEPTABLE_HEADERS.get(token)
|
||||
if target:
|
||||
mapped.append(target)
|
||||
else:
|
||||
unknowns.append(token or "(empty)")
|
||||
|
||||
# If header doesn't match expected width but row count does, assume *no* header;
|
||||
# inject expected header so downstream works.
|
||||
has_header = True
|
||||
if unknowns:
|
||||
# Heuristic: if the number of columns equals EXPECTED_HEADERS and *none*
|
||||
# of the cleaned tokens map, it's probably a data row (no header)
|
||||
matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS)
|
||||
if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS):
|
||||
# inject expected header and re-run
|
||||
has_header = False
|
||||
mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS]
|
||||
# rebuild a reader with the expected header injected
|
||||
sio = io.StringIO(text)
|
||||
rdr_tmp = csv.reader(sio, dialect)
|
||||
rows = list(rdr_tmp)
|
||||
rows.insert(0, EXPECTED_HEADERS) # inject pretty header for report
|
||||
rdr = iter(rows) # consume from this list iterator
|
||||
next(rdr, None) # skip our injected header
|
||||
else:
|
||||
# keep going but warn in the report
|
||||
report["errors"].append(
|
||||
"Some header columns were not recognized: "
|
||||
+ ", ".join(unknowns)
|
||||
+ " (continuing with best-effort mapping)"
|
||||
)
|
||||
|
||||
report["header"] = mapped
|
||||
|
||||
# Read rows
|
||||
rows = list(rdr)
|
||||
report["total_rows"] = len(rows)
|
||||
|
||||
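
The no-header heuristic in isolation (a sketch; tables trimmed to two columns):

EXPECTED = ["Subject", "Illustration"]
ACCEPTABLE = {"subject": "subject", "illustration": "illustration"}

def looks_headerless(first_row):
    cleaned = [c.strip().lower() for c in first_row]
    matches = sum(1 for t in cleaned if t in ACCEPTABLE)
    return matches == 0 and len(cleaned) == len(EXPECTED)

assert looks_headerless(["On patience", "Pine trees grow slowly"])  # data row
assert not looks_headerless(["Subject", "Illustration"])            # real header
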
+    # Build row dicts
+    def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
+        """
+        Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
+        but does not save to DB.
+        """
+        if len(row) < len(mapped):
+            return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
+        values: Dict[str, Any] = {}
+        for i, field in enumerate(mapped):
+            raw_val = row[i] if i < len(row) else ""
+            # Coerce types for specific fields
+            if field in ("date_added", "date_edited"):
+                values[field] = _parse_date(raw_val)
+            elif field == "talk_number":
+                values[field] = _to_int_or_none(raw_val)
+            else:
+                values[field] = (raw_val or "").strip()
+
+        # Create (unsaved) Entry instance for preview/validation
+        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
+        return e, values, None
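
The per-field dispatch row_to_obj applies, reduced to a standalone sketch (coerce is hypothetical; assumes _parse_date and _to_int_or_none as defined above):

def coerce(field: str, raw: str):
    if field in ("date_added", "date_edited"):
        return _parse_date(raw)      # date or None
    if field == "talk_number":
        return _to_int_or_none(raw)  # int or None
    return (raw or "").strip()       # everything else stays text

assert coerce("talk_number", "12.0") == 12
assert coerce("subject", "  Patience ") == "Patience"
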
+    # Preview first few
+    for i, row in enumerate(rows[:10], start=1):
+        e, values, err = row_to_obj(i, row)
+        report["preview"].append({
+            "row": i,
+            "values": values if values else {},
+            "error": err,
+        })
+
+    if dry_run:
+        # Dry run: don't write, just validate basic structure
+        bad = [p for p in report["preview"] if p["error"]]
+        if bad:
+            report["errors"].extend(p["error"] for p in bad if p["error"])
+        report["ok"] = len(report["errors"]) == 0
+        return report
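
For callers, the dry-run return value has this shape (field names from the code above; the values here are made up):

report = {
    "ok": True, "created": 0, "updated": 0, "skipped": 0,
    "errors": [],
    "preview": [{"row": 1, "values": {"subject": "Patience"}, "error": None}],
    "total_rows": 1,
    "header": ["subject", "illustration", "application"],
}
assert report["ok"] and not report["errors"]
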
+    # Real import (create new rows).
+    # If you want update/merge behavior, add a key strategy here.
+    created = 0
+    updated = 0
+    skipped = 0
+    errors: List[str] = []
-    preview: List[Tuple[int, Dict[str, Any]]] = []  # first 100 rows for the UI
-    rownum = 1
 
-    def make_entry(row: List[str]) -> Optional[Entry]:
-        # force length to 10, padding if needed
-        padded = row + [""] * (10 - len(row))
-        subj, ill, app, scr, src, talk_title, talk_num, code, d_added, d_edited = padded[:10]
+    with transaction.atomic():
+        for idx, row in enumerate(rows, start=1):
+            e, values, err = row_to_obj(idx, row)
+            if err:
+                errors.append(err)
+                skipped += 1
+                continue
 
-        e = Entry(
-            subject=(subj or "").strip(),
-            illustration=(ill or "").strip(),
-            application=(app or "").strip(),
-            scripture_raw=(scr or "").strip(),
-            source=(src or "").strip(),
-            talk_title=(talk_title or "").strip(),
-            talk_number=_to_int_or_none(talk_num),
-            entry_code=(code or "").strip(),
-            date_added=_to_date_or_none(d_added),
-            date_edited=_to_date_or_none(d_edited),
-        )
-        return e
-
-    created_total = 0
-    with (transaction.atomic() if not dry_run else _noop_context()):
-        for row in reader:
-            rownum += 1
             try:
-                e = make_entry(row)
-                # (optional) add required-field checks; e.g., at least one of illustration/application
-                if not ((e.illustration and e.illustration.strip()) or (e.application and e.application.strip())):
-                    errors.append(f"Row {rownum}: missing Illustration and Application")
-                    continue
-
-                to_create.append(e)
-
-                if len(preview) < 100:
-                    preview.append((rownum, {
-                        "Subject": e.subject, "Illustration": e.illustration[:120],
-                        "Application": e.application[:120], "Scripture": e.scripture_raw,
-                        "Source": e.source, "Talk Title": e.talk_title,
-                        "Talk Number": e.talk_number, "Code": e.entry_code,
-                        "Date": e.date_added, "Date Edited": e.date_edited,
-                    }))
-
-                if not dry_run and len(to_create) >= batch_size:
-                    Entry.objects.bulk_create(to_create, batch_size=batch_size)
-                    created_total += len(to_create)
-                    to_create.clear()
-
+                # Simple create-only behavior:
+                Entry.objects.create(**values)
+                created += 1
             except Exception as ex:
-                errors.append(f"Row {rownum}: {ex}")
+                errors.append(f"Row {idx}: failed to save ({ex})")
+                skipped += 1
 
-    if not dry_run and to_create:
-        Entry.objects.bulk_create(to_create, batch_size=batch_size)
-        created_total += len(to_create)
-        to_create.clear()
-
-    report = f"{'Would import' if dry_run else 'Imported'} {created_total if not dry_run else len(preview)}+ rows."
-    return {"report": report, "rows": preview, "errors": errors}
+    report.update({
+        "ok": len(errors) == 0,
+        "created": created,
+        "updated": updated,
+        "skipped": skipped,
+        "errors": errors,
+    })
+    return report
 
-# small context manager used above
-class _noop_context:
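
The deleted branch's bulk_create batching is still the faster path if create-only imports grow large; a minimal sketch of that idiom (bulk_import is hypothetical; Entry import assumed app-relative as in this module):

from .models import Entry

def bulk_import(entries, batch_size=1000):
    """Accumulate unsaved Entry instances and flush in chunks."""
    buf, created = [], 0
    for e in entries:
        buf.append(e)
        if len(buf) >= batch_size:
            Entry.objects.bulk_create(buf, batch_size=batch_size)
            created += len(buf)
            buf.clear()
    if buf:  # final partial batch
        Entry.objects.bulk_create(buf, batch_size=batch_size)
        created += len(buf)
    return created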