diff --git a/web/core/utils.py b/web/core/utils.py
index 284185d..d021332 100644
--- a/web/core/utils.py
+++ b/web/core/utils.py
@@ -11,6 +11,42 @@ from django.db import transaction, IntegrityError, DataError, DatabaseError
 from .models import Entry
+EXPECTED_HEADERS: List[str] = [
+    "Subject", "Illustration", "Application", "Scripture", "Source",
+    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
+]
+
+# Map CSV header labels -> Entry model field names
+HEADER_MAP: Dict[str, str] = {
+    "Subject": "subject",
+    "Illustration": "illustration",
+    "Application": "application",
+    "Scripture": "scripture_raw",
+    "Source": "source",
+    "Talk Title": "talk_title",
+    "Talk Number": "talk_number",
+    "Code": "entry_code",
+    "Date": "date_added",
+    "Date Edited": "date_edited",
+}
+
+# Accept both the pretty labels *and* the actual model field names
+# (lets you import older dumps or hand-made files)
+ACCEPTABLE_HEADERS: Dict[str, str] = {
+    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},
+    # direct model names also OK
+    "subject": "subject",
+    "illustration": "illustration",
+    "application": "application",
+    "scripture_raw": "scripture_raw",
+    "source": "source",
+    "talk_title": "talk_title",
+    "talk_number": "talk_number",
+    "entry_code": "entry_code",
+    "date_added": "date_added",
+    "date_edited": "date_edited",
+}
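+# e.g. ACCEPTABLE_HEADERS["talk title"] == "talk_title" and
+#      ACCEPTABLE_HEADERS["talk_title"] == "talk_title"  (both spellings resolve to the same field)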
- """ - text = _decode_bytes(b) - dialect = _sniff_dialect(text) - delimiter = getattr(dialect, "delimiter", ",") - - # --- headers --- - f = io.StringIO(text) - reader = csv.reader(f, dialect=dialect) - try: - raw_headers = next(reader) - except StopIteration: - return { - "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [], - "scripture_parsed": 0, "scripture_failed": 0, - "dialect_delimiter": delimiter, "used_headerless_mode": False, - "seen_headers": [] - } - - headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient( - ",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS - ) - header_map = _build_header_map(headers) - - # Pair raw lines so we can repair rows mis-split by csv - raw_lines = text.splitlines()[1:] # skip header - - dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect) - next(dict_reader, None) # skip header - - total = inserted = updated = skipped = 0 - errors: List[str] = [] - scripture_ok = scripture_bad = 0 - - # Import loop (row-by-row atomic) - for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2): - total += 1 - - # Repair if DictReader got the wrong shape (inconsistent quotes in source) - if len(row) != EXPECTED_COLS or None in row: - cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS) - row = dict(zip(headers, cells)) - - # Extract canonical fields - subject = _getv(row, header_map, "subject").strip() - illustration = _getv(row, header_map, "illustration").strip() - application = _getv(row, header_map, "application").strip() - scripture = _getv(row, header_map, "scripture").strip() - source = _getv(row, header_map, "source").strip() - talk_title = _getv(row, header_map, "talk title").strip() - talk_number = _coerce_int(_getv(row, header_map, "talk number")) - entry_code = _getv(row, header_map, "code").strip() - date_added = _parse_date(_getv(row, header_map, "date")) - date_edited = _parse_date(_getv(row, header_map, "date edited")) - - # Skip rows with no meaningful text - if not (subject or illustration or application): - skipped += 1 - continue - - # Clip to DB lengths - subject = _clip("subject", subject) - illustration = _clip("illustration", illustration) - application = _clip("application", application) - scripture = _clip("scripture_raw", scripture) - source = _clip("source", source) - talk_title = _clip("talk_title", talk_title) - entry_code = _clip("entry_code", entry_code) - - scripture_ok += 1 if scripture else 0 - scripture_bad += 0 if scripture else 1 - - # Upsert key: prefer entry_code; else (subject + illustration) - lookup: Dict[str, object] = {} - if entry_code: - lookup["entry_code"] = entry_code - else: - lookup["subject"] = subject - lookup["illustration"] = illustration - - if dry_run: - exists = Entry.objects.filter(**lookup).exists() - inserted += 0 if exists else 1 - updated += 1 if exists else 0 - continue - - try: - # Isolate each row so a failure rolls back only that row - with transaction.atomic(): - obj = Entry.objects.filter(**lookup).first() - created = False - if not obj: - obj = Entry(**lookup) - created = True - - obj.subject = subject - obj.illustration = illustration - obj.application = application - obj.scripture_raw = scripture - obj.source = source - obj.talk_title = talk_title - obj.talk_number = talk_number - if entry_code: - obj.entry_code = entry_code - if date_added: - obj.date_added = date_added - if date_edited: - obj.date_edited = date_edited - - obj.save() - - inserted += 1 if created 
-                updated += 0 if created else 1
-
-        except (IntegrityError, DataError, DatabaseError, ValueError) as e:
-            msg = str(e).splitlines()[0]
-            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
-            skipped += 1
-            # continue to next row
-
-    return {
-        "rows": total,
-        "inserted": inserted,
-        "updated": updated,
-        "skipped": skipped,
-        "errors": errors,
-        "scripture_parsed": scripture_ok,
-        "scripture_failed": scripture_bad,
-        "dialect_delimiter": delimiter,
-        "used_headerless_mode": False,
-        "seen_headers": headers,
-    }
-
-
-EXPECTED_HEADERS = [
-    "Subject","Illustration","Application","Scripture","Source",
-    "Talk Title","Talk Number","Code","Date","Date Edited"
-]
-
 def _to_int_or_none(s: str) -> Optional[int]:
     s = (s or "").strip()
     if not s:
@@ -378,96 +271,224 @@ def _to_date_or_none(s: str) -> Optional[datetime.date]:
         pass
     return None  # let caller decide if this is acceptable
 
-def import_csv_bytes(data: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
+def _clean_header_token(s: Any) -> str:
     """
-    Robust CSV importer for Entries.
-    - data: raw bytes of the uploaded file
-    - dry_run: when True, do not write to DB; return preview + errors
-    - batch_size: bulk_create chunk size
-    Returns: dict(report=..., rows=preview_rows, errors=[...])
+    Make a header token safe/normalized:
+    - None -> ""
+    - trim spaces
+    - strip surrounding single/double quotes
+    - drop weird prefixes like r:"Talk Title" or r.'Talk Title'
+    - lowercase for matching
     """
-    text = io.TextIOWrapper(io.BytesIO(data), encoding="utf-8-sig", newline="")
-    reader = csv.reader(text)
+    s = "" if s is None else str(s)
+    s = s.strip()
+    # drop r: or r. prefix some CSV tools add
+    if s[:2].lower() in ("r:", "r."):
+        s = s[2:].lstrip()
+    # strip surrounding quotes (after the prefix drop, so r."Talk Title" loses them too)
+    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
+        s = s[1:-1]
+    return s.strip().lower()
 
-    # Read header row
+
+_DATE_FORMATS = (
+    "%Y-%m-%d",
+    "%m/%d/%Y",
+    "%m/%d/%y",
+    "%d-%b-%Y",  # 05-Sep-2024
+    "%Y/%m/%d",
+)
+
+def _parse_date(val: str) -> Optional[datetime.date]:
+    if not val:
+        return None
+    txt = str(val).strip()
+    # Accept ISO-like with time: 2024-01-02T00:00:00
+    if "T" in txt:
+        try:
+            return datetime.fromisoformat(txt).date()
+        except Exception:
+            pass
+    for fmt in _DATE_FORMATS:
+        try:
+            return datetime.strptime(txt, fmt).date()
+        except Exception:
+            continue
+    # as a last resort, try only year-month-day pieces
     try:
-        header = next(reader)
-    except StopIteration:
-        return {"report": "Empty file.", "rows": [], "errors": ["File is empty."]}
+        parts = [int(p) for p in txt.replace("/", "-").split("-")]
+        if len(parts) >= 3:
+            return datetime(parts[0], parts[1], parts[2]).date()
+    except Exception:
+        pass
+    return None
 
-    # Loose header check: either exact match, or map by index if close
-    header_norm = [h.strip() for h in header]
-    if header_norm != EXPECTED_HEADERS:
-        return {
-            "report": "Header mismatch.",
-            "rows": [],
-            "errors": [
-                "Expected header: " + ", ".join(EXPECTED_HEADERS),
-                "Found header: " + ", ".join(header_norm),
-            ],
-        }
-    to_create: List[Entry] = []
+def _to_int_or_none(v: Any) -> Optional[int]:
+    if v is None:
+        return None
+    s = str(v).strip()
+    if s == "":
+        return None
+    try:
+        return int(float(s))  # tolerate "123.0"
+    except Exception:
+        return None
+
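+# Illustrative expectations for the helpers above (examples only; they assume
+# `datetime` in this module refers to the datetime.datetime class):
+#   _clean_header_token('r."Talk Title"')  -> "talk title"
+#   _parse_date("05-Sep-2024")             -> datetime.date(2024, 9, 5)
+#   _to_int_or_none("123.0")               -> 123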
+
+def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
+    """
+    Robust CSV importer for Entry.
+
+    - Accepts your human-readable header (Subject, Illustration, ...)
+      and/or direct model field names.
+    - Normalizes odd headers like r."Talk Title".
+    - Handles BOM & dialect sniffing.
+    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
+    """
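+    # Rough usage sketch (illustrative only; "uploaded" is a hypothetical bytes
+    # payload, e.g. request.FILES["csv_file"].read() in a view):
+    #   preview = import_csv_bytes(uploaded, dry_run=True)
+    #   if preview["ok"]:
+    #       result = import_csv_bytes(uploaded, dry_run=False)
+    #       # result["created"], result["skipped"], result["errors"], ...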
+    report: Dict[str, Any] = {
+        "ok": False,
+        "created": 0,
+        "updated": 0,
+        "skipped": 0,
+        "errors": [],   # list[str]
+        "preview": [],  # first ~10 rows that would be imported
+        "total_rows": 0,
+        "header": [],
+    }
+
+    # --- decode safely (strip BOM; replace undecodable bytes) ---
+    text = data.decode("utf-8-sig", errors="replace")
+
+    # --- sniff dialect; fall back to excel ---
+    try:
+        sample = "\n".join(text.splitlines()[:10])
+        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
+    except Exception:
+        dialect = csv.excel
+
+    rdr = csv.reader(io.StringIO(text), dialect)
+
+    try:
+        raw_header = next(rdr, [])
+    except Exception as e:
+        report["errors"].append(f"Failed reading header: {e}")
+        return report
+
+    # Clean & map header
+    cleaned = [_clean_header_token(h) for h in raw_header]
+    mapped: List[Optional[str]] = []
+    unknowns: List[str] = []
+    for token in cleaned:
+        target = ACCEPTABLE_HEADERS.get(token)
+        if target:
+            mapped.append(target)
+        else:
+            # keep a placeholder so later columns stay aligned
+            mapped.append(None)
+            unknowns.append(token or "(empty)")
+
+    # If none of the header tokens map but the column count matches the expected
+    # width, assume the file has *no* header row and inject the expected one.
+    has_header = True
+    if unknowns:
+        # Heuristic: if the number of columns equals len(EXPECTED_HEADERS) and *none*
+        # of the cleaned tokens map, the first row is probably data (no header)
+        matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS)
+        if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS):
+            # inject expected header and re-read from the start
+            has_header = False
+            mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS]
+            # rebuild a reader with the expected header injected
+            sio = io.StringIO(text)
+            rdr_tmp = csv.reader(sio, dialect)
+            rows = list(rdr_tmp)
+            rows.insert(0, EXPECTED_HEADERS)  # inject pretty header for report
+            rdr = iter(rows)   # consume from this list iterator
+            next(rdr, None)    # skip our injected header
+        else:
+            # keep going but warn in the report
+            report["errors"].append(
+                "Some header columns were not recognized: "
+                + ", ".join(unknowns)
+                + " (continuing with best-effort mapping)"
+            )
+
+    report["header"] = mapped
+
+    # Read rows
+    rows = list(rdr)
+    report["total_rows"] = len(rows)
+
+    # Build row dicts
+    def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
+        """
+        Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
+        but does not save to the DB.
+        """
+        if len(row) < len(mapped):
+            return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
+        values: Dict[str, Any] = {}
+        for i, field in enumerate(mapped):
+            if field is None:
+                continue  # unrecognized column; ignore it
+            raw_val = row[i] if i < len(row) else ""
+            # Coerce types for specific fields
+            if field in ("date_added", "date_edited"):
+                values[field] = _parse_date(raw_val)
+            elif field == "talk_number":
+                values[field] = _to_int_or_none(raw_val)
+            else:
+                values[field] = (raw_val or "").strip()
+
+        # Create (unsaved) Entry instance for preview/validation
+        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
+        return e, values, None
+
+    # Preview first few
+    for i, row in enumerate(rows[:10], start=1):
+        e, values, err = row_to_obj(i, row)
+        report["preview"].append({
+            "row": i,
+            "values": values if values else {},
+            "error": err,
+        })
+
+    if dry_run:
+        # Dry run: don't write, just validate basic structure
+        bad = [p for p in report["preview"] if p["error"]]
+        if bad:
+            report["errors"].extend(p["error"] for p in bad if p["error"])
+        report["ok"] = len(report["errors"]) == 0
+        return report
+
+    # Real import (create new rows).
+    # If you want update/merge behavior, add a key strategy here (see sketch below).
+    created = 0
+    updated = 0
+    skipped = 0
     errors: List[str] = []
-    preview: List[Tuple[int, Dict[str, Any]]] = []  # first 100 rows for the UI
-    rownum = 1
-    def make_entry(row: List[str]) -> Optional[Entry]:
-        # force length to 10, padding if needed
-        padded = row + [""] * (10 - len(row))
-        subj, ill, app, scr, src, talk_title, talk_num, code, d_added, d_edited = padded[:10]
+    with transaction.atomic():
+        for idx, row in enumerate(rows, start=1):
+            e, values, err = row_to_obj(idx, row)
+            if err:
+                errors.append(err)
+                skipped += 1
+                continue
-        e = Entry(
-            subject=(subj or "").strip(),
-            illustration=(ill or "").strip(),
-            application=(app or "").strip(),
-            scripture_raw=(scr or "").strip(),
-            source=(src or "").strip(),
-            talk_title=(talk_title or "").strip(),
-            talk_number=_to_int_or_none(talk_num),
-            entry_code=(code or "").strip(),
-            date_added=_to_date_or_none(d_added),
-            date_edited=_to_date_or_none(d_edited),
-        )
-        return e
-
-    created_total = 0
-    with (transaction.atomic() if not dry_run else _noop_context()):
-        for row in reader:
-            rownum += 1
             try:
-                e = make_entry(row)
-                # (optional) add required-field checks; e.g., at least one of illustration/application
-                if not ((e.illustration and e.illustration.strip()) or (e.application and e.application.strip())):
-                    errors.append(f"Row {rownum}: missing Illustration and Application")
-                    continue
-
-                to_create.append(e)
-
-                if len(preview) < 100:
-                    preview.append((rownum, {
-                        "Subject": e.subject, "Illustration": e.illustration[:120],
-                        "Application": e.application[:120], "Scripture": e.scripture_raw,
-                        "Source": e.source, "Talk Title": e.talk_title,
-                        "Talk Number": e.talk_number, "Code": e.entry_code,
-                        "Date": e.date_added, "Date Edited": e.date_edited,
-                    }))
-
-                if not dry_run and len(to_create) >= batch_size:
-                    Entry.objects.bulk_create(to_create, batch_size=batch_size)
-                    created_total += len(to_create)
-                    to_create.clear()
-
+                # Simple create-only behavior:
+                Entry.objects.create(**values)
+                created += 1
             except Exception as ex:
-                errors.append(f"Row {rownum}: {ex}")
+                errors.append(f"Row {idx}: failed to save ({ex})")
+                skipped += 1
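+            # Note: this path is create-only, so "updated" stays 0. A possible
+            # upsert variant (a sketch, assuming entry_code can serve as a
+            # natural key) would swap the create() above for:
+            #   Entry.objects.update_or_create(
+            #       entry_code=values.get("entry_code"), defaults=values)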
- return {"report": report, "rows": preview, "errors": errors} + report.update({ + "ok": len(errors) == 0, + "created": created, + "updated": updated, + "skipped": skipped, + "errors": errors, + }) + return report # small context manager used above class _noop_context: