Update web/core/utils.py

still trying to fix broken importer
2025-08-13 15:33:01 +00:00
parent bf084fc13c
commit ed4b4a8f62
1 changed files with 295 additions and 197 deletions
@@ -4,263 +4,361 @@ from __future__ import annotations
 import csv
 import io
 import re
 import unicodedata
 from datetime import datetime
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 from django.db import transaction
-from core.models import Entry
+from django.db.models import Model
 from .models import Entry
-# =============================================================================
+# ============================
 # Search helpers (used by views)
-# =============================================================================
+# ============================
 _WORD_RE = re.compile(r"[^\s]+")
 def terms(q: str) -> List[str]:
     """Split a search query into terms, keeping quoted phrases together.

     Unquoted text is split on whitespace. A double quote toggles phrase
     mode, in which whitespace stays part of the current term; the quote
     characters themselves never end up in a term. An unterminated quote
     runs to the end of the query. Every term is stripped, and terms that
     are empty after stripping (for example a quoted blank) are dropped.

     Args:
         q: Raw query string; ``None`` or an empty string yields ``[]``.

     Returns:
         The terms in the order they appear in *q*.
     """
     if not q:
         return []

     out: List[str] = []
     buf: List[str] = []
     in_quote = False
     for ch in q:
         if ch == '"':
             in_quote = not in_quote
         elif ch.isspace() and not in_quote:
             # Whitespace outside quotes ends the current term.
             term = "".join(buf).strip()
             if term:
                 out.append(term)
             buf = []
         else:
             buf.append(ch)

     # Flush whatever is left, including an unterminated quoted phrase.
     term = "".join(buf).strip()
     if term:
         out.append(term)
     return out
 def has_wildcards(s: str) -> bool:
     """Tell whether *s* contains a ``*`` or ``?`` wildcard character."""
     if not s:
         return False
     return any(mark in s for mark in ("*", "?"))
-def has_wildcards(s: Optional[str]) -> bool:
+def wildcard_to_regex(s: str) -> str:
-    """True if user supplied wildcard characters (*, ?, % or _)."""
+    """
-    if not s:
+    Convert user wildcards to a Postgres-friendly regex:
-        return False
+    * -> .*   ? -> .    escape regex meta first
    return any(ch in s for ch in ("*", "?", "%", "_"))
 def wildcard_to_regex(s: Optional[str]) -> str:
    r"""
    Convert FileMaker-style wildcards to a regex fragment suitable for Django's
    iregex lookup.
    Rules:
      - Escape regex meta first, then replace \* -> .*  and \? -> .
      - Wrap with '.*' so it matches anywhere (like icontains).
    """
    if s is None:
-        s = ""
+        return ""
-    pat = re.escape(s)
+    # Escape regex meta, then translate wildcards
-    pat = pat.replace(r"\*", ".*").replace(r"\?", ".")
+    s = re.escape(s)
-    pat = f".*{pat}.*"
+    s = s.replace(r"\*", ".*").replace(r"\?", ".")
-    pat = re.sub(r"(?:\.\*){2,}", ".*", pat)  # collapse repeats
+    return f"^{s}$"
    return pat
-# =============================================================================
+# ============================
-# CSV import utilities
+# CSV import – robust version
-# =============================================================================
+# ============================
 # Column names the importer looks for, in file order. Incoming headers are
 # compared against these case-insensitively.
 CANON_HEADERS = [
     "subject",
     "illustration",
     "application",
     "scripture",
     "source",
     "talk title",
     "talk number",
     "code",
     "date",
     "date edited",
 ]
 # Number of cells a well-formed row has.
 EXPECTED_COLS = len(CANON_HEADERS)

 # Typographic quotes mapped to their plain ASCII counterparts.
 QUOTE_MAP = {
     "\u201c": '"',  # left double quotation mark
     "\u201d": '"',  # right double quotation mark
     "\u2018": "'",  # left single quotation mark
     "\u2019": "'",  # right single quotation mark
 }

 # Control characters replaced by a plain space.
 CTRL_MAP = {
     "\x0b": " ",  # vertical tab
     "\x0c": " ",  # form feed
 }
 def _decode_bytes(b: bytes) -> str:
     """Decode uploaded CSV bytes to text and normalize awkward characters.

     The bytes are decoded with the ``utf-8-sig`` codec, so a leading BOM is
     dropped, and undecodable bytes are replaced rather than raising. Every
     key of ``QUOTE_MAP`` and ``CTRL_MAP`` is then replaced by its mapped
     value, and CRLF and lone CR line endings are turned into a single LF.

     Args:
         b: Raw file content.

     Returns:
         The normalized text.
     """
     t = b.decode("utf-8-sig", errors="replace")
     for table in (QUOTE_MAP, CTRL_MAP):
         for old, new in table.items():
             t = t.replace(old, new)
     # Handle CRLF before lone CR so one line break never becomes two.
     return t.replace("\r\n", "\n").replace("\r", "\n")
-def _sniff_dialect(txt: str):
+def _sniff_dialect(text: str) -> csv.Dialect:
    """Sniff CSV dialect or default to comma."""
    snippet = text[:4096]
    try:
-        return csv.Sniffer().sniff(txt[:4096], delimiters=[",", ";", "\t", "|"])
+        return csv.Sniffer().sniff(snippet, delimiters=[",", ";", "\t", "|"])
    except Exception:
-        class _D: delimiter = ","
+        class D(csv.Dialect):
-        return _D()
+            delimiter = ","
            quotechar = '"'
            doublequote = True
            skipinitialspace = False
            lineterminator = "\n"
            quoting = csv.QUOTE_MINIMAL
        return D()
-def _norm_header(h: str) -> str:
+def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
    """
-    Normalize a header name in a forgiving way:
+    Split a CSV line manually, respecting quotes. Works even if the line
-      - lower-case
+    contains inconsistent quoting (e.g., inner quotes not doubled).
-      - treat underscores as spaces
+    Ensures we return exactly `expected` fields by merging overflow cells
-      - collapse spaces
+    into the current text field (typically Illustration/Application/Scripture).
      - drop non-alphanumerics
    """
-    if not h:
+    out, field = [], []
-        return ""
+    in_quotes = False
-    h = h.strip().lower().replace("_", " ")
+    i, n = 0, len(line)
-    h = re.sub(r"\s+", " ", h)
+    while i < n:
-    h = re.sub(r"[^a-z0-9 ]+", "", h)
+        ch = line[i]
-    return h.replace(" ", "")
+        if ch == '"':
            # If we see a doubled quote, treat as a literal quote and skip one
            if in_quotes and i + 1 < n and line[i + 1] == '"':
                field.append('"')
                i += 2
                continue
            in_quotes = not in_quotes
            i += 1
            continue
        if ch == delimiter and not in_quotes:
            out.append("".join(field))
            field = []
            i += 1
            continue
        field.append(ch)
        i += 1
    out.append("".join(field))
    # If we ended with quotes unbalanced, we still got something. Now repair count.
    if len(out) < expected:
        out += [""] * (expected - len(out))
    elif len(out) > expected:
        # Merge overflow columns into the last texty field before we hit short fields.
        # Strategy: merge extras into the last non-empty field before Date columns.
        head = out[:expected - 1]
        tail = out[expected - 1:]
        head[-1] = head[-1] + delimiter + delimiter.join(tail)
        out = head
    return out
 def _build_header_map(headers: List[str]) -> Dict[str, Optional[str]]:
     """Map each canonical column name to the matching incoming header.

     Headers are compared lower-cased and with surrounding whitespace
     removed. Every entry of ``CANON_HEADERS`` is looked up under its own
     name first and then under a few known aliases.

     Args:
         headers: Header cells exactly as they appear in the uploaded file.

     Returns:
         A dict with one key per ``CANON_HEADERS`` entry (for example
         ``"subject"`` or ``"talk title"``). Each value is the original
         header text to use as the row key, or ``None`` when nothing in
         *headers* matched that column.
     """
     aliases = {
         "talk title": ["talk_title", "title"],
         "talk number": ["talk_no", "talk#", "talknum"],
         "date edited": ["edited", "date_edited", "edited date"],
     }
     # If two headers normalize to the same text, the later one wins.
     by_norm = {h.lower().strip(): h for h in headers}

     mapping: Dict[str, Optional[str]] = {}
     for canon in CANON_HEADERS:
         candidates = [canon, *aliases.get(canon, [])]
         found = next((c for c in candidates if c in by_norm), None)
         mapping[canon] = by_norm[found] if found is not None else None
     return mapping
-def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str:
+def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> str:
-    """Case/spacing-insensitive value lookup."""
+    src = header_map.get(canon_key)
-    for original, mapped in hdr_map.items():
+    return (row.get(src) if src else "") or ""
        if mapped == canon:
            return (row.get(original) or "").strip()
    return ""
-def _clip(s: str, n: int) -> str:
+def _parse_date(val: str) -> Optional[datetime.date]:
-    s = (s or "").strip()
+    val = (val or "").strip()
-    return s[:n] if n and len(s) > n else s
+    if not val:
 def _parse_date(s: str):
    s = (s or "").strip()
    if not s:
        return None
-    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"):
+    # Try common formats: m/d/Y, Y-m-d
    for fmt in ("%m/%d/%Y", "%-m/%-d/%Y", "%Y-%m-%d"):
        try:
-            return datetime.strptime(s, fmt).date()
+            return datetime.strptime(val, fmt).date()
-        except ValueError:
+        except Exception:
-            continue
+            pass
-    return None
+    # Try letting dateutil if available (optional), else skip
-
+    try:
-
+        from dateutil import parser  # type: ignore
-def _parse_int(s: str) -> Optional[int]:
+        return parser.parse(val).date()
-    """Return an int from a string (tolerates commas), else None."""
+    except Exception:
    s = (s or "").strip()
    if not s:
        return None
    m = re.match(r"^-?\d+", s.replace(",", ""))
    return int(m.group(0)) if m else None
-def import_csv_bytes(
+def _clip(field_name: str, value: str) -> str:
    csv_bytes: bytes,
    dry_run: bool = False,
    *,
    # tune these if you changed model field sizes
    max_source=255,
    max_code=128,
    max_talk_number=128,   # only affects clipping BEFORE int parse; int parse handles None
    max_talk_title=512,
    max_scripture=512,
 ):
    """
-    Import CSV seed in an idempotent/upsert fashion.
+    Clip to model field's max_length if needed, to avoid DB DataError.
    Expected headers (case/spacing-insensitive):
      Subject, Illustration, Application, Scripture, Source,
      Talk Title, Talk Number, Code, Date, Date Edited
    Upsert rule:
      1) Prefer Code if present (treat as external key).
      2) Else fall back to the triple (subject, illustration, application).
    """
-    text = _decode_bytes(csv_bytes)
+    try:
        f = Entry._meta.get_field(field_name)
        max_len = getattr(f, "max_length", None)
        if max_len and value and len(value) > max_len:
            return value[:max_len]
    except Exception:
        pass
    return value
 def _coerce_int(val: str) -> Optional[int]:
    val = (val or "").strip()
    if not val:
        return None
    # allow like "#35" or "35)"
    m = re.search(r"(-?\d+)", val)
    if not m:
        return None
    try:
        return int(m.group(1))
    except Exception:
        return None
@transaction.atomic
 def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
    """
    Robust CSV import. Idempotent-ish upsert by (subject, illustration).
    """
    text = _decode_bytes(b)
    dialect = _sniff_dialect(text)
    f = io.StringIO(text)
-    rdr = csv.DictReader(f, dialect=dialect)
+    reader = csv.reader(f, dialect=dialect)
-    seen_headers = [h.strip() for h in (rdr.fieldnames or [])]
+    # Read header row
-    header_map = _build_header_map(seen_headers)
+    try:
        raw_headers = next(reader)
    except StopIteration:
        return {"rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [], "scripture_parsed": 0, "scripture_failed": 0, "dialect_delimiter": dialect.delimiter, "used_headerless_mode": False, "seen_headers": []}
-    inserted = updated = skipped = 0
+    # If header count is wrong, repair via lenient split
    if len(raw_headers) != EXPECTED_COLS:
        fixed = _split_lenient(",".join(raw_headers), delimiter=dialect.delimiter, expected=EXPECTED_COLS)
        headers = fixed
    else:
        headers = raw_headers
    header_map = _build_header_map(headers)
    total = 0
    inserted = 0
    updated = 0
    skipped = 0
    errors: List[str] = []
-    scripture_parsed = 0
+    scripture_ok = 0
    scripture_bad = 0
    # Re-open to iterate rows with the *raw* lines paired to parsed ones
    f2 = io.StringIO(text)
    lines = f2.read().splitlines()
    # first line is header
    raw_data_lines = lines[1:]
    # Iterate again with DictReader for convenience
    f3 = io.StringIO(text)
    dict_reader = csv.DictReader(f3, fieldnames=headers, dialect=dialect)
    next(dict_reader, None)  # skip header
    for idx, (raw_line, row) in enumerate(zip(raw_data_lines, dict_reader), start=2):
        total += 1
        # Some rows are mis-split by csv due to bad quotes -> repair
        if len(row) != EXPECTED_COLS or None in row:
            cells = _split_lenient(raw_line, delimiter=dialect.delimiter, expected=EXPECTED_COLS)
            row = dict(zip(headers, cells))
        # Extract using canonical keys
        subject      = _getv(row, header_map, "subject").strip()
        illustration = _getv(row, header_map, "illustration").strip()
        application  = _getv(row, header_map, "application").strip()
        scripture    = _getv(row, header_map, "scripture").strip()
        source       = _getv(row, header_map, "source").strip()
        talk_title   = _getv(row, header_map, "talk title").strip()
        talk_number  = _coerce_int(_getv(row, header_map, "talk number"))
        entry_code   = _getv(row, header_map, "code").strip()
        date_added   = _parse_date(_getv(row, header_map, "date"))
        date_edited  = _parse_date(_getv(row, header_map, "date edited"))
        # Basic sanity: if all major text fields empty, skip
        if not (subject or illustration or application):
            skipped += 1
            continue
        # Clip to DB lengths to avoid DataError (robustness)
        subject      = _clip("subject", subject)
        illustration = _clip("illustration", illustration)
        application  = _clip("application", application)
        scripture    = _clip("scripture_raw", scripture)
        source       = _clip("source", source)
        talk_title   = _clip("talk_title", talk_title)
        entry_code   = _clip("entry_code", entry_code)
        if scripture:
            scripture_ok += 1
        else:
            scripture_bad += 1
        # Upsert key: prefer entry_code; else (subject + illustration)
        lookup: Dict[str, object] = {}
        if entry_code:
            lookup["entry_code"] = entry_code
        else:
            lookup["subject"] = subject
            lookup["illustration"] = illustration
    for idx, row in enumerate(rdr, start=2):  # data starts at line 2
        try:
-            with transaction.atomic():
+            obj = Entry.objects.filter(**lookup).first()
-                subject       = _getv(row, header_map, "subject")
+            if not obj:
-                illustration  = _getv(row, header_map, "illustration")
+                obj = Entry(**lookup)
-                application   = _getv(row, header_map, "application")
+                created = True
            else:
                created = False
-                scripture_raw = _clip(_getv(row, header_map, "scripture"),   max_scripture)
+            obj.subject = subject
-                source        = _clip(_getv(row, header_map, "source"),      max_source)
+            obj.illustration = illustration
-                talk_title    = _clip(_getv(row, header_map, "talk_title"),  max_talk_title)
+            obj.application = application
            obj.scripture_raw = scripture
            obj.source = source
            obj.talk_title = talk_title
            obj.talk_number = talk_number
            obj.entry_code = entry_code or obj.entry_code
            if date_added:
                obj.date_added = date_added
            if date_edited:
                obj.date_edited = date_edited
-                # Safe talk number parse (non-numeric -> None)
+            if not dry_run:
-                talk_number_raw = _clip(_getv(row, header_map, "talk_number"), max_talk_number)
+                obj.save()
                talk_number     = _parse_int(talk_number_raw)
-                entry_code    = _clip(_getv(row, header_map, "code"), max_code)
+            if created:
-
+                inserted += 1
-                date_added  = _parse_date(_getv(row, header_map, "date"))
+            else:
-                date_edited = _parse_date(_getv(row, header_map, "date_edited"))
+                updated += 1
                # Find existing
                obj: Optional[Entry] = None
                if entry_code:
                    obj = Entry.objects.filter(entry_code=entry_code).first()
                if obj is None:
                    obj = Entry.objects.filter(
                        subject=subject,
                        illustration=illustration,
                        application=application,
                    ).first()
                created = obj is None
                if created:
                    obj = Entry()
                # Assign
                obj.subject       = subject
                obj.illustration  = illustration
                obj.application   = application
                obj.scripture_raw = scripture_raw
                obj.source        = source
                obj.talk_title    = talk_title
                obj.talk_number   = talk_number   # None is fine for IntegerField
                obj.entry_code    = entry_code
                if date_added:
                    obj.date_added = date_added
                if date_edited:
                    obj.date_edited = date_edited
                if not dry_run:
                    obj.save()
                if created:
                    inserted += 1
                else:
                    updated += 1
                if scripture_raw:
                    scripture_parsed += 1
        except Exception as e:
            # Keep importing other rows; capture the first part of the error
            msg = str(e).splitlines()[0]
            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
            skipped += 1
            errors.append(f"line {idx}: {type(e).__name__}: {e}")
    return {
-        "rows": inserted + updated + skipped,
+        "rows": total,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
-        "errors": errors[:200],  # cap output
+        "errors": errors,
-        "scripture_parsed": scripture_parsed,
+        "scripture_parsed": scripture_ok,
-        "scripture_failed": 0,
+        "scripture_failed": scripture_bad,
-        "dialect_delimiter": dialect.delimiter,
+        "dialect_delimiter": getattr(_sniff_dialect(text), "delimiter", ","),
        "used_headerless_mode": False,
-        "seen_headers": [h.lower() for h in seen_headers],
+        "seen_headers": headers,
    }