Update web/core/utils.py

still trying to fix broken importer
This commit is contained in:
2025-08-13 15:33:01 +00:00
parent bf084fc13c
commit ed4b4a8f62
+295 -197
View File
@@ -4,263 +4,361 @@ from __future__ import annotations
import csv
import io
import re
import unicodedata
from datetime import date, datetime
from typing import Any, Dict, List, Optional, Tuple

from django.db import transaction
from django.db.models import Model

from .models import Entry
# ============================================================================= # ============================
# Search helpers (used by views) # Search helpers (used by views)
# ============================================================================= # ============================
_WORD_RE = re.compile(r"[^\s]+")
def terms(q: str) -> List[str]:
    """Split a search query into terms, keeping quoted phrases together.

    Double quotes toggle "phrase mode": whitespace inside a quoted phrase is
    kept, whitespace outside it separates terms. The quote characters
    themselves are never part of a term. An unterminated quote runs to the
    end of the query.

    Each term is stripped, and terms that end up empty (for example the
    phrase ``" "``) are dropped so callers never receive blank terms.

    Args:
        q: Raw query text; ``None`` or ``""`` yields an empty list.

    Returns:
        The terms in the order they appear in ``q``.
    """
    if not q:
        return []

    tokens: List[str] = []
    current: List[str] = []
    in_quote = False

    def flush() -> None:
        # Emit the buffered characters as one term, ignoring blank results.
        token = "".join(current).strip()
        if token:
            tokens.append(token)
        current.clear()

    for ch in q:
        if ch == '"':
            in_quote = not in_quote
        elif ch.isspace() and not in_quote:
            flush()
        else:
            current.append(ch)
    flush()
    return tokens
def has_wildcards(s: Optional[str]) -> bool:
    """Return True when ``s`` contains a user wildcard (``*`` or ``?``).

    ``None`` and the empty string contain no wildcards.
    """
    if not s:
        return False
    return "*" in s or "?" in s


def wildcard_to_regex(s: Optional[str]) -> str:
    """Convert user wildcards into an anchored regular expression.

    Regex metacharacters in ``s`` are escaped first, then ``*`` becomes
    ``.*`` and ``?`` becomes ``.``. Runs of adjacent ``.*`` are collapsed
    into one; this does not change what the pattern matches, it only avoids
    redundant backtracking for inputs such as ``a**b``.

    The result is wrapped in ``^...$``, so it must match the whole value.
    NOTE(review): the previous revision wrapped the pattern in ``.*`` to
    match anywhere in the field -- confirm callers expect whole-value
    matching.

    Args:
        s: User-supplied search text, possibly containing ``*`` / ``?``.

    Returns:
        The regex source string, or ``""`` when ``s`` is ``None``.
    """
    if s is None:
        return ""
    # Escape first so only the user's own * and ? act as wildcards.
    pattern = re.escape(s)
    pattern = pattern.replace(r"\*", ".*").replace(r"\?", ".")
    pattern = re.sub(r"(?:\.\*){2,}", ".*", pattern)
    return f"^{pattern}$"
# ============================================================================= # ============================
# CSV import utilities # CSV import robust version
# ============================================================================= # ============================
# Canonical header names we expect (case-insensitive on input):
# The order here fixes the expected column order and count of the CSV.
CANON_HEADERS = [
"subject", "illustration", "application", "scripture",
"source", "talk title", "talk number", "code", "date", "date edited"
]
# Number of columns a well-formed row must have.
EXPECTED_COLS = len(CANON_HEADERS)
# Curly quotes & odd whitespace we normalize
# Typographic quotes -> their ASCII equivalents.
QUOTE_MAP = {
"\u201c": '"', "\u201d": '"', # “ ”
"\u2018": "'", "\u2019": "'", # ‘ ’
}
# Control characters that are replaced by a plain space.
CTRL_MAP = {
"\x0b": " ", # vertical tab
"\x0c": " ", # form feed
}
def _decode_bytes(b: bytes) -> str:
    """Decode uploaded CSV bytes and normalize characters and newlines.

    Steps, in order:
      1. Decode as ``utf-8-sig`` (drops a leading BOM); undecodable bytes
         are replaced instead of raising.
      2. Apply every replacement in ``QUOTE_MAP`` and then ``CTRL_MAP``.
      3. Convert ``\\r\\n`` and bare ``\\r`` line endings to ``\\n``.

    NOTE(review): ``QUOTE_MAP`` turns curly double quotes into ASCII ``"``
    *before* the text is parsed as CSV. A curly quote inside a quoted field
    then becomes an undoubled inner quote, which can make the CSV parser
    split that field. Consider normalizing per cell after parsing -- confirm
    against real import files.

    Args:
        b: Raw file content.

    Returns:
        The normalized text.
    """
    text = b.decode("utf-8-sig", errors="replace")
    for table in (QUOTE_MAP, CTRL_MAP):
        for old, new in table.items():
            text = text.replace(old, new)
    # Order matters: handle CRLF first so it does not become two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n")
def _sniff_dialect(txt: str): def _sniff_dialect(text: str) -> csv.Dialect:
"""Sniff CSV dialect or default to comma."""
snippet = text[:4096]
try: try:
return csv.Sniffer().sniff(txt[:4096], delimiters=[",", ";", "\t", "|"]) return csv.Sniffer().sniff(snippet, delimiters=[",", ";", "\t", "|"])
except Exception: except Exception:
class _D: delimiter = "," class D(csv.Dialect):
return _D() delimiter = ","
quotechar = '"'
doublequote = True
skipinitialspace = False
lineterminator = "\n"
quoting = csv.QUOTE_MINIMAL
return D()
def _norm_header(h: str) -> str: def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
""" """
Normalize a header name in a forgiving way: Split a CSV line manually, respecting quotes. Works even if the line
- lower-case contains inconsistent quoting (e.g., inner quotes not doubled).
- treat underscores as spaces Ensures we return exactly `expected` fields by merging overflow cells
- collapse spaces into the current text field (typically Illustration/Application/Scripture).
- drop non-alphanumerics
""" """
if not h: out, field = [], []
return "" in_quotes = False
h = h.strip().lower().replace("_", " ") i, n = 0, len(line)
h = re.sub(r"\s+", " ", h) while i < n:
h = re.sub(r"[^a-z0-9 ]+", "", h) ch = line[i]
return h.replace(" ", "") if ch == '"':
# If we see a doubled quote, treat as a literal quote and skip one
if in_quotes and i + 1 < n and line[i + 1] == '"':
field.append('"')
i += 2
continue
in_quotes = not in_quotes
i += 1
continue
if ch == delimiter and not in_quotes:
out.append("".join(field))
field = []
i += 1
continue
field.append(ch)
i += 1
out.append("".join(field))
# If we ended with quotes unbalanced, we still got something. Now repair count.
if len(out) < expected:
out += [""] * (expected - len(out))
elif len(out) > expected:
# Merge overflow columns into the last texty field before we hit short fields.
# Strategy: merge extras into the last non-empty field before Date columns.
head = out[:expected - 1]
tail = out[expected - 1:]
head[-1] = head[-1] + delimiter + delimiter.join(tail)
out = head
return out
def _build_header_map(headers: List[str]) -> Dict[str, str]: def _build_header_map(headers: List[str]) -> Dict[str, str]:
""" """
Map original header -> canonical key the importer expects. Map incoming headers (any case) to our canonical keys.
Canonical keys we use internally:
subject, illustration, application, scripture, source,
talk_title, talk_number, code, date, date_edited
""" """
canon_targets = { key = {h.lower().strip(): h for h in headers}
"subject": "subject", mapping = {}
"illustration": "illustration", for canon in CANON_HEADERS:
"application": "application", # exact match first (case-insensitive)
"scripture": "scripture", if canon in key:
"source": "source", mapping[canon] = key[canon]
"talktitle": "talk_title", else:
"title": "talk_title", # fallback: try common variants
"talknumber": "talk_number", aliases = {
"number": "talk_number", "talk title": ["talk_title", "title"],
"code": "code", "talk number": ["talk_no", "talk#", "talk number", "talknum"],
"date": "date", "date edited": ["edited", "date_edited", "edited date"],
"dateedited": "date_edited", }.get(canon, [])
"edited": "date_edited", found = next((a for a in aliases if a in key), None)
} mapping[canon] = key.get(found, None)
out: Dict[str, str] = {} return mapping
for h in headers:
norm = _norm_header(h)
out[h] = canon_targets.get(norm, norm) # unknowns map to their normalized name
return out
def _getv(row: Dict[str, Any], hdr_map: Dict[str, str], canon: str) -> str: def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> str:
"""Case/spacing-insensitive value lookup.""" src = header_map.get(canon_key)
for original, mapped in hdr_map.items(): return (row.get(src) if src else "") or ""
if mapped == canon:
return (row.get(original) or "").strip()
return ""
def _clip(s: str, n: int) -> str: def _parse_date(val: str) -> Optional[datetime.date]:
s = (s or "").strip() val = (val or "").strip()
return s[:n] if n and len(s) > n else s if not val:
def _parse_date(s: str):
s = (s or "").strip()
if not s:
return None return None
for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m/%d/%y", "%Y.%m.%d", "%m-%d-%Y"): # Try common formats: m/d/Y, Y-m-d
for fmt in ("%m/%d/%Y", "%-m/%-d/%Y", "%Y-%m-%d"):
try: try:
return datetime.strptime(s, fmt).date() return datetime.strptime(val, fmt).date()
except ValueError: except Exception:
continue pass
return None # Try letting dateutil if available (optional), else skip
try:
from dateutil import parser # type: ignore
def _parse_int(s: str) -> Optional[int]: return parser.parse(val).date()
"""Return an int from a string (tolerates commas), else None.""" except Exception:
s = (s or "").strip()
if not s:
return None return None
m = re.match(r"^-?\d+", s.replace(",", ""))
return int(m.group(0)) if m else None
def _clip(field_name: str, value: str) -> str:
    """Truncate ``value`` to the ``max_length`` of an ``Entry`` field.

    Used before saving so over-long cells are shortened instead of making
    the database reject the row.

    Args:
        field_name: Name of a field on the ``Entry`` model.
        value: Text to store.

    Returns:
        ``value`` cut to the field's ``max_length``. ``value`` is returned
        unchanged when it is empty, when the field has no ``max_length``
        (or it is falsy), or when ``Entry`` has no field with that name.
    """
    # Local import: keeps this helper self-contained within the module.
    from django.core.exceptions import FieldDoesNotExist

    if not value:
        return value
    try:
        field = Entry._meta.get_field(field_name)
    except FieldDoesNotExist:
        # Unknown field name: nothing to clip against.
        return value
    max_len = getattr(field, "max_length", None)
    if max_len and len(value) > max_len:
        return value[:max_len]
    return value
def _coerce_int(val: str) -> Optional[int]:
val = (val or "").strip()
if not val:
return None
# allow like "#35" or "35)"
m = re.search(r"(-?\d+)", val)
if not m:
return None
try:
return int(m.group(1))
except Exception:
return None
@transaction.atomic
def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
    """Import entries from CSV bytes, creating or updating ``Entry`` rows.

    The first record is the header row. Each later record is matched to an
    existing ``Entry`` by ``entry_code`` when the row has a code, otherwise
    by the pair (``subject``, ``illustration``); a match is updated, no
    match creates a new entry.

    Records whose cell count differs from ``EXPECTED_COLS`` are re-split
    from their raw text with ``_split_lenient``. Rows with no subject,
    illustration or application are skipped. Text values are clipped to the
    model's field lengths.

    Every row is written inside its own savepoint, so a database error on
    one row is recorded and does not poison the surrounding transaction for
    the rows that follow.

    Args:
        b: Raw CSV file content.
        dry_run: When true, nothing is saved; counts are still produced.

    Returns:
        A summary dict with the keys ``rows``, ``inserted``, ``updated``,
        ``skipped``, ``errors`` (list of ``"line N: Type: message"``),
        ``scripture_parsed`` (rows with a scripture value),
        ``scripture_failed`` (rows without one), ``dialect_delimiter``,
        ``used_headerless_mode`` (always ``False``) and ``seen_headers``.
    """
    text = _decode_bytes(b)
    dialect = _sniff_dialect(text)
    delimiter = dialect.delimiter

    # The reader below iterates a StringIO, which splits on "\n" only, so
    # split the same way to keep reader.line_num aligned with this list.
    physical_lines = text.split("\n")
    reader = csv.reader(io.StringIO(text), dialect=dialect)

    try:
        raw_headers = next(reader)
    except StopIteration:
        return {
            "rows": 0,
            "inserted": 0,
            "updated": 0,
            "skipped": 0,
            "errors": [],
            "scripture_parsed": 0,
            "scripture_failed": 0,
            "dialect_delimiter": delimiter,
            "used_headerless_mode": False,
            "seen_headers": [],
        }

    if len(raw_headers) != EXPECTED_COLS:
        # Re-join with the file's own delimiter before the lenient re-split.
        headers = _split_lenient(
            delimiter.join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
        )
    else:
        headers = raw_headers
    header_map = _build_header_map(headers)

    total = inserted = updated = skipped = 0
    scripture_ok = scripture_bad = 0
    errors: List[str] = []

    prev_line = reader.line_num  # last physical line consumed so far
    while True:
        first_line = prev_line + 1  # 1-based line where this record starts
        try:
            cells = next(reader)
        except StopIteration:
            break
        except csv.Error as exc:
            # Unparseable record: report it and carry on with the next one.
            prev_line = reader.line_num
            total += 1
            skipped += 1
            errors.append(f"line {first_line}: {type(exc).__name__}: {exc}")
            continue

        # A record may span several physical lines (newline inside quotes).
        raw_record = "\n".join(physical_lines[prev_line:reader.line_num])
        prev_line = reader.line_num

        if not cells:
            continue  # blank line
        total += 1

        if len(cells) != EXPECTED_COLS:
            # Mis-split (usually bad quoting): repair from the raw text.
            cells = _split_lenient(raw_record, delimiter=delimiter, expected=EXPECTED_COLS)
        row = dict(zip(headers, cells))

        subject = _getv(row, header_map, "subject").strip()
        illustration = _getv(row, header_map, "illustration").strip()
        application = _getv(row, header_map, "application").strip()
        scripture = _getv(row, header_map, "scripture").strip()
        source = _getv(row, header_map, "source").strip()
        talk_title = _getv(row, header_map, "talk title").strip()
        talk_number = _coerce_int(_getv(row, header_map, "talk number"))
        entry_code = _getv(row, header_map, "code").strip()
        date_added = _parse_date(_getv(row, header_map, "date"))
        date_edited = _parse_date(_getv(row, header_map, "date edited"))

        if not (subject or illustration or application):
            skipped += 1
            continue

        subject = _clip("subject", subject)
        illustration = _clip("illustration", illustration)
        application = _clip("application", application)
        scripture = _clip("scripture_raw", scripture)
        source = _clip("source", source)
        talk_title = _clip("talk_title", talk_title)
        entry_code = _clip("entry_code", entry_code)

        if scripture:
            scripture_ok += 1
        else:
            scripture_bad += 1

        lookup: Dict[str, object]
        if entry_code:
            lookup = {"entry_code": entry_code}
        else:
            lookup = {"subject": subject, "illustration": illustration}

        try:
            # Nested atomic() is a savepoint: a failed row rolls back alone.
            with transaction.atomic():
                obj = Entry.objects.filter(**lookup).first()
                created = obj is None
                if created:
                    obj = Entry(**lookup)

                obj.subject = subject
                obj.illustration = illustration
                obj.application = application
                obj.scripture_raw = scripture
                obj.source = source
                obj.talk_title = talk_title
                obj.talk_number = talk_number
                obj.entry_code = entry_code or obj.entry_code
                if date_added:
                    obj.date_added = date_added
                if date_edited:
                    obj.date_edited = date_edited

                if not dry_run:
                    obj.save()
        except Exception as exc:  # per-row best effort: record and continue
            skipped += 1
            errors.append(f"line {first_line}: {type(exc).__name__}: {exc}")
        else:
            if created:
                inserted += 1
            else:
                updated += 1

    return {
        "rows": total,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
        "errors": errors,
        "scripture_parsed": scripture_ok,
        "scripture_failed": scripture_bad,
        "dialect_delimiter": delimiter,
        "used_headerless_mode": False,
        "seen_headers": headers,
    }