Update web/core/utils.py
breaking down imports into chunks
This commit is contained in:
+80
-90
@@ -4,12 +4,10 @@ from __future__ import annotations
|
|||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from django.db import transaction
|
from django.db import transaction, IntegrityError, DataError, DatabaseError
|
||||||
from django.db.models import Model
|
|
||||||
|
|
||||||
from .models import Entry
|
from .models import Entry
|
||||||
|
|
||||||
@@ -18,8 +16,6 @@ from .models import Entry
|
|||||||
# Search helpers (used by views)
|
# Search helpers (used by views)
|
||||||
# ============================
|
# ============================
|
||||||
|
|
||||||
_WORD_RE = re.compile(r"[^\s]+")
|
|
||||||
|
|
||||||
def terms(q: str) -> List[str]:
|
def terms(q: str) -> List[str]:
|
||||||
"""Split search query into terms; keep quoted phrases together."""
|
"""Split search query into terms; keep quoted phrases together."""
|
||||||
if not q:
|
if not q:
|
||||||
@@ -39,17 +35,18 @@ def terms(q: str) -> List[str]:
|
|||||||
out.append("".join(buf))
|
out.append("".join(buf))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def has_wildcards(s: str) -> bool:
|
def has_wildcards(s: str) -> bool:
|
||||||
return bool(s) and ("*" in s or "?" in s)
|
return bool(s) and ("*" in s or "?" in s)
|
||||||
|
|
||||||
|
|
||||||
def wildcard_to_regex(s: str) -> str:
|
def wildcard_to_regex(s: str) -> str:
|
||||||
"""
|
"""
|
||||||
Convert user wildcards to a Postgres-friendly regex:
|
Convert user wildcards to a Postgres-friendly regex:
|
||||||
* -> .* ? -> . escape regex meta first
|
* -> .* ? -> . (escape regex meta first)
|
||||||
"""
|
"""
|
||||||
if s is None:
|
if s is None:
|
||||||
return ""
|
return ""
|
||||||
# Escape regex meta, then translate wildcards
|
|
||||||
s = re.escape(s)
|
s = re.escape(s)
|
||||||
s = s.replace(r"\*", ".*").replace(r"\?", ".")
|
s = s.replace(r"\*", ".*").replace(r"\?", ".")
|
||||||
return f"^{s}$"
|
return f"^{s}$"
|
||||||
@@ -62,7 +59,7 @@ def wildcard_to_regex(s: str) -> str:
|
|||||||
# Canonical header names we expect (case-insensitive on input):
|
# Canonical header names we expect (case-insensitive on input):
|
||||||
CANON_HEADERS = [
|
CANON_HEADERS = [
|
||||||
"subject", "illustration", "application", "scripture",
|
"subject", "illustration", "application", "scripture",
|
||||||
"source", "talk title", "talk number", "code", "date", "date edited"
|
"source", "talk title", "talk number", "code", "date", "date edited",
|
||||||
]
|
]
|
||||||
EXPECTED_COLS = len(CANON_HEADERS)
|
EXPECTED_COLS = len(CANON_HEADERS)
|
||||||
|
|
||||||
@@ -111,7 +108,7 @@ def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
|
|||||||
Split a CSV line manually, respecting quotes. Works even if the line
|
Split a CSV line manually, respecting quotes. Works even if the line
|
||||||
contains inconsistent quoting (e.g., inner quotes not doubled).
|
contains inconsistent quoting (e.g., inner quotes not doubled).
|
||||||
Ensures we return exactly `expected` fields by merging overflow cells
|
Ensures we return exactly `expected` fields by merging overflow cells
|
||||||
into the current text field (typically Illustration/Application/Scripture).
|
into the current text field (before the trailing short/meta columns).
|
||||||
"""
|
"""
|
||||||
out, field = [], []
|
out, field = [], []
|
||||||
in_quotes = False
|
in_quotes = False
|
||||||
@@ -119,7 +116,7 @@ def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
|
|||||||
while i < n:
|
while i < n:
|
||||||
ch = line[i]
|
ch = line[i]
|
||||||
if ch == '"':
|
if ch == '"':
|
||||||
# If we see a doubled quote, treat as a literal quote and skip one
|
# doubled quote inside a quoted field -> literal quote
|
||||||
if in_quotes and i + 1 < n and line[i + 1] == '"':
|
if in_quotes and i + 1 < n and line[i + 1] == '"':
|
||||||
field.append('"')
|
field.append('"')
|
||||||
i += 2
|
i += 2
|
||||||
@@ -136,12 +133,10 @@ def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
|
|||||||
i += 1
|
i += 1
|
||||||
out.append("".join(field))
|
out.append("".join(field))
|
||||||
|
|
||||||
# If we ended with quotes unbalanced, we still got something. Now repair count.
|
# Repair count to exactly `expected`
|
||||||
if len(out) < expected:
|
if len(out) < expected:
|
||||||
out += [""] * (expected - len(out))
|
out += [""] * (expected - len(out))
|
||||||
elif len(out) > expected:
|
elif len(out) > expected:
|
||||||
# Merge overflow columns into the last texty field before we hit short fields.
|
|
||||||
# Strategy: merge extras into the last non-empty field before Date columns.
|
|
||||||
head = out[:expected - 1]
|
head = out[:expected - 1]
|
||||||
tail = out[expected - 1:]
|
tail = out[expected - 1:]
|
||||||
head[-1] = head[-1] + delimiter + delimiter.join(tail)
|
head[-1] = head[-1] + delimiter + delimiter.join(tail)
|
||||||
@@ -155,21 +150,19 @@ def _build_header_map(headers: List[str]) -> Dict[str, str]:
|
|||||||
Map incoming headers (any case) to our canonical keys.
|
Map incoming headers (any case) to our canonical keys.
|
||||||
"""
|
"""
|
||||||
key = {h.lower().strip(): h for h in headers}
|
key = {h.lower().strip(): h for h in headers}
|
||||||
mapping = {}
|
mapping: Dict[str, Optional[str]] = {}
|
||||||
for canon in CANON_HEADERS:
|
for canon in CANON_HEADERS:
|
||||||
# exact match first (case-insensitive)
|
|
||||||
if canon in key:
|
if canon in key:
|
||||||
mapping[canon] = key[canon]
|
mapping[canon] = key[canon]
|
||||||
else:
|
else:
|
||||||
# fallback: try common variants
|
|
||||||
aliases = {
|
aliases = {
|
||||||
"talk title": ["talk_title", "title"],
|
"talk title": ["talk_title", "title"],
|
||||||
"talk number": ["talk_no", "talk#", "talk number", "talknum"],
|
"talk number": ["talk_no", "talk#", "talknum"],
|
||||||
"date edited": ["edited", "date_edited", "edited date"],
|
"date edited": ["edited", "date_edited", "edited date"],
|
||||||
}.get(canon, [])
|
}.get(canon, [])
|
||||||
found = next((a for a in aliases if a in key), None)
|
found = next((a for a in aliases if a in key), None)
|
||||||
mapping[canon] = key.get(found, None)
|
mapping[canon] = key.get(found) if found else None
|
||||||
return mapping
|
return mapping # type: ignore[return-value]
|
||||||
|
|
||||||
|
|
||||||
def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> str:
|
def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> str:
|
||||||
@@ -177,17 +170,17 @@ def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> st
|
|||||||
return (row.get(src) if src else "") or ""
|
return (row.get(src) if src else "") or ""
|
||||||
|
|
||||||
|
|
||||||
def _parse_date(val: str) -> Optional[datetime.date]:
|
def _parse_date(val: str):
|
||||||
val = (val or "").strip()
|
val = (val or "").strip()
|
||||||
if not val:
|
if not val:
|
||||||
return None
|
return None
|
||||||
# Try common formats: m/d/Y, Y-m-d
|
# Common formats: m/d/Y, Y-m-d (also tolerate single-digit m/d on Linux)
|
||||||
for fmt in ("%m/%d/%Y", "%-m/%-d/%Y", "%Y-%m-%d"):
|
for fmt in ("%m/%d/%Y", "%-m/%-d/%Y", "%Y-%m-%d"):
|
||||||
try:
|
try:
|
||||||
return datetime.strptime(val, fmt).date()
|
return datetime.strptime(val, fmt).date()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Try letting dateutil if available (optional), else skip
|
# Fallback to dateutil if present
|
||||||
try:
|
try:
|
||||||
from dateutil import parser # type: ignore
|
from dateutil import parser # type: ignore
|
||||||
return parser.parse(val).date()
|
return parser.parse(val).date()
|
||||||
@@ -209,12 +202,11 @@ def _clip(field_name: str, value: str) -> str:
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def _coerce_int(val: str) -> Optional[int]:
|
def _coerce_int(val: str):
|
||||||
val = (val or "").strip()
|
val = (val or "").strip()
|
||||||
if not val:
|
if not val:
|
||||||
return None
|
return None
|
||||||
# allow like "#35" or "35)"
|
m = re.search(r"(-?\d+)", val.replace(",", ""))
|
||||||
m = re.search(r"(-?\d+)", val)
|
|
||||||
if not m:
|
if not m:
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
@@ -223,60 +215,55 @@ def _coerce_int(val: str) -> Optional[int]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@transaction.atomic
|
def import_csv_bytes(b: bytes, dry_run: bool = False, commit_every: int = 500) -> Dict[str, object]:
|
||||||
def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
|
|
||||||
"""
|
"""
|
||||||
Robust CSV import. Idempotent-ish upsert by (subject, illustration).
|
Robust CSV import. Commits each row in its own transaction so that one bad
|
||||||
|
row does not poison the entire import (avoids TransactionManagementError cascades).
|
||||||
|
|
||||||
|
Returns a report dict with counts and first-line error messages.
|
||||||
"""
|
"""
|
||||||
text = _decode_bytes(b)
|
text = _decode_bytes(b)
|
||||||
dialect = _sniff_dialect(text)
|
dialect = _sniff_dialect(text)
|
||||||
|
delimiter = getattr(dialect, "delimiter", ",")
|
||||||
|
|
||||||
|
# --- headers ---
|
||||||
f = io.StringIO(text)
|
f = io.StringIO(text)
|
||||||
reader = csv.reader(f, dialect=dialect)
|
reader = csv.reader(f, dialect=dialect)
|
||||||
|
|
||||||
# Read header row
|
|
||||||
try:
|
try:
|
||||||
raw_headers = next(reader)
|
raw_headers = next(reader)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
return {"rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [], "scripture_parsed": 0, "scripture_failed": 0, "dialect_delimiter": dialect.delimiter, "used_headerless_mode": False, "seen_headers": []}
|
return {
|
||||||
|
"rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
|
||||||
# If header count is wrong, repair via lenient split
|
"scripture_parsed": 0, "scripture_failed": 0,
|
||||||
if len(raw_headers) != EXPECTED_COLS:
|
"dialect_delimiter": delimiter, "used_headerless_mode": False,
|
||||||
fixed = _split_lenient(",".join(raw_headers), delimiter=dialect.delimiter, expected=EXPECTED_COLS)
|
"seen_headers": []
|
||||||
headers = fixed
|
}
|
||||||
else:
|
|
||||||
headers = raw_headers
|
|
||||||
|
|
||||||
|
headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient(
|
||||||
|
",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
|
||||||
|
)
|
||||||
header_map = _build_header_map(headers)
|
header_map = _build_header_map(headers)
|
||||||
|
|
||||||
total = 0
|
# Pair raw lines so we can repair rows mis-split by csv
|
||||||
inserted = 0
|
raw_lines = text.splitlines()[1:] # skip header
|
||||||
updated = 0
|
|
||||||
skipped = 0
|
|
||||||
errors: List[str] = []
|
|
||||||
scripture_ok = 0
|
|
||||||
scripture_bad = 0
|
|
||||||
|
|
||||||
# Re-open to iterate rows with the *raw* lines paired to parsed ones
|
dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect)
|
||||||
f2 = io.StringIO(text)
|
|
||||||
lines = f2.read().splitlines()
|
|
||||||
# first line is header
|
|
||||||
raw_data_lines = lines[1:]
|
|
||||||
|
|
||||||
# Iterate again with DictReader for convenience
|
|
||||||
f3 = io.StringIO(text)
|
|
||||||
dict_reader = csv.DictReader(f3, fieldnames=headers, dialect=dialect)
|
|
||||||
next(dict_reader, None) # skip header
|
next(dict_reader, None) # skip header
|
||||||
|
|
||||||
for idx, (raw_line, row) in enumerate(zip(raw_data_lines, dict_reader), start=2):
|
total = inserted = updated = skipped = 0
|
||||||
|
errors: List[str] = []
|
||||||
|
scripture_ok = scripture_bad = 0
|
||||||
|
|
||||||
|
# Import loop (row-by-row atomic)
|
||||||
|
for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2):
|
||||||
total += 1
|
total += 1
|
||||||
|
|
||||||
# Some rows are mis-split by csv due to bad quotes -> repair
|
# Repair if DictReader got the wrong shape (inconsistent quotes in source)
|
||||||
if len(row) != EXPECTED_COLS or None in row:
|
if len(row) != EXPECTED_COLS or None in row:
|
||||||
cells = _split_lenient(raw_line, delimiter=dialect.delimiter, expected=EXPECTED_COLS)
|
cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS)
|
||||||
row = dict(zip(headers, cells))
|
row = dict(zip(headers, cells))
|
||||||
|
|
||||||
# Extract using canonical keys
|
# Extract canonical fields
|
||||||
subject = _getv(row, header_map, "subject").strip()
|
subject = _getv(row, header_map, "subject").strip()
|
||||||
illustration = _getv(row, header_map, "illustration").strip()
|
illustration = _getv(row, header_map, "illustration").strip()
|
||||||
application = _getv(row, header_map, "application").strip()
|
application = _getv(row, header_map, "application").strip()
|
||||||
@@ -288,12 +275,12 @@ def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
|
|||||||
date_added = _parse_date(_getv(row, header_map, "date"))
|
date_added = _parse_date(_getv(row, header_map, "date"))
|
||||||
date_edited = _parse_date(_getv(row, header_map, "date edited"))
|
date_edited = _parse_date(_getv(row, header_map, "date edited"))
|
||||||
|
|
||||||
# Basic sanity: if all major text fields empty, skip
|
# Skip rows with no meaningful text
|
||||||
if not (subject or illustration or application):
|
if not (subject or illustration or application):
|
||||||
skipped += 1
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Clip to DB lengths to avoid DataError (robustness)
|
# Clip to DB lengths
|
||||||
subject = _clip("subject", subject)
|
subject = _clip("subject", subject)
|
||||||
illustration = _clip("illustration", illustration)
|
illustration = _clip("illustration", illustration)
|
||||||
application = _clip("application", application)
|
application = _clip("application", application)
|
||||||
@@ -302,10 +289,8 @@ def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
|
|||||||
talk_title = _clip("talk_title", talk_title)
|
talk_title = _clip("talk_title", talk_title)
|
||||||
entry_code = _clip("entry_code", entry_code)
|
entry_code = _clip("entry_code", entry_code)
|
||||||
|
|
||||||
if scripture:
|
scripture_ok += 1 if scripture else 0
|
||||||
scripture_ok += 1
|
scripture_bad += 0 if scripture else 1
|
||||||
else:
|
|
||||||
scripture_bad += 1
|
|
||||||
|
|
||||||
# Upsert key: prefer entry_code; else (subject + illustration)
|
# Upsert key: prefer entry_code; else (subject + illustration)
|
||||||
lookup: Dict[str, object] = {}
|
lookup: Dict[str, object] = {}
|
||||||
@@ -315,40 +300,45 @@ def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
|
|||||||
lookup["subject"] = subject
|
lookup["subject"] = subject
|
||||||
lookup["illustration"] = illustration
|
lookup["illustration"] = illustration
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
exists = Entry.objects.filter(**lookup).exists()
|
||||||
|
inserted += 0 if exists else 1
|
||||||
|
updated += 1 if exists else 0
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
obj = Entry.objects.filter(**lookup).first()
|
# Isolate each row so a failure rolls back only that row
|
||||||
if not obj:
|
with transaction.atomic():
|
||||||
obj = Entry(**lookup)
|
obj = Entry.objects.filter(**lookup).first()
|
||||||
created = True
|
|
||||||
else:
|
|
||||||
created = False
|
created = False
|
||||||
|
if not obj:
|
||||||
|
obj = Entry(**lookup)
|
||||||
|
created = True
|
||||||
|
|
||||||
obj.subject = subject
|
obj.subject = subject
|
||||||
obj.illustration = illustration
|
obj.illustration = illustration
|
||||||
obj.application = application
|
obj.application = application
|
||||||
obj.scripture_raw = scripture
|
obj.scripture_raw = scripture
|
||||||
obj.source = source
|
obj.source = source
|
||||||
obj.talk_title = talk_title
|
obj.talk_title = talk_title
|
||||||
obj.talk_number = talk_number
|
obj.talk_number = talk_number
|
||||||
obj.entry_code = entry_code or obj.entry_code
|
if entry_code:
|
||||||
if date_added:
|
obj.entry_code = entry_code
|
||||||
obj.date_added = date_added
|
if date_added:
|
||||||
if date_edited:
|
obj.date_added = date_added
|
||||||
obj.date_edited = date_edited
|
if date_edited:
|
||||||
|
obj.date_edited = date_edited
|
||||||
|
|
||||||
if not dry_run:
|
|
||||||
obj.save()
|
obj.save()
|
||||||
|
|
||||||
if created:
|
inserted += 1 if created else 0
|
||||||
inserted += 1
|
updated += 0 if created else 1
|
||||||
else:
|
|
||||||
updated += 1
|
|
||||||
|
|
||||||
except Exception as e:
|
except (IntegrityError, DataError, DatabaseError, ValueError) as e:
|
||||||
# Keep importing other rows; capture the first part of the error
|
|
||||||
msg = str(e).splitlines()[0]
|
msg = str(e).splitlines()[0]
|
||||||
errors.append(f"line {idx}: {type(e).__name__}: {msg}")
|
errors.append(f"line {idx}: {type(e).__name__}: {msg}")
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
# continue to next row
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"rows": total,
|
"rows": total,
|
||||||
@@ -358,7 +348,7 @@ def import_csv_bytes(b: bytes, dry_run: bool = False) -> Dict[str, object]:
|
|||||||
"errors": errors,
|
"errors": errors,
|
||||||
"scripture_parsed": scripture_ok,
|
"scripture_parsed": scripture_ok,
|
||||||
"scripture_failed": scripture_bad,
|
"scripture_failed": scripture_bad,
|
||||||
"dialect_delimiter": getattr(_sniff_dialect(text), "delimiter", ","),
|
"dialect_delimiter": delimiter,
|
||||||
"used_headerless_mode": False,
|
"used_headerless_mode": False,
|
||||||
"seen_headers": headers,
|
"seen_headers": headers,
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user