# core/utils.py
from __future__ import annotations

import csv
import io
import re
from datetime import date, datetime
from typing import Dict, List, Optional, Any, Tuple, Iterable

from django.db import transaction, IntegrityError, DataError, DatabaseError

from .models import Entry

EXPECTED_HEADERS: List[str] = [
    "Subject", "Illustration", "Application", "Scripture", "Source",
    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
]

# Map CSV header labels -> Entry model field names
HEADER_MAP: Dict[str, str] = {
    "Subject": "subject",
    "Illustration": "illustration",
    "Application": "application",
    "Scripture": "scripture_raw",
    "Source": "source",
    "Talk Title": "talk_title",
    "Talk Number": "talk_number",
    "Code": "entry_code",
    "Date": "date_added",
    "Date Edited": "date_edited",
}

# Accept both the pretty labels *and* the actual model field names
# (lets you import older dumps or hand-made files)
ACCEPTABLE_HEADERS: Dict[str, str] = {
    **{h.lower(): HEADER_MAP[h] for h in EXPECTED_HEADERS},
    # direct model names also OK
    "subject": "subject",
    "illustration": "illustration",
    "application": "application",
    "scripture_raw": "scripture_raw",
    "source": "source",
    "talk_title": "talk_title",
    "talk_number": "talk_number",
    "entry_code": "entry_code",
    "date_added": "date_added",
    "date_edited": "date_edited",
}


# ============================
# Search helpers (used by views)
# ============================

def terms(q: str) -> List[str]:
    """Split search query into terms; keep quoted phrases together."""
    if not q:
        return []
    out, buf, in_quote = [], [], False
    for ch in q:
        if ch == '"':
            in_quote = not in_quote
            continue
        if ch.isspace() and not in_quote:
            if buf:
                out.append("".join(buf))
                buf = []
        else:
            buf.append(ch)
    if buf:
        out.append("".join(buf))
    return out
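

# Example (illustrative) of terms(): quoted phrases survive as single terms.
#
#   terms('fear "peer pressure" youth')  ->  ['fear', 'peer pressure', 'youth']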


def has_wildcards(s: str) -> bool:
    return bool(s) and ("*" in s or "?" in s)


def wildcard_to_regex(s: str) -> str:
    """
    Convert user wildcards to a Postgres-friendly regex:
      * -> .*   ? -> .   (escape regex meta first)
    """
    if s is None:
        return ""
    s = re.escape(s)
    s = s.replace(r"\*", ".*").replace(r"\?", ".")
    return f"^{s}$"
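

# Sketch of how a view might combine these helpers with Django's __iregex lookup.
# The queryset and field choice below are assumptions about the calling code, not
# something this module does itself:
#
#   if has_wildcards(term):
#       qs = qs.filter(subject__iregex=wildcard_to_regex(term))  # "shep*" -> ^shep.*$
#   else:
#       qs = qs.filter(subject__icontains=term)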


# ============================
# CSV import – robust version
# ============================

# Canonical header names we expect (case-insensitive on input):
CANON_HEADERS = [
    "subject", "illustration", "application", "scripture",
    "source", "talk title", "talk number", "code", "date", "date edited",
]
EXPECTED_COLS = len(CANON_HEADERS)

# Curly quotes & odd whitespace we normalize
QUOTE_MAP = {
    "\u201c": '"', "\u201d": '"',  # “ ”
    "\u2018": "'", "\u2019": "'",  # ‘ ’
}
CTRL_MAP = {
    "\x0b": " ",  # vertical tab
    "\x0c": " ",  # form feed
}


def _decode_bytes(b: bytes) -> str:
    """Decode bytes with utf-8-sig, normalize line endings and characters."""
    t = b.decode("utf-8-sig", errors="replace")
    # normalize curly quotes and control chars
    for k, v in QUOTE_MAP.items():
        t = t.replace(k, v)
    for k, v in CTRL_MAP.items():
        t = t.replace(k, v)
    # normalize newlines
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    return t
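

# Example (illustrative) of _decode_bytes: curly quotes and CRLF/CR line endings
# are normalized before the CSV is parsed:
#
#   _decode_bytes('\u201cHi\u201d\r\nBye'.encode("utf-8"))  ->  '"Hi"\nBye'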


def _sniff_dialect(text: str) -> csv.Dialect:
    """Sniff CSV dialect or default to comma."""
    snippet = text[:4096]
    try:
        return csv.Sniffer().sniff(snippet, delimiters=[",", ";", "\t", "|"])
    except Exception:
        class D(csv.Dialect):
            delimiter = ","
            quotechar = '"'
            doublequote = True
            skipinitialspace = False
            lineterminator = "\n"
            quoting = csv.QUOTE_MINIMAL
        return D()


def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
    """
    Split a CSV line manually, respecting quotes. Works even if the line
    contains inconsistent quoting (e.g., inner quotes not doubled).
    Ensures we return exactly `expected` fields by padding short rows and
    folding any overflow cells back into the last field.
    """
    out, field = [], []
    in_quotes = False
    i, n = 0, len(line)
    while i < n:
        ch = line[i]
        if ch == '"':
            # doubled quote inside a quoted field -> literal quote
            if in_quotes and i + 1 < n and line[i + 1] == '"':
                field.append('"')
                i += 2
                continue
            in_quotes = not in_quotes
            i += 1
            continue
        if ch == delimiter and not in_quotes:
            out.append("".join(field))
            field = []
            i += 1
            continue
        field.append(ch)
        i += 1
    out.append("".join(field))

    # Repair count to exactly `expected`
    if len(out) < expected:
        out += [""] * (expected - len(out))
    elif len(out) > expected:
        # keep the first `expected` cells and fold the overflow back into the last one
        head = out[:expected]
        tail = out[expected:]
        head[-1] = head[-1] + delimiter + delimiter.join(tail)
        out = head

    return out
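

# Illustrative behaviour of _split_lenient (a sketch; nothing in this module
# calls it with these exact values):
#
#   _split_lenient('Faith,"He said ""wait""",extra,Talk 12', ",", 3)
#   ->  ['Faith', 'He said "wait"', 'extra,Talk 12']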


def _build_header_map(headers: List[str]) -> Dict[str, Optional[str]]:
    """
    Map incoming headers (any case) to our canonical keys.
    Canonical keys with no matching column map to None.
    """
    key = {h.lower().strip(): h for h in headers}
    mapping: Dict[str, Optional[str]] = {}
    for canon in CANON_HEADERS:
        if canon in key:
            mapping[canon] = key[canon]
        else:
            aliases = {
                "talk title": ["talk_title", "title"],
                "talk number": ["talk_no", "talk#", "talknum"],
                "date edited": ["edited", "date_edited", "edited date"],
            }.get(canon, [])
            found = next((a for a in aliases if a in key), None)
            mapping[canon] = key.get(found) if found else None
    return mapping


def _getv(row: Dict[str, str], header_map: Dict[str, Optional[str]], canon_key: str) -> str:
    src = header_map.get(canon_key)
    return (row.get(src) if src else "") or ""
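

# Example (illustrative) of _build_header_map with a mixed-case header and an
# alias; canonical keys with no match come back as None:
#
#   _build_header_map(["Subject", "Talk_Title", "Date"])
#   ->  {"subject": "Subject", "talk title": "Talk_Title", "date": "Date",
#        "illustration": None, "application": None, ...}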


# NOTE: a broader _parse_date is defined further down in this module; because the
# names collide, that later definition is the one import_csv_bytes actually uses.
def _parse_date(val: str):
    val = (val or "").strip()
    if not val:
        return None
    # Common formats: m/d/Y and Y-m-d (strptime already tolerates
    # single-digit month/day with %m and %d)
    for fmt in ("%m/%d/%Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(val, fmt).date()
        except Exception:
            pass
    # Fallback to dateutil if present
    try:
        from dateutil import parser  # type: ignore
        return parser.parse(val).date()
    except Exception:
        return None


def _clip(field_name: str, value: str) -> str:
    """
    Clip to model field's max_length if needed, to avoid DB DataError.
    """
    try:
        f = Entry._meta.get_field(field_name)
        max_len = getattr(f, "max_length", None)
        if max_len and value and len(value) > max_len:
            return value[:max_len]
    except Exception:
        pass
    return value


def _coerce_int(val: str):
    val = (val or "").strip()
    if not val:
        return None
    m = re.search(r"(-?\d+)", val.replace(",", ""))
    if not m:
        return None
    try:
        return int(m.group(1))
    except Exception:
        return None


# NOTE: a second _to_int_or_none (accepting Any and tolerating values like
# "123.0") is defined further down in this module and shadows this one.
def _to_int_or_none(s: str) -> Optional[int]:
    s = (s or "").strip()
    if not s:
        return None
    try:
        return int(s)
    except Exception:
        return None


def _to_date_or_none(s: str) -> Optional[date]:
    s = (s or "").strip()
    if not s:
        return None
    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%m/%d/%y"):
        try:
            # `datetime` is the class imported above, so call strptime on it directly
            return datetime.strptime(s, fmt).date()
        except Exception:
            pass
    return None  # let caller decide if this is acceptable


def _clean_header_token(s: Any) -> str:
    """
    Make a header token safe/normalized:
    - None -> ""
    - trim spaces
    - strip surrounding single/double quotes
    - drop weird prefixes like r:"Talk Title" or r.'Talk Title'
    - lowercase for matching
    """
    s = "" if s is None else str(s)
    s = s.strip()
    # strip surrounding quotes
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        s = s[1:-1]
    # drop r: or r. prefix some CSV tools add
    if s[:2].lower() in ("r:", "r."):
        s = s[2:].lstrip()
        # after dropping the prefix, the remainder may still be quoted
        # (e.g. r."Talk Title"), so strip surrounding quotes once more
        if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
            s = s[1:-1]
    return s.strip().lower()
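

# Example (illustrative): _clean_header_token('r."Talk Title"')  ->  'talk title'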


_DATE_FORMATS = (
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%m/%d/%y",
    "%d-%b-%Y",  # 05-Sep-2024
    "%Y/%m/%d",
)


def _parse_date(val: str) -> Optional[date]:
    if not val:
        return None
    txt = str(val).strip()
    # Accept ISO-like with time: 2024-01-02T00:00:00
    if "T" in txt:
        try:
            return datetime.fromisoformat(txt).date()
        except Exception:
            pass
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(txt, fmt).date()
        except Exception:
            continue
    # as a last resort, try only year-month-day pieces
    try:
        parts = [int(p) for p in txt.replace("/", "-").split("-")]
        if len(parts) >= 3:
            return datetime(parts[0], parts[1], parts[2]).date()
    except Exception:
        pass
    return None
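

# A few inputs the parser above accepts (illustrative): "2024-01-02", "1/2/2024",
# "02-Jan-2024", "2024-01-02T00:00:00"; anything unparseable falls through to None.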


def _to_int_or_none(v: Any) -> Optional[int]:
    if v is None:
        return None
    s = str(v).strip()
    if s == "":
        return None
    try:
        return int(float(s))  # tolerate "123.0"
    except Exception:
        return None


def import_csv_bytes(data: bytes, dry_run: bool = True) -> Dict[str, Any]:
    """
    Robust CSV importer for Entry.

    - Accepts your human-readable header (Subject, Illustration, ...)
      and/or direct model field names.
    - Normalizes odd headers like r."Talk Title".
    - Handles BOM & dialect sniffing.
    - Returns a report dict: {ok, created, updated, skipped, errors, preview, total_rows, header}
    """
    report: Dict[str, Any] = {
        "ok": False,
        "created": 0,
        "updated": 0,
        "skipped": 0,
        "errors": [],    # list[str]
        "preview": [],   # first ~10 rows that would be imported
        "total_rows": 0,
        "header": [],
    }

    # --- decode safely (remove BOM, keep unknowns) ---
    text = data.decode("utf-8-sig", errors="replace")

    # --- sniff dialect; fall back to excel ---
    try:
        sample = "\n".join(text.splitlines()[:10])
        dialect = csv.Sniffer().sniff(sample) if sample.strip() else csv.excel
    except Exception:
        dialect = csv.excel

    rdr = csv.reader(io.StringIO(text), dialect)

    try:
        raw_header = next(rdr, [])
    except Exception as e:
        report["errors"].append(f"Failed reading header: {e}")
        return report

    # Clean & map header. Keep positional alignment: unrecognized columns map
    # to None so later columns do not shift left.
    cleaned = [_clean_header_token(h) for h in raw_header]
    mapped: List[Optional[str]] = []
    unknowns: List[str] = []
    for token in cleaned:
        target = ACCEPTABLE_HEADERS.get(token)
        if target:
            mapped.append(target)
        else:
            mapped.append(None)
            unknowns.append(token or "(empty)")

    # If header doesn't match expected width but row count does, assume *no* header;
    # inject expected header so downstream works.
    has_header = True
    if unknowns:
        # Heuristic: if the number of columns equals EXPECTED_HEADERS and *none*
        # of the cleaned tokens map, it's probably a data row (no header)
        matches = sum(1 for t in cleaned if t in ACCEPTABLE_HEADERS)
        if matches == 0 and len(cleaned) == len(EXPECTED_HEADERS):
            # inject expected header and re-run
            has_header = False
            mapped = [HEADER_MAP[h] for h in EXPECTED_HEADERS]
            # rebuild a reader with the expected header injected
            sio = io.StringIO(text)
            rdr_tmp = csv.reader(sio, dialect)
            rows = list(rdr_tmp)
            rows.insert(0, EXPECTED_HEADERS)  # inject pretty header for report
            rdr = iter(rows)   # consume from this list iterator
            next(rdr, None)    # skip our injected header
        else:
            # keep going but warn in the report
            report["errors"].append(
                "Some header columns were not recognized: "
                + ", ".join(unknowns)
                + " (continuing with best-effort mapping)"
            )

    report["header"] = mapped

    # Read rows
    rows = list(rdr)
    report["total_rows"] = len(rows)

    # Build row dicts
    def row_to_obj(row_idx: int, row: List[str]) -> Tuple[Optional[Entry], Optional[Dict[str, Any]], Optional[str]]:
        """
        Returns (entry_instance_or_None, values_dict_or_None, error_message_or_None)
        but does not save to DB.
        """
        if len(row) < len(mapped):
            return None, None, f"Row {row_idx}: expected {len(mapped)} columns, found {len(row)}."
        values: Dict[str, Any] = {}
        for i, field in enumerate(mapped):
            if field is None:
                continue  # unrecognized column; ignore its value
            raw_val = row[i] if i < len(row) else ""
            # Coerce types for specific fields
            if field in ("date_added", "date_edited"):
                values[field] = _parse_date(raw_val)
            elif field == "talk_number":
                values[field] = _to_int_or_none(raw_val)
            else:
                values[field] = (raw_val or "").strip()

        # Create (unsaved) Entry instance for preview/validation
        e = Entry(**{k: v for k, v in values.items() if v not in (None, "")})
        return e, values, None

    # Preview first few
    for i, row in enumerate(rows[:10], start=1):
        e, values, err = row_to_obj(i, row)
        report["preview"].append({
            "row": i,
            "values": values if values else {},
            "error": err,
        })

    if dry_run:
        # Dry run: don't write, just validate basic structure
        bad = [p for p in report["preview"] if p["error"]]
        if bad:
            report["errors"].extend(p["error"] for p in bad if p["error"])
        report["ok"] = len(report["errors"]) == 0
        return report

    # Real import (create new rows).
    # If you want update/merge behavior, add a key strategy here.
    created = 0
    updated = 0
    skipped = 0
    errors: List[str] = []

    with transaction.atomic():
        for idx, row in enumerate(rows, start=1):
            e, values, err = row_to_obj(idx, row)
            if err:
                errors.append(err)
                skipped += 1
                continue

            try:
                # Simple create-only behavior:
                Entry.objects.create(**values)
                created += 1
            except Exception as ex:
                errors.append(f"Row {idx}: failed to save ({ex})")
                skipped += 1

    report.update({
        "ok": len(errors) == 0,
        "created": created,
        "updated": updated,
        "skipped": skipped,
        "errors": errors,
    })
    return report
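

# Sketch of how a view might call the importer (assumed calling code; the upload
# field name below is illustrative, not part of this module):
#
#   raw = request.FILES["csv_file"].read()            # bytes from an upload form
#   preview = import_csv_bytes(raw, dry_run=True)     # validate + build preview
#   if preview["ok"]:
#       result = import_csv_bytes(raw, dry_run=False)  # actually create Entry rows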


# small no-op context manager (defined for optional use; not currently referenced
# in this module)
class _noop_context:
    def __enter__(self): return self
    def __exit__(self, exc_type, exc, tb): return False