Update web/core/utils.py
This commit is contained in:
+84
-38
@@ -1,18 +1,62 @@
|
|||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
from dateutil import parser as dateparser
|
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
from dateutil import parser as dateparser
|
||||||
|
|
||||||
from core.models import Entry, ScriptureRef
|
from core.models import Entry, ScriptureRef
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Search helpers (needed by views)
|
||||||
|
# ----------------------------
|
||||||
|
# Split query into tokens while preserving quoted phrases
_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')


def terms(q: str) -> list:
    """Split a search query into tokens, keeping quoted phrases together.

    A double-quoted run (``"like this"``) yields a single token without the
    quotes; any other run of non-whitespace characters yields one token.
    Tokens that are empty after stripping whitespace are dropped.

    Args:
        q: Raw query text. ``None`` or an empty string yields an empty list.

    Returns:
        The tokens, in the order they appear in ``q``.
    """
    tokens = []
    for match in _QUOTED_OR_WORD.finditer(q or ""):
        # Group 1 is the inside of a quoted phrase, group 2 is a bare word.
        token = (match.group(1) or match.group(2) or "").strip()
        if token:
            tokens.append(token)
    return tokens
|
||||||
|
|
||||||
|
def has_wildcards(token: str) -> bool:
    """Return True if ``token`` contains a ``*`` or ``?`` wildcard character."""
    return "*" in token or "?" in token
|
||||||
|
|
||||||
|
def wildcard_to_regex(token: str) -> str:
    """
    Convert user wildcard token to a safe regex:
      * -> .*
      ? -> .
    Everything else is escaped. Suitable for Django __iregex.

    The token is converted one character at a time instead of going through
    textual placeholders, so input that happens to contain a placeholder-like
    string (e.g. a literal ``__STAR__``) is kept as literal text rather than
    being turned into a wildcard.

    Args:
        token: A single search token, possibly containing ``*`` / ``?``.

    Returns:
        The regex source string (not compiled, not anchored).
    """
    pieces = []
    for ch in token:
        if ch == "*":
            pieces.append(".*")
        elif ch == "?":
            pieces.append(".")
        else:
            # re.escape works per character, so escaping each one separately
            # gives the same text as escaping the whole literal run.
            pieces.append(re.escape(ch))
    return "".join(pieces)
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Scripture parsing (minimal, non-blocking)
|
||||||
|
# ----------------------------
|
||||||
|
def parse_scripture(scripture_str: str):
    """
    Minimal placeholder: keep as a list with raw string so imports never fail.
    Replace with your richer parser when ready.

    Args:
        scripture_str: Scripture text as given; may be empty or ``None``.

    Returns:
        ``[]`` when ``scripture_str`` is falsy, otherwise a one-item list
        ``[{"raw": scripture_str}]`` holding the text unchanged.
    """
    if not scripture_str:
        return []
    return [{"raw": scripture_str}]
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# CSV import (robust)
|
||||||
|
# ----------------------------
|
||||||
# Canonical CSV column names in their expected order, lower-cased so they can
# be compared against lower-cased header cells and used as positional field
# names when a file is parsed as headerless.
EXPECTED_HEADERS = [h.lower() for h in [
    "Subject", "Illustration", "Application", "Scripture", "Source",
    "Talk Title", "Talk Number", "Code", "Date", "Date Edited",
]]
|
||||||
|
|
||||||
|
|
||||||
def _sniff(text: str):
|
def _sniff(text: str):
|
||||||
sample = text[:8192]
|
sample = text[:8192]
|
||||||
try:
|
try:
|
||||||
@@ -29,51 +73,49 @@ def _sniff(text: str):
|
|||||||
dialect = _Simple
|
dialect = _Simple
|
||||||
return dialect
|
return dialect
|
||||||
|
|
||||||
|
|
||||||
def _as_dictreader(text: str, dialect, fieldnames=None):
    """
    Yield rows as dicts. If fieldnames are provided, treat file as headerless.
    We also peek one row: if it looks like an actual header row, we skip it.

    Args:
        text: Full CSV content as a string.
        dialect: Dialect passed straight through to ``csv.DictReader``.
        fieldnames: Column names to use as dict keys. When ``None`` the
            first line of ``text`` supplies the keys.

    Yields:
        One dict per data row, as produced by ``csv.DictReader``.
    """
    sio = io.StringIO(text)
    if fieldnames is None:
        yield from csv.DictReader(sio, dialect=dialect)
        return

    # Headerless mode
    reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
    first = next(reader, None)
    if first is not None:
        # If many columns equal their header names, it's probably a header row.
        # DictReader stores surplus cells as a list under the key None when a
        # row is longer than fieldnames; skip that bucket, since neither the
        # key nor the list value supports .strip().
        matches = sum(
            1
            for key, value in first.items()
            if key is not None
            and (value or "").strip().lower() == key.strip().lower()
        )
        if matches < 5:
            # Not a header row, yield it
            yield first
    yield from reader
|
||||||
|
|
||||||
|
def import_csv_bytes(b: bytes, dry_run: bool = True):
|
||||||
def parse_scripture(scripture_str):
|
|
||||||
"""
|
|
||||||
Placeholder scripture parser — adjust as needed.
|
|
||||||
"""
|
|
||||||
if not scripture_str:
|
|
||||||
return []
|
|
||||||
# Very basic parsing, could be replaced with real logic
|
|
||||||
return [{"raw": scripture_str}]
|
|
||||||
|
|
||||||
|
|
||||||
def import_csv_bytes(b: bytes, dry_run=True):
|
|
||||||
"""
|
"""
|
||||||
Robust import:
|
Robust import:
|
||||||
- Auto-detect delimiter (comma/semicolon/tab/pipe).
|
- Auto-detect delimiter (comma/semicolon/tab/pipe).
|
||||||
- If required headers are missing, re-parse treating file as *headerless*
|
- If required headers are missing, re-parse treating file as *headerless*
|
||||||
using the canonical column order.
|
using the canonical column order.
|
||||||
- Upsert by Code; skip rows that are entirely empty.
|
- Skip fully empty rows.
|
||||||
|
- Upsert by Code (if Code present), else insert.
|
||||||
|
Returns a report dict with counts and diagnostics.
|
||||||
"""
|
"""
|
||||||
text = b.decode("utf-8-sig", errors="replace")
|
text = b.decode("utf-8-sig", errors="replace")
|
||||||
dialect = _sniff(text)
|
dialect = _sniff(text)
|
||||||
|
|
||||||
|
# First attempt: use file-provided headers
|
||||||
reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
|
reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
|
||||||
headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]
|
headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]
|
||||||
|
|
||||||
used_headerless = False
|
used_headerless = False
|
||||||
if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
|
if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
|
||||||
|
# Not enough expected headers -> treat as headerless/positional
|
||||||
used_headerless = True
|
used_headerless = True
|
||||||
rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
|
rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
|
||||||
else:
|
else:
|
||||||
@@ -103,19 +145,20 @@ def import_csv_bytes(b: bytes, dry_run=True):
|
|||||||
subj = (row_lc.get("subject") or "").strip()
|
subj = (row_lc.get("subject") or "").strip()
|
||||||
illu = (row_lc.get("illustration") or "").strip()
|
illu = (row_lc.get("illustration") or "").strip()
|
||||||
appl = (row_lc.get("application") or "").strip()
|
appl = (row_lc.get("application") or "").strip()
|
||||||
scr = (row_lc.get("scripture") or "").strip()
|
scr = (row_lc.get("scripture") or "").strip()
|
||||||
src = (row_lc.get("source") or "").strip()
|
src = (row_lc.get("source") or "").strip()
|
||||||
tt = (row_lc.get("talk title") or "").strip()
|
tt = (row_lc.get("talk title") or "").strip()
|
||||||
tnum = (row_lc.get("talk number") or "").strip()
|
tnum = (row_lc.get("talk number") or "").strip()
|
||||||
code = (row_lc.get("code") or "").strip()
|
code = (row_lc.get("code") or "").strip()
|
||||||
dadd = parse_date_safe(row_lc.get("date"))
|
dadd = parse_date_safe(row_lc.get("date"))
|
||||||
ded = parse_date_safe(row_lc.get("date edited"))
|
ded = parse_date_safe(row_lc.get("date edited"))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tnum = int(tnum) if tnum else None
|
tnum = int(tnum) if tnum else None
|
||||||
except Exception:
|
except Exception:
|
||||||
tnum = None
|
tnum = None
|
||||||
|
|
||||||
|
# Skip rows that are completely empty across all tracked fields
|
||||||
if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
|
if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
|
||||||
report["skipped"] += 1
|
report["skipped"] += 1
|
||||||
continue
|
continue
|
||||||
@@ -126,12 +169,11 @@ def import_csv_bytes(b: bytes, dry_run=True):
|
|||||||
talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
|
talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
|
||||||
)
|
)
|
||||||
|
|
||||||
parsed = parse_scripture(scr)
|
# Scripture parse diagnostics
|
||||||
for it in parsed:
|
parsed_list = parse_scripture(scr)
|
||||||
if it:
|
for it in parsed_list:
|
||||||
report["scripture_parsed"] += 1
|
if it: report["scripture_parsed"] += 1
|
||||||
else:
|
else: report["scripture_failed"] += 1
|
||||||
report["scripture_failed"] += 1
|
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
continue
|
continue
|
||||||
@@ -153,8 +195,12 @@ def import_csv_bytes(b: bytes, dry_run=True):
|
|||||||
obj = Entry.objects.create(**data)
|
obj = Entry.objects.create(**data)
|
||||||
report["inserted"] += 1
|
report["inserted"] += 1
|
||||||
|
|
||||||
for it in parsed:
|
for it in parsed_list:
|
||||||
if it:
|
if it and isinstance(it, dict) and "raw" in it:
|
||||||
|
# Keep raw-only ref optional; skip creating ScriptureRef if schema differs
|
||||||
|
pass
|
||||||
|
elif it:
|
||||||
|
# If you switch to a structured parser, create records like:
|
||||||
ScriptureRef.objects.create(entry=obj, **it)
|
ScriptureRef.objects.create(entry=obj, **it)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user