Update web/core/utils.py

This commit is contained in:
2025-08-13 05:42:08 +00:00
parent 70d12b776a
commit ec6be70805
+77 -31
View File
@@ -1,18 +1,62 @@
import csv import csv
import io import io
import re import re
from dateutil import parser as dateparser
from datetime import date from datetime import date
from dateutil import parser as dateparser
from core.models import Entry, ScriptureRef from core.models import Entry, ScriptureRef
# ----------------------------
# Search helpers (needed by views)
# ----------------------------
# Tokenizer pattern: a double-quoted phrase (captured without its quotes in
# group 1) or, failing that, a run of non-whitespace characters (group 2).
_QUOTED_OR_WORD = re.compile(r'"([^"]+)"|(\S+)')


def terms(q: str):
    """
    Split a search query into a list of tokens.

    Double-quoted phrases are kept together as one token (quotes removed);
    everything else is split on whitespace. Each token is stripped, and
    tokens that end up empty are dropped. A falsy ``q`` (``None`` or ``""``)
    gives an empty list.
    """
    candidates = (
        (match.group(1) or match.group(2) or "").strip()
        for match in _QUOTED_OR_WORD.finditer(q or "")
    )
    return [candidate for candidate in candidates if candidate]
def has_wildcards(token: str) -> bool:
    """Return True when ``token`` contains a ``*`` or ``?`` wildcard character."""
    return any(mark in token for mark in ("*", "?"))
def wildcard_to_regex(token: str) -> str:
    """
    Convert a user wildcard token to an escaped regex fragment.

        *  ->  .*
        ?  ->  .

    Every other character is passed through ``re.escape``, so regex
    metacharacters typed by the user are matched literally. The original
    docstring describes the result as suitable for Django ``__iregex``.

    The token is translated one character at a time instead of swapping the
    wildcards for placeholder words before escaping: with placeholders, a
    token that happened to contain the placeholder text itself would be
    rewritten into a wildcard.
    """
    wildcard_patterns = {"*": ".*", "?": "."}
    return "".join(wildcard_patterns.get(ch) or re.escape(ch) for ch in token)
# ----------------------------
# Scripture parsing (minimal, non-blocking)
# ----------------------------
def parse_scripture(scripture_str: str):
    """
    Placeholder scripture parser.

    Wraps the raw reference string in a single-item list of the form
    ``[{"raw": scripture_str}]``; a falsy input (``None`` or ``""``) gives an
    empty list. Meant to be swapped for a richer parser later.
    """
    return [{"raw": scripture_str}] if scripture_str else []
# ----------------------------
# CSV import (robust)
# ----------------------------
EXPECTED_HEADERS = [h.lower() for h in [ EXPECTED_HEADERS = [h.lower() for h in [
"Subject","Illustration","Application","Scripture","Source", "Subject","Illustration","Application","Scripture","Source",
"Talk Title","Talk Number","Code","Date","Date Edited" "Talk Title","Talk Number","Code","Date","Date Edited"
]] ]]
def _sniff(text: str): def _sniff(text: str):
sample = text[:8192] sample = text[:8192]
try: try:
@@ -29,51 +73,49 @@ def _sniff(text: str):
dialect = _Simple dialect = _Simple
return dialect return dialect
def _as_dictreader(text: str, dialect, fieldnames=None): def _as_dictreader(text: str, dialect, fieldnames=None):
"""
Yield rows as dicts. If fieldnames are provided, treat file as headerless.
We also peek one row: if it looks like an actual header row, we skip it.
"""
sio = io.StringIO(text) sio = io.StringIO(text)
if fieldnames is None: if fieldnames is None:
reader = csv.DictReader(sio, dialect=dialect) reader = csv.DictReader(sio, dialect=dialect)
else:
reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
first = next(reader, None)
if first is not None:
matches = sum(1 for k, v in first.items() if (v or "").strip().lower() == k.strip().lower())
if matches < 5:
yield first
for row in reader: for row in reader:
yield row yield row
return return
# Headerless mode
reader = csv.DictReader(sio, dialect=dialect, fieldnames=fieldnames)
first = next(reader, None)
if first is not None:
# If many columns equal their header names, it's probably a header row
matches = sum(1 for k, v in first.items() if (v or "").strip().lower() == k.strip().lower())
if matches < 5:
# Not a header row, yield it
yield first
for row in reader: for row in reader:
yield row yield row
def import_csv_bytes(b: bytes, dry_run: bool = True):
def parse_scripture(scripture_str):
"""
Placeholder scripture parser — adjust as needed.
"""
if not scripture_str:
return []
# Very basic parsing, could be replaced with real logic
return [{"raw": scripture_str}]
def import_csv_bytes(b: bytes, dry_run=True):
""" """
Robust import: Robust import:
- Auto-detect delimiter (comma/semicolon/tab/pipe). - Auto-detect delimiter (comma/semicolon/tab/pipe).
- If required headers are missing, re-parse treating file as *headerless* - If required headers are missing, re-parse treating file as *headerless*
using the canonical column order. using the canonical column order.
- Upsert by Code; skip rows that are entirely empty. - Skip fully empty rows.
- Upsert by Code (if Code present), else insert.
Returns a report dict with counts and diagnostics.
""" """
text = b.decode("utf-8-sig", errors="replace") text = b.decode("utf-8-sig", errors="replace")
dialect = _sniff(text) dialect = _sniff(text)
# First attempt: use file-provided headers
reader1 = csv.DictReader(io.StringIO(text), dialect=dialect) reader1 = csv.DictReader(io.StringIO(text), dialect=dialect)
headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])] headers1 = [(h or "").strip().lower() for h in (reader1.fieldnames or [])]
used_headerless = False used_headerless = False
if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5: if not headers1 or sum(h in EXPECTED_HEADERS for h in headers1) < 5:
# Not enough expected headers -> treat as headerless/positional
used_headerless = True used_headerless = True
rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS) rows_iter = _as_dictreader(text, dialect, fieldnames=EXPECTED_HEADERS)
else: else:
@@ -116,6 +158,7 @@ def import_csv_bytes(b: bytes, dry_run=True):
except Exception: except Exception:
tnum = None tnum = None
# Skip rows that are completely empty across all tracked fields
if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]): if not any([subj, illu, appl, scr, src, tt, code, tnum, dadd, ded]):
report["skipped"] += 1 report["skipped"] += 1
continue continue
@@ -126,12 +169,11 @@ def import_csv_bytes(b: bytes, dry_run=True):
talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded talk_title=tt, entry_code=code, date_added=dadd, date_edited=ded
) )
parsed = parse_scripture(scr) # Scripture parse diagnostics
for it in parsed: parsed_list = parse_scripture(scr)
if it: for it in parsed_list:
report["scripture_parsed"] += 1 if it: report["scripture_parsed"] += 1
else: else: report["scripture_failed"] += 1
report["scripture_failed"] += 1
if dry_run: if dry_run:
continue continue
@@ -153,8 +195,12 @@ def import_csv_bytes(b: bytes, dry_run=True):
obj = Entry.objects.create(**data) obj = Entry.objects.create(**data)
report["inserted"] += 1 report["inserted"] += 1
for it in parsed: for it in parsed_list:
if it: if it and isinstance(it, dict) and "raw" in it:
# Keep raw-only ref optional; skip creating ScriptureRef if schema differs
pass
elif it:
# If you switch to a structured parser, create records like:
ScriptureRef.objects.create(entry=obj, **it) ScriptureRef.objects.create(entry=obj, **it)
except Exception as e: except Exception as e: