# core/utils.py
from __future__ import annotations

import csv
import io
import re
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple, Iterable

from django.db import transaction, IntegrityError, DataError, DatabaseError

from .models import Entry


# ============================
# Search helpers (used by views)
# ============================

def terms(q: str) -> List[str]:
    """Split search query into terms; keep quoted phrases together."""
    if not q:
        return []
    out, buf, in_quote = [], [], False
    for ch in q:
        if ch == '"':
            in_quote = not in_quote
            continue
        if ch.isspace() and not in_quote:
            if buf:
                out.append("".join(buf))
                buf = []
        else:
            buf.append(ch)
    if buf:
        out.append("".join(buf))
    return out


def has_wildcards(s: str) -> bool:
    return bool(s) and ("*" in s or "?" in s)


def wildcard_to_regex(s: str) -> str:
    """
    Convert user wildcards to a Postgres-friendly regex:
    * -> .*   ? -> .    (escape regex meta first)
    """
    if s is None:
        return ""
    s = re.escape(s)
    s = s.replace(r"\*", ".*").replace(r"\?", ".")
    return f"^{s}$"


# ============================
# CSV import – robust version
# ============================

# Canonical header names we expect (case-insensitive on input):
CANON_HEADERS = [
    "subject", "illustration", "application", "scripture",
    "source", "talk title", "talk number", "code", "date", "date edited",
]
EXPECTED_COLS = len(CANON_HEADERS)

# Curly quotes & odd whitespace we normalize
QUOTE_MAP = {
    "\u201c": '"', "\u201d": '"',  # “ ”
    "\u2018": "'", "\u2019": "'",  # ‘ ’
}
CTRL_MAP = {
    "\x0b": " ",  # vertical tab
    "\x0c": " ",  # form feed
}


def _decode_bytes(b: bytes) -> str:
    """Decode bytes with utf-8-sig, normalize line endings and characters."""
    t = b.decode("utf-8-sig", errors="replace")
    # normalize curly quotes and control chars
    for k, v in QUOTE_MAP.items():
        t = t.replace(k, v)
    for k, v in CTRL_MAP.items():
        t = t.replace(k, v)
    # normalize newlines
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    return t


def _sniff_dialect(text: str) -> csv.Dialect:
    """Sniff CSV dialect or default to comma."""
    snippet = text[:4096]
    try:
        return csv.Sniffer().sniff(snippet, delimiters=[",", ";", "\t", "|"])
    except Exception:
        class D(csv.Dialect):
            delimiter = ","
            quotechar = '"'
            doublequote = True
            skipinitialspace = False
            lineterminator = "\n"
            quoting = csv.QUOTE_MINIMAL
        return D()


def _split_lenient(line: str, delimiter: str, expected: int) -> List[str]:
    """
    Split a CSV line manually, respecting quotes. Works even if the line
    contains inconsistent quoting (e.g., inner quotes not doubled).
    Ensures we return exactly `expected` fields by merging overflow cells
    into the current text field (before the trailing short/meta columns).
    """
    out, field = [], []
    in_quotes = False
    i, n = 0, len(line)
    while i < n:
        ch = line[i]
        if ch == '"':
            # doubled quote inside a quoted field -> literal quote
            if in_quotes and i + 1 < n and line[i + 1] == '"':
                field.append('"')
                i += 2
                continue
            in_quotes = not in_quotes
            i += 1
            continue
        if ch == delimiter and not in_quotes:
            out.append("".join(field))
            field = []
            i += 1
            continue
        field.append(ch)
        i += 1
    out.append("".join(field))

    # Repair count to exactly `expected`
    if len(out) < expected:
        out += [""] * (expected - len(out))
    elif len(out) > expected:
        head = out[:expected - 1]
        tail = out[expected - 1:]
        head[-1] = head[-1] + delimiter + delimiter.join(tail)
        out = head

    return out


def _build_header_map(headers: List[str]) -> Dict[str, str]:
    """
    Map incoming headers (any case) to our canonical keys.
    """
    key = {h.lower().strip(): h for h in headers}
    mapping: Dict[str, Optional[str]] = {}
    for canon in CANON_HEADERS:
        if canon in key:
            mapping[canon] = key[canon]
        else:
            aliases = {
                "talk title": ["talk_title", "title"],
                "talk number": ["talk_no", "talk#", "talknum"],
                "date edited": ["edited", "date_edited", "edited date"],
            }.get(canon, [])
            found = next((a for a in aliases if a in key), None)
            mapping[canon] = key.get(found) if found else None
    return mapping  # type: ignore[return-value]


def _getv(row: Dict[str, str], header_map: Dict[str, str], canon_key: str) -> str:
    src = header_map.get(canon_key)
    return (row.get(src) if src else "") or ""


def _parse_date(val: str):
    val = (val or "").strip()
    if not val:
        return None
    # Common formats: m/d/Y, Y-m-d (also tolerate single-digit m/d on Linux)
    for fmt in ("%m/%d/%Y", "%-m/%-d/%Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(val, fmt).date()
        except Exception:
            pass
    # Fallback to dateutil if present
    try:
        from dateutil import parser  # type: ignore
        return parser.parse(val).date()
    except Exception:
        return None


def _clip(field_name: str, value: str) -> str:
    """
    Clip to model field's max_length if needed, to avoid DB DataError.
    """
    try:
        f = Entry._meta.get_field(field_name)
        max_len = getattr(f, "max_length", None)
        if max_len and value and len(value) > max_len:
            return value[:max_len]
    except Exception:
        pass
    return value


def _coerce_int(val: str):
    val = (val or "").strip()
    if not val:
        return None
    m = re.search(r"(-?\d+)", val.replace(",", ""))
    if not m:
        return None
    try:
        return int(m.group(1))
    except Exception:
        return None


def import_csv_bytes(b: bytes, dry_run: bool = False, commit_every: int = 500) -> Dict[str, object]:
    """
    Robust CSV import. Commits each row in its own transaction so that one bad
    row does not poison the entire import (avoids TransactionManagementError cascades).

    Returns a report dict with counts and first-line error messages.
    """
    text = _decode_bytes(b)
    dialect = _sniff_dialect(text)
    delimiter = getattr(dialect, "delimiter", ",")

    # --- headers ---
    f = io.StringIO(text)
    reader = csv.reader(f, dialect=dialect)
    try:
        raw_headers = next(reader)
    except StopIteration:
        return {
            "rows": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": [],
            "scripture_parsed": 0, "scripture_failed": 0,
            "dialect_delimiter": delimiter, "used_headerless_mode": False,
            "seen_headers": []
        }

    headers = raw_headers if len(raw_headers) == EXPECTED_COLS else _split_lenient(
        ",".join(raw_headers), delimiter=delimiter, expected=EXPECTED_COLS
    )
    header_map = _build_header_map(headers)

    # Pair raw lines so we can repair rows mis-split by csv
    raw_lines = text.splitlines()[1:]  # skip header

    dict_reader = csv.DictReader(io.StringIO(text), fieldnames=headers, dialect=dialect)
    next(dict_reader, None)  # skip header

    total = inserted = updated = skipped = 0
    errors: List[str] = []
    scripture_ok = scripture_bad = 0

    # Import loop (row-by-row atomic)
    for idx, (raw_line, row) in enumerate(zip(raw_lines, dict_reader), start=2):
        total += 1

        # Repair if DictReader got the wrong shape (inconsistent quotes in source)
        if len(row) != EXPECTED_COLS or None in row:
            cells = _split_lenient(raw_line, delimiter=delimiter, expected=EXPECTED_COLS)
            row = dict(zip(headers, cells))

        # Extract canonical fields
        subject      = _getv(row, header_map, "subject").strip()
        illustration = _getv(row, header_map, "illustration").strip()
        application  = _getv(row, header_map, "application").strip()
        scripture    = _getv(row, header_map, "scripture").strip()
        source       = _getv(row, header_map, "source").strip()
        talk_title   = _getv(row, header_map, "talk title").strip()
        talk_number  = _coerce_int(_getv(row, header_map, "talk number"))
        entry_code   = _getv(row, header_map, "code").strip()
        date_added   = _parse_date(_getv(row, header_map, "date"))
        date_edited  = _parse_date(_getv(row, header_map, "date edited"))

        # Skip rows with no meaningful text
        if not (subject or illustration or application):
            skipped += 1
            continue

        # Clip to DB lengths
        subject      = _clip("subject", subject)
        illustration = _clip("illustration", illustration)
        application  = _clip("application", application)
        scripture    = _clip("scripture_raw", scripture)
        source       = _clip("source", source)
        talk_title   = _clip("talk_title", talk_title)
        entry_code   = _clip("entry_code", entry_code)

        scripture_ok += 1 if scripture else 0
        scripture_bad += 0 if scripture else 1

        # Upsert key: prefer entry_code; else (subject + illustration)
        lookup: Dict[str, object] = {}
        if entry_code:
            lookup["entry_code"] = entry_code
        else:
            lookup["subject"] = subject
            lookup["illustration"] = illustration

        if dry_run:
            exists = Entry.objects.filter(**lookup).exists()
            inserted += 0 if exists else 1
            updated  += 1 if exists else 0
            continue

        try:
            # Isolate each row so a failure rolls back only that row
            with transaction.atomic():
                obj = Entry.objects.filter(**lookup).first()
                created = False
                if not obj:
                    obj = Entry(**lookup)
                    created = True

                obj.subject       = subject
                obj.illustration  = illustration
                obj.application   = application
                obj.scripture_raw = scripture
                obj.source        = source
                obj.talk_title    = talk_title
                obj.talk_number   = talk_number
                if entry_code:
                    obj.entry_code = entry_code
                if date_added:
                    obj.date_added = date_added
                if date_edited:
                    obj.date_edited = date_edited

                obj.save()

                inserted += 1 if created else 0
                updated  += 0 if created else 1

        except (IntegrityError, DataError, DatabaseError, ValueError) as e:
            msg = str(e).splitlines()[0]
            errors.append(f"line {idx}: {type(e).__name__}: {msg}")
            skipped += 1
            # continue to next row

    return {
        "rows": total,
        "inserted": inserted,
        "updated": updated,
        "skipped": skipped,
        "errors": errors,
        "scripture_parsed": scripture_ok,
        "scripture_failed": scripture_bad,
        "dialect_delimiter": delimiter,
        "used_headerless_mode": False,
        "seen_headers": headers,
    }

    EXPECTED_HEADERS = [
    "Subject","Illustration","Application","Scripture","Source",
    "Talk Title","Talk Number","Code","Date","Date Edited"
]

def _to_int_or_none(s: str) -> Optional[int]:
    s = (s or "").strip()
    if not s:
        return None
    try:
        return int(s)
    except Exception:
        return None

def _to_date_or_none(s: str) -> Optional[datetime.date]:
    s = (s or "").strip()
    if not s:
        return None
    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%m/%d/%y"):
        try:
            return datetime.datetime.strptime(s, fmt).date()
        except Exception:
            pass
    return None  # let caller decide if this is acceptable

def import_csv_bytes(data: bytes, dry_run: bool = True, batch_size: int = 1000) -> Dict[str, Any]:
    """
    Robust CSV importer for Entries.
    - data: raw bytes of the uploaded file
    - dry_run: when True, do not write to DB; return preview + errors
    - batch_size: bulk_create chunk size
    Returns: dict(report=..., rows=preview_rows, errors=[...])
    """
    text = io.TextIOWrapper(io.BytesIO(data), encoding="utf-8-sig", newline="")
    reader = csv.reader(text)

    # Read header row
    try:
        header = next(reader)
    except StopIteration:
        return {"report": "Empty file.", "rows": [], "errors": ["File is empty."]}

    # Loose header check: either exact match, or map by index if close
    header_norm = [h.strip() for h in header]
    if header_norm != EXPECTED_HEADERS:
        return {
            "report": "Header mismatch.",
            "rows": [],
            "errors": [
                "Expected header: " + ", ".join(EXPECTED_HEADERS),
                "Found header: " + ", ".join(header_norm),
            ],
        }

    to_create: List[Entry] = []
    errors: List[str] = []
    preview: List[Tuple[int, Dict[str, Any]]] = []  # first 100 rows for the UI
    rownum = 1

    def make_entry(row: List[str]) -> Optional[Entry]:
        # force length to 10, padding if needed
        padded = row + [""] * (10 - len(row))
        subj, ill, app, scr, src, talk_title, talk_num, code, d_added, d_edited = padded[:10]

        e = Entry(
            subject=(subj or "").strip(),
            illustration=(ill or "").strip(),
            application=(app or "").strip(),
            scripture_raw=(scr or "").strip(),
            source=(src or "").strip(),
            talk_title=(talk_title or "").strip(),
            talk_number=_to_int_or_none(talk_num),
            entry_code=(code or "").strip(),
            date_added=_to_date_or_none(d_added),
            date_edited=_to_date_or_none(d_edited),
        )
        return e

    created_total = 0
    with (transaction.atomic() if not dry_run else _noop_context()):
        for row in reader:
            rownum += 1
            try:
                e = make_entry(row)
                # (optional) add required-field checks; e.g., at least one of illustration/application
                if not ((e.illustration and e.illustration.strip()) or (e.application and e.application.strip())):
                    errors.append(f"Row {rownum}: missing Illustration and Application")
                    continue

                to_create.append(e)

                if len(preview) < 100:
                    preview.append((rownum, {
                        "Subject": e.subject, "Illustration": e.illustration[:120],
                        "Application": e.application[:120], "Scripture": e.scripture_raw,
                        "Source": e.source, "Talk Title": e.talk_title,
                        "Talk Number": e.talk_number, "Code": e.entry_code,
                        "Date": e.date_added, "Date Edited": e.date_edited,
                    }))

                if not dry_run and len(to_create) >= batch_size:
                    Entry.objects.bulk_create(to_create, batch_size=batch_size)
                    created_total += len(to_create)
                    to_create.clear()

            except Exception as ex:
                errors.append(f"Row {rownum}: {ex}")

        if not dry_run and to_create:
            Entry.objects.bulk_create(to_create, batch_size=batch_size)
            created_total += len(to_create)
            to_create.clear()

    report = f"{'Would import' if dry_run else 'Imported'} {created_total if not dry_run else len(preview)}+ rows."
    return {"report": report, "rows": preview, "errors": errors}

# small context manager used above
class _noop_context:
    def __enter__(self): return self
    def __exit__(self, exc_type, exc, tb): return False