Update imports/cleaner.py

parent bd9af4fd6b
commit 54a5ecaf12

imports/cleaner.py
@@ -1,33 +1,22 @@
 #!/usr/bin/env python3
 """
-Clean and normalize `illustrations_seed.csv` -> `illustrations_clean.csv`.
+Content-preserving CSV sanitizer for 'illustrations_seed.csv' -> 'illustrations_clean.csv'.
 
-Key behaviors:
-- Robustly parse CSV with many commas and embedded quotes/newlines.
-- Prepass to normalize smart quotes/non-breaking spaces before parsing.
-- Quote ALL fields on output to guarantee importer-friendly CSV.
-- Subject: trim parts, drop empties, remove trailing comma, rejoin with ", ".
-- Scripture: remove trailing semicolons, normalize common book abbreviations.
-- Dates: accept flexible M/D/YY, M/D/YYYY, etc.; output ISO YYYY-MM-DD.
-- Talk Number / Code: numeric if possible; blank if invalid.
-- Write rejects to `illustrations_rejects.csv` with a reason, if any.
-- Post-write self-check: verifies column count of every row.
+What it does (and ONLY this by default):
+- Parses your CSV safely.
+- Writes a new CSV where EVERY FIELD is QUOTED and any inner " become "".
+- Keeps the exact text of every field (no trimming, no subject/scripture tweaking, no punctuation edits).
+- Keeps the exact column order expected by your importer.
 
-Usage:
-  python3 clean_illustrations_csv.py \
-    --in illustrations_seed.csv \
-    --out illustrations_clean.csv \
-    --rejects illustrations_rejects.csv
+Optional flag --normalize-dates will convert Date and Date Edited to ISO YYYY-MM-DD.
 """
 
 import argparse
 import csv
-import io
 import os
-import re
 from datetime import datetime
 
-HEADER = [
+EXPECTED_HEADER = [
     "Subject",
     "Illustration",
     "Application",
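
Note: the quoting guarantee promised in the new docstring is the csv module's own QUOTE_ALL behavior, not custom escaping. A minimal standalone sketch (not part of this commit) of what that output looks like:

    import csv
    import io

    buf = io.StringIO()
    w = csv.writer(buf, quoting=csv.QUOTE_ALL, lineterminator="\n")
    # Every field is wrapped in quotes; an embedded " is doubled, and commas
    # and newlines stay safely inside the field.
    w.writerow(['say "go" now', "a, b", "line1\nline2"])
    print(buf.getvalue())
    # "say ""go"" now","a, b","line1
    # line2"
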
@@ -40,280 +29,76 @@ HEADER = [
     "Date Edited",
 ]
 
-# Common scripture book abbreviation normalization.
-BOOK_MAP = {
-    # New Testament (common abbreviations)
-    "rom": "Romans",
-    "romans": "Romans",
-    "eph": "Ephesians",
-    "ephesians": "Ephesians",
-    "col": "Colossians",
-    "colossians": "Colossians",
-    "1 cor": "1 Corinthians",
-    "2 cor": "2 Corinthians",
-    "1 thess": "1 Thessalonians",
-    "2 thess": "2 Thessalonians",
-    "1 tim": "1 Timothy",
-    "2 tim": "2 Timothy",
-    "1 pet": "1 Peter",
-    "2 pet": "2 Peter",
-    "1 john": "1 John",
-    "2 john": "2 John",
-    "3 john": "3 John",
-    "heb": "Hebrews",
-    "rev": "Revelation",
-    "revelation": "Revelation",
-    "acts": "Acts",
-    "matt": "Matthew",
-    "mt": "Matthew",
-    "mark": "Mark",
-    "mk": "Mark",
-    "luke": "Luke",
-    "lk": "Luke",
-    "john": "John",
-    "jn": "John",
-    "jude": "Jude",
-    "phil": "Philippians",
-    "php": "Philippians",
-    "philem": "Philemon",
-    "titus": "Titus",
-    "gal": "Galatians",
-    "galatians": "Galatians",
-    "james": "James",
-    "jas": "James",
-
-    # Old Testament (examples + the ones in your sample)
-    "eccl": "Ecclesiastes",
-    "eccles": "Ecclesiastes",
-    "ecclesiastes": "Ecclesiastes",
-    "ps": "Psalms",
-    "psalm": "Psalms",
-    "psalms": "Psalms",
-    "prov": "Proverbs",
-    "proverbs": "Proverbs",
-    "gen": "Genesis",
-    "genesis": "Genesis",
-    "ex": "Exodus",
-    "exod": "Exodus",
-    "exodus": "Exodus",
-    "isa": "Isaiah",
-    "isaiah": "Isaiah",
-    "jer": "Jeremiah",
-    "jeremiah": "Jeremiah",
-    "dan": "Daniel",
-    "daniel": "Daniel",
-}
-
-DATE_FORMATS = [
-    "%m/%d/%y",
-    "%m/%d/%Y",
-    "%-m/%-d/%y",  # on Linux/mac
-    "%-m/%-d/%Y",
-    "%m/%-d/%y",
-    "%-m/%d/%y",
-    "%m/%-d/%Y",
-    "%-m/%d/%Y",
-]
-
-NBSP = "\u00A0"
-SMARTS = {
-    "\u201C": '"',  # left double
-    "\u201D": '"',  # right double
-    "\u2018": "'",  # left single
-    "\u2019": "'",  # right single / apostrophe
-    "\u00AB": '"',  # «
-    "\u00BB": '"',  # »
-}
-
-def pre_normalize_text(raw: str) -> str:
-    """Prepass to remove non-breaking spaces and normalize smart quotes."""
-    if raw is None:
-        return ""
-    s = str(raw).replace(NBSP, " ")
-    for k, v in SMARTS.items():
-        s = s.replace(k, v)
-    # Normalize any stray CR-only line endings
-    s = s.replace("\r\n", "\n").replace("\r", "\n")
-    return s
-
-def parse_date(value: str) -> str:
-    """Parse flexible US-style dates; return ISO YYYY-MM-DD or '' if empty/invalid."""
-    if not value:
-        return ""
-    v = value.strip()
+def parse_date_flex(v: str) -> str:
+    """Very permissive date parser; returns ISO or original string if parsing fails."""
+    v = (v or "").strip()
     if not v:
         return ""
-    # Common separators already handled; try multiple formats.
-    for fmt in DATE_FORMATS:
+    fmts = ["%m/%d/%y", "%m/%d/%Y", "%-m/%-d/%y", "%-m/%-d/%Y"]
+    for fmt in fmts:
         try:
-            dt = datetime.strptime(v, fmt)
-            # Heuristic for 2-digit years if needed (datetime handles 1900s/2000s defaults)
-            # We trust strptime here; user examples are 2000s.
-            return dt.strftime("%Y-%m-%d")
+            return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
         except Exception:
             pass
-    # Try to interpret like M/D/YY or M/D/YYYY with flexible spacing
-    m = re.match(r"^\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\s*$", v)
-    if m:
-        mth, day, yr = int(m.group(1)), int(m.group(2)), m.group(3)
-        if len(yr) == 2:
-            # Assume 20xx for 00-69, 19xx for 70-99 (datetime default logic)
-            year = 2000 + int(yr) if int(yr) <= 69 else 1900 + int(yr)
-        else:
-            year = int(yr)
-        try:
-            return datetime(year, mth, day).strftime("%Y-%m-%d")
-        except Exception:
-            return ""
-    return ""
-
-BOOK_RE = re.compile(r"^\s*([1-3]?\s*[A-Za-z\.]+)\s*(.*)$")
-
-def normalize_scripture(value: str) -> str:
-    """Normalize scripture strings: strip trailing semicolons/spaces, normalize book name if easily detectable."""
-    if not value:
-        return ""
-    s = value.strip()
-    # Remove trailing semicolons and excess punctuation/spaces.
-    s = re.sub(r"[;,\s]+$", "", s)
-
-    # Try to normalize the *first* book token if identifiable.
-    m = BOOK_RE.match(s)
-    if not m:
-        return s
-    book_raw, rest = m.group(1), m.group(2)
-
-    # normalize book key
-    key = book_raw.lower().replace(".", "")
-    key = re.sub(r"\s+", " ", key).strip()
-    # normalize ordinal spacing: "1cor" -> "1 cor"
-    key = re.sub(r"^([1-3])([a-z])", r"\1 \2", key)
-
-    book = BOOK_MAP.get(key, None)
-    if not book:
-        # Title-case fallback (basic)
-        book = " ".join(w.capitalize() for w in key.split())
-
-    rest = rest.strip()
-    # Normalize spacing in the chapter/verse segment (e.g., "14:13, 19")
-    rest = re.sub(r"\s*,\s*", ", ", rest)
-    rest = re.sub(r"\s*;\s*", "; ", rest)
-    rest = re.sub(r"\s+", " ", rest)
-
-    out = (book + (" " + rest if rest else "")).strip()
-    return out
-
-def clean_subject(value: str) -> str:
-    """Trim parts, drop empty entries, remove trailing commas/spaces, re-join with ', '."""
-    if not value:
-        return ""
-    # Strip external quotes handled by csv; here we just process content
-    s = value.strip()
-    # Split by comma, trim each token, drop empties
-    parts = [p.strip() for p in s.split(",")]
-    parts = [p for p in parts if p]  # drop empty tokens
-    # Re-join
-    return ", ".join(parts)
-
-def to_int_or_blank(value: str) -> str:
-    if value is None:
-        return ""
-    v = str(value).strip()
-    if v == "":
-        return ""
-    # Strip non-digit chars (but keep minus? not needed here)
-    v2 = re.sub(r"[^0-9-]+", "", v)
-    if v2 in ("", "-", "--"):
-        return ""
+    # last-ditch: split by / or -
     try:
-        int(v2)
-        return v2
+        parts = [p.strip() for p in v.replace("-", "/").split("/")]
+        if len(parts) == 3:
+            m, d, y = parts
+            y = int(y)
+            m = int(m)
+            d = int(d)
+            return datetime(y if y > 99 else (2000 + y if y <= 69 else 1900 + y), m, d).strftime("%Y-%m-%d")
     except Exception:
-        return ""
+        pass
+    return v  # preserve original if we can't confidently parse
 
-def normalize_row(row_dict, stats, rownum):
-    """Return (cleaned_row_dict, reject_reason_or_None)."""
-    clean = {}
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
+    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
+    ap.add_argument("--normalize-dates", action="store_true",
+                    help="If set, convert Date and Date Edited to ISO YYYY-MM-DD. Otherwise leave as-is.")
+    args = ap.parse_args()
 
-    # Subject
-    subject = row_dict.get("Subject", "")
-    subject = clean_subject(subject)
-    clean["Subject"] = subject
+    if not os.path.exists(args.in_path):
+        raise SystemExit(f"Input file not found: {args.in_path}")
 
-    # Illustration
-    ill = pre_normalize_text(row_dict.get("Illustration", "")).strip()
-    clean["Illustration"] = ill
-
-    # Application
-    app = pre_normalize_text(row_dict.get("Application", "")).strip()
-    clean["Application"] = app
-
-    # Scripture
-    scr = pre_normalize_text(row_dict.get("Scripture", "")).strip()
-    scr_norm = normalize_scripture(scr)
-    if scr and scr != scr_norm:
-        stats["scripture_changed"] += 1
-    clean["Scripture"] = scr_norm
-
-    # Source
-    src = pre_normalize_text(row_dict.get("Source", "")).strip()
-    clean["Source"] = src
-
-    # Talk Title
-    ttitle = pre_normalize_text(row_dict.get("Talk Title", "")).strip()
-    clean["Talk Title"] = ttitle
-
-    # Talk Number
-    tnum = to_int_or_blank(row_dict.get("Talk Number", ""))
-    if tnum == "" and str(row_dict.get("Talk Number", "")).strip() not in ("",):
-        stats["invalid_ints"] += 1
-    clean["Talk Number"] = tnum
-
-    # Code
-    code = to_int_or_blank(row_dict.get("Code", ""))
-    if code == "" and str(row_dict.get("Code", "")).strip() not in ("",):
-        stats["invalid_ints"] += 1
-    clean["Code"] = code
-
-    # Date
-    date_raw = pre_normalize_text(row_dict.get("Date", "")).strip()
-    date_norm = parse_date(date_raw)
-    if date_raw and not date_norm:
-        stats["invalid_dates"] += 1
-    elif date_norm:
-        stats["dates_normalized"] += 1
-    clean["Date"] = date_norm
-
-    # Date Edited
-    datee_raw = pre_normalize_text(row_dict.get("Date Edited", "")).strip()
-    datee_norm = parse_date(datee_raw)
-    if datee_raw and not datee_norm:
-        stats["invalid_dates"] += 1
-    elif datee_norm:
-        stats["dates_normalized"] += 1
-    clean["Date Edited"] = datee_norm
-
-    # Reject logic: If the row is completely empty across all known fields, skip.
-    if not any(clean.get(h, "").strip() for h in HEADER):
-        return clean, "empty_row"
-
-    return clean, None
-
-def read_with_prepass(path):
-    """Read entire file, pre-normalize text, then parse CSV via csv.DictReader."""
-    with open(path, "r", encoding="utf-8-sig", newline="") as f:
-        raw = f.read()
-    normalized = pre_normalize_text(raw)
-    buf = io.StringIO(normalized)
-    reader = csv.DictReader(buf, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)
-    return reader
-
-def write_csv(rows, out_path):
-    """Write rows (list of dicts) with QUOTE_ALL to ensure commas/newlines are safe."""
-    with open(out_path, "w", encoding="utf-8", newline="") as f:
-        writer = csv.DictWriter(
+    # Read using DictReader; accept whatever header is present but verify shape.
+    with open(args.in_path, "r", encoding="utf-8-sig", newline="") as f:
+        reader = csv.DictReader(
             f,
-            fieldnames=HEADER,
+            delimiter=",",
+            quotechar='"',
+            doublequote=True,
+            escapechar="\\",
+            strict=False,
+        )
+        input_header = reader.fieldnames or []
+        missing = [h for h in EXPECTED_HEADER if h not in input_header]
+        if missing:
+            print(f"[WARN] Input CSV missing columns: {missing}")
+            print(f"       Found columns: {input_header}")
+
+        rows_out = []
+        total = 0
+        for row in reader:
+            total += 1
+            # Build output row strictly in EXPECTED_HEADER order, preserving raw strings.
+            out = {h: (row.get(h, "") if row.get(h, "") is not None else "") for h in EXPECTED_HEADER}
+
+            # Optional date normalization (ONLY dates; no commas involved)
+            if args.normalize_dates:
+                for dh in ("Date", "Date Edited"):
+                    out[dh] = parse_date_flex(out.get(dh, ""))
+
+            rows_out.append(out)
+
+    # Write with QUOTE_ALL so commas/newlines/quotes never break columns.
+    with open(args.out_path, "w", encoding="utf-8", newline="") as f:
+        w = csv.DictWriter(
+            f,
+            fieldnames=EXPECTED_HEADER,
             delimiter=",",
             quotechar='"',
             quoting=csv.QUOTE_ALL,
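
Note on parse_date_flex above: datetime.strptime rejects the glibc-style "%-m" directives (they raise ValueError, which the except swallows), and "%m/%d/%y" already accepts single-digit months and days, so the strptime loop covers slash dates while the last-ditch branch mainly picks up dash-separated ones. Its two-digit years pivot at 69: 00-69 become 20xx, 70-99 become 19xx. A spot-check of that pivot, restated outside the script purely as an illustration:

    from datetime import datetime

    # Same year logic as parse_date_flex's fallback branch (illustrative restatement).
    def iso(m: str, d: str, y: str) -> str:
        yi = int(y)
        year = yi if yi > 99 else (2000 + yi if yi <= 69 else 1900 + yi)
        return datetime(year, int(m), int(d)).strftime("%Y-%m-%d")

    assert iso("3", "14", "07") == "2007-03-14"    # 07 -> 2007
    assert iso("3", "14", "70") == "1970-03-14"    # 70 -> 1970
    assert iso("3", "14", "1999") == "1999-03-14"  # 4-digit years pass through
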
@@ -321,113 +106,27 @@ def write_csv(rows, out_path):
             escapechar="\\",
             lineterminator="\n",
         )
-        writer.writeheader()
-        for r in rows:
-            writer.writerow({k: r.get(k, "") for k in HEADER})
+        w.writeheader()
+        for r in rows_out:
+            w.writerow(r)
 
-def self_check_csv(path):
-    """Verify column count on every row equals header length."""
+    # Quick self-check: re-read output and ensure fixed column count
     problems = []
-    with open(path, "r", encoding="utf-8", newline="") as f:
-        reader = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)
-        expected = None
-        rownum = 0
-        for row in reader:
-            rownum += 1
-            if rownum == 1:
-                expected = len(row)
+    with open(args.out_path, "r", encoding="utf-8", newline="") as f:
+        rdr = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)
+        expected_cols = None
+        for i, row in enumerate(rdr, start=1):
+            if i == 1:
+                expected_cols = len(row)
                 continue
-            if len(row) != expected:
-                problems.append((rownum, len(row), expected))
-    return problems
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
-    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
-    ap.add_argument("--rejects", dest="rejects_path", default="illustrations_rejects.csv")
-    args = ap.parse_args()
-
-    in_path = args.in_path
-    out_path = args.out_path
-    rejects_path = args.rejects_path
-
-    if not os.path.exists(in_path):
-        raise SystemExit(f"Input file not found: {in_path}")
-
-    stats = {
-        "total_rows": 0,
-        "written_rows": 0,
-        "reject_rows": 0,
-        "scripture_changed": 0,
-        "invalid_ints": 0,
-        "invalid_dates": 0,
-        "dates_normalized": 0,
-    }
-
-    rejects = []
-    cleaned_rows = []
-
-    reader = read_with_prepass(in_path)
-
-    # Validate header presence/shape
-    missing = [h for h in HEADER if h not in reader.fieldnames]
-    if missing:
-        raise SystemExit(f"Input CSV missing expected headers: {missing}\nFound headers: {reader.fieldnames}")
-
-    for idx, row in enumerate(reader, start=2):  # start=2 because header is line 1
-        stats["total_rows"] += 1
-        try:
-            cleaned, reason = normalize_row(row, stats, idx)
-            if reason:
-                stats["reject_rows"] += 1
-                r = {k: row.get(k, "") for k in HEADER}
-                r["reason"] = reason
-                rejects.append(r)
-                continue
-            cleaned_rows.append(cleaned)
-        except Exception as e:
-            stats["reject_rows"] += 1
-            r = {k: row.get(k, "") for k in HEADER}
-            r["reason"] = f"exception@row {idx}: {e}"
-            rejects.append(r)
-
-    # Write outputs
-    write_csv(cleaned_rows, out_path)
-
-    # Write rejects if any
-    if rejects:
-        # Ensure 'reason' is the last column for readability
-        rej_header = HEADER + ["reason"]
-        with open(rejects_path, "w", encoding="utf-8", newline="") as f:
-            writer = csv.DictWriter(
-                f,
-                fieldnames=rej_header,
-                delimiter=",",
-                quotechar='"',
-                quoting=csv.QUOTE_ALL,
-                doublequote=True,
-                escapechar="\\",
-                lineterminator="\n",
-            )
-            writer.writeheader()
-            for r in rejects:
-                writer.writerow({k: r.get(k, "") for k in rej_header})
-
-    # Self check the written CSV
-    problems = self_check_csv(out_path)
+            if len(row) != expected_cols:
+                problems.append((i, len(row), expected_cols))
 
-    # Summary
     print("=== Clean Summary ===")
-    print(f"Input rows (excluding header): {stats['total_rows']}")
-    print(f"Written rows: {stats['written_rows'] + len(cleaned_rows)}")
-    print(f"Reject rows: {stats['reject_rows']}")
-    print(f"Scripture normalized: {stats['scripture_changed']}")
-    print(f"Dates normalized: {stats['dates_normalized']}")
-    print(f"Invalid ints blanked: {stats['invalid_ints']}")
-    print(f"Invalid dates (left blank): {stats['invalid_dates']}")
+    print(f"Input rows (excluding header): {total}")
+    print(f"Written rows: {len(rows_out)}")
     if problems:
-        print("\nSelf-check found column count issues on these line numbers in the OUTPUT (1-based):")
+        print("\n[WARNING] Column count issues detected in OUTPUT:")
         for line_no, got, exp in problems:
             print(f"  line {line_no}: columns={got}, expected={exp}")
     else:
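
With this change the command line shrinks to two path flags plus the opt-in date switch; --rejects is gone along with the reject and normalization logic. A usage sketch, assuming the usual "if __name__ == '__main__': main()" guard survives below the shown hunks:

    # Default: quote-only pass; field text is preserved exactly.
    python3 imports/cleaner.py --in illustrations_seed.csv --out illustrations_clean.csv

    # Opt in to date normalization (Date and Date Edited -> YYYY-MM-DD).
    python3 imports/cleaner.py --normalize-dates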