#!/usr/bin/env python3
"""
Content-preserving CSV sanitizer for 'illustrations_seed.csv' -> 'illustrations_clean.csv'.

What it does (and ONLY this by default):
- Parses your CSV safely.
- Writes a new CSV where EVERY FIELD is QUOTED and any inner " become "".
- Keeps the exact text of every field (no trimming, no subject/scripture
  tweaking, no punctuation edits).
- Keeps the exact column order expected by your importer.

Optional flag --normalize-dates will convert Date and Date Edited to ISO YYYY-MM-DD.
"""

import argparse
import csv
import os
from datetime import datetime

# Column order the downstream importer expects; output is always written in
# exactly this order regardless of input column order.
EXPECTED_HEADER = [
    "Subject",
    "Illustration",
    "Application",
    "Scripture",
    "Source",
    "Talk Title",
    "Talk Number",
    "Code",
    "Date",
    "Date Edited",
]


def parse_date_flex(v: str) -> str:
    """Very permissive date parser.

    Returns an ISO ``YYYY-MM-DD`` string, or the original (stripped) string
    if the value cannot be confidently parsed.  ``None``/empty input yields "".
    """
    v = (v or "").strip()
    if not v:
        return ""
    # NOTE: the original format list also contained "%-m/%-d/%y" variants, but
    # "%-m" is not a valid strptime directive in CPython (it always raises),
    # and %m/%d already accept non-zero-padded numbers -- those entries were
    # dead code and have been removed.  "%Y-%m-%d" is included so values that
    # are already ISO round-trip cleanly.
    for fmt in ("%m/%d/%y", "%m/%d/%Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
        except ValueError:
            pass
    # Last-ditch effort: treat '/' and '-' as interchangeable separators and
    # interpret the pieces as month/day/year.
    try:
        parts = [p.strip() for p in v.replace("-", "/").split("/")]
        if len(parts) == 3:
            m, d, y = (int(p) for p in parts)
            if y <= 99:
                # Two-digit year: use the same pivot as strptime's %y
                # (00-68 -> 2000s, 69-99 -> 1900s) for consistency with the
                # formats tried above.  The previous pivot of 69 mapped "69"
                # to 2069 while %y maps it to 1969.
                y = 2000 + y if y <= 68 else 1900 + y
            return datetime(y, m, d).strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        pass
    return v  # preserve original if we can't confidently parse


def _self_check(path: str) -> list:
    """Re-read *path* and return a list of (line_no, got_cols, expected_cols)
    for every data row whose column count differs from the header row's."""
    problems = []
    with open(path, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, strict=False)
        expected_cols = None
        for i, row in enumerate(rdr, start=1):
            if i == 1:
                expected_cols = len(row)  # header defines the expected width
            elif len(row) != expected_cols:
                problems.append((i, len(row), expected_cols))
    return problems


def main():
    """CLI entry point: read the seed CSV, re-quote every field, self-check output."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
    ap.add_argument(
        "--normalize-dates",
        action="store_true",
        help="If set, convert Date and Date Edited to ISO YYYY-MM-DD. Otherwise leave as-is.",
    )
    args = ap.parse_args()

    if not os.path.exists(args.in_path):
        raise SystemExit(f"Input file not found: {args.in_path}")

    # Read using DictReader; accept whatever header is present but verify shape.
    # NOTE: escapechar is deliberately NOT set (the original passed
    # escapechar="\\").  With doublequote=True the standard dialect already
    # handles embedded quotes, and a backslash escapechar makes the reader
    # silently consume literal backslashes in field text -- which breaks the
    # "keep the exact text of every field" guarantee.
    with open(args.in_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(
            f,
            delimiter=",",
            quotechar='"',
            doublequote=True,
            strict=False,
        )
        input_header = reader.fieldnames or []
        missing = [h for h in EXPECTED_HEADER if h not in input_header]
        if missing:
            print(f"[WARN] Input CSV missing columns: {missing}")
            print(f" Found columns: {input_header}")

        rows_out = []
        total = 0
        for row in reader:
            total += 1
            # Build output row strictly in EXPECTED_HEADER order, preserving
            # raw strings.  DictReader yields None for absent keys, so coerce
            # both missing and None to "".
            out = {h: (row.get(h) or "") for h in EXPECTED_HEADER}
            # Optional date normalization (ONLY dates; no commas involved)
            if args.normalize_dates:
                for dh in ("Date", "Date Edited"):
                    out[dh] = parse_date_flex(out.get(dh, ""))
            rows_out.append(out)

    # Write with QUOTE_ALL so commas/newlines/quotes never break columns.
    with open(args.out_path, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(
            f,
            fieldnames=EXPECTED_HEADER,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_ALL,
            doublequote=True,
            lineterminator="\n",
        )
        w.writeheader()
        w.writerows(rows_out)

    # Quick self-check: re-read output and ensure fixed column count.
    problems = _self_check(args.out_path)

    print("=== Clean Summary ===")
    print(f"Input rows (excluding header): {total}")
    print(f"Written rows: {len(rows_out)}")
    if problems:
        print("\n[WARNING] Column count issues detected in OUTPUT:")
        for line_no, got, exp in problems:
            print(f"  line {line_no}: columns={got}, expected={exp}")
    else:
        print("\nSelf-check: all rows have the expected column count.")


if __name__ == "__main__":
    main()