From 65bc45ad0d5a8151a47166d013ac3d8a982bfb4c Mon Sep 17 00:00:00 2001
From: Joshua Laymon <joshlaymon@icloud.com>
Date: Wed, 13 Aug 2025 17:05:57 +0000
Subject: [PATCH] Update imports/cleaner.py

---
 imports/cleaner.py | 97 +++++++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 35 deletions(-)

diff --git a/imports/cleaner.py b/imports/cleaner.py
index b8134fa..654fec3 100644
--- a/imports/cleaner.py
+++ b/imports/cleaner.py
@@ -1,14 +1,19 @@
 #!/usr/bin/env python3
 """
-Content-preserving CSV sanitizer for 'illustrations_seed.csv' -> 'illustrations_clean.csv'.
+Cleaner for `illustrations_seed.csv` -> `illustrations_clean.csv`
 
-What it does (and ONLY this by default):
-- Parses your CSV safely.
-- Writes a new CSV where EVERY FIELD is QUOTED and any inner " become "".
-- Keeps the exact text of every field (no trimming, no subject/scripture tweaking, no punctuation edits).
-- Keeps the exact column order expected by your importer.
+What it does:
+- Reads your CSV safely (commas/newlines/quotes inside fields supported).
+- For every field value, replaces inner *double quotes* with a single quote:
+    "  “  ”  «  »  „   -->   '
+  (Existing single quotes like Jehovah's are preserved.)
+- Writes a new CSV with QUOTE_ALL so commas/newlines never break columns.
+- Optional: --normalize-dates converts Date / Date Edited to ISO YYYY-MM-DD.
+- Prints a short summary and verifies the output column count.
 
-Optional flag --normalize-dates will convert Date and Date Edited to ISO YYYY-MM-DD.
+Usage:
+  python3 cleaner.py
+  python3 cleaner.py --in my_in.csv --out my_out.csv --normalize-dates
 """
 
 import argparse
@@ -29,42 +34,60 @@ EXPECTED_HEADER = [
     "Date Edited",
 ]
 
+# Map various double-quote characters to a single straight apostrophe
+DOUBLE_QUOTES_TO_SINGLE = {
+    '"': "'",          # U+0022
+    '“': "'",          # U+201C
+    '”': "'",          # U+201D
+    '„': "'",          # U+201E
+    '«': "'",          # U+00AB
+    '»': "'",          # U+00BB
+}
+
+def to_single_quotes(s: str) -> str:
+    """Replace inner double-quote characters with a single quote, leave existing single quotes as-is."""
+    if not isinstance(s, str):
+        return s
+    out = s
+    for dq, sq in DOUBLE_QUOTES_TO_SINGLE.items():
+        out = out.replace(dq, sq)
+    return out
+
 def parse_date_flex(v: str) -> str:
-    """Very permissive date parser; returns ISO or original string if parsing fails."""
+    """Parse common US formats and return ISO YYYY-MM-DD. If unclear, leave original."""
     v = (v or "").strip()
     if not v:
         return ""
-    fmts = ["%m/%d/%y", "%m/%d/%Y", "%-m/%-d/%y", "%-m/%-d/%Y"]
-    for fmt in fmts:
+    fmts = ["%m/%d/%y", "%m/%d/%Y"]
+    # NOTE: strptime rejects %-m/%-d (ValueError, swallowed below); %m/%d already accept 1-digit values
+    fmts_platform = fmts + ["%-m/%-d/%y", "%-m/%-d/%Y"]
+    for fmt in fmts_platform:
         try:
             return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
         except Exception:
-            pass
-    # last-ditch: split by / or -
+            continue
+    # last-ditch tolerant parse for dash separators or spaces around parts (01-02-2025, 1 / 2 / 25)
     try:
-        parts = [p.strip() for p in v.replace("-", "/").split("/")]
-        if len(parts) == 3:
-            m, d, y = parts
-            y = int(y)
-            m = int(m)
-            d = int(d)
-            return datetime(y if y > 99 else (2000 + y if y <= 69 else 1900 + y), m, d).strftime("%Y-%m-%d")
+        m, d, y = [p.strip() for p in v.replace("-", "/").split("/")]
+        m, d, y = int(m), int(d), int(y)
+        if y < 100:
+            y = 2000 + y if y <= 69 else 1900 + y
+        return datetime(y, m, d).strftime("%Y-%m-%d")
     except Exception:
-        pass
-    return v  # preserve original if we can't confidently parse
+        return v  # preserve original if not confidently parsed
 
 def main():
     ap = argparse.ArgumentParser()
-    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
-    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
+    ap.add_argument("--in",  dest="in_path",  default="illustrations_seed.csv", help="Input CSV path")
+    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv", help="Output CSV path")
     ap.add_argument("--normalize-dates", action="store_true",
-                    help="If set, convert Date and Date Edited to ISO YYYY-MM-DD. Otherwise leave as-is.")
+                    help="Convert Date and Date Edited to ISO YYYY-MM-DD.")
     args = ap.parse_args()
 
     if not os.path.exists(args.in_path):
         raise SystemExit(f"Input file not found: {args.in_path}")
 
-    # Read using DictReader; accept whatever header is present but verify shape.
+    # Read input with a tolerant DictReader
     with open(args.in_path, "r", encoding="utf-8-sig", newline="") as f:
         reader = csv.DictReader(
             f,
@@ -77,26 +100,30 @@ def main():
         input_header = reader.fieldnames or []
         missing = [h for h in EXPECTED_HEADER if h not in input_header]
         if missing:
-            print(f"[WARN] Input CSV missing columns: {missing}")
+            print(f"[WARN] Missing expected columns: {missing}")
             print(f"       Found columns: {input_header}")
 
         rows_out = []
         total = 0
         for row in reader:
             total += 1
-            # Build output row strictly in EXPECTED_HEADER order, preserving raw strings.
+            # Build row strictly in EXPECTED_HEADER order; default to ""
             out = {h: (row.get(h, "") if row.get(h, "") is not None else "") for h in EXPECTED_HEADER}
 
-            # Optional date normalization (ONLY dates; no commas involved)
+            # Replace inner double quotes with single quotes for every field
+            for k, v in out.items():
+                out[k] = to_single_quotes(v)
+
+            # Optional: normalize date fields (safe; doesn't affect commas/quotes)
             if args.normalize_dates:
-                for dh in ("Date", "Date Edited"):
-                    out[dh] = parse_date_flex(out.get(dh, ""))
+                out["Date"] = parse_date_flex(out.get("Date", ""))
+                out["Date Edited"] = parse_date_flex(out.get("Date Edited", ""))
 
             rows_out.append(out)
 
-    # Write with QUOTE_ALL so commas/newlines/quotes never break columns.
+    # Write output with QUOTE_ALL so commas/newlines remain safe
     with open(args.out_path, "w", encoding="utf-8", newline="") as f:
-        w = csv.DictWriter(
+        writer = csv.DictWriter(
             f,
             fieldnames=EXPECTED_HEADER,
             delimiter=",",
@@ -106,11 +133,11 @@ def main():
             escapechar="\\",
             lineterminator="\n",
         )
-        w.writeheader()
+        writer.writeheader()
         for r in rows_out:
-            w.writerow(r)
+            writer.writerow(r)
 
-    # Quick self-check: re-read output and ensure fixed column count
+    # Self-check the written CSV for a stable column count
     problems = []
     with open(args.out_path, "r", encoding="utf-8", newline="") as f:
         rdr = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)