From 65bc45ad0d5a8151a47166d013ac3d8a982bfb4c Mon Sep 17 00:00:00 2001 From: Joshua Laymon Date: Wed, 13 Aug 2025 17:05:57 +0000 Subject: [PATCH] Update imports/cleaner.py --- imports/cleaner.py | 97 +++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/imports/cleaner.py b/imports/cleaner.py index b8134fa..654fec3 100644 --- a/imports/cleaner.py +++ b/imports/cleaner.py @@ -1,14 +1,19 @@ #!/usr/bin/env python3 """ -Content-preserving CSV sanitizer for 'illustrations_seed.csv' -> 'illustrations_clean.csv'. +Cleaner for `illustrations_seed.csv` -> `illustrations_clean.csv` -What it does (and ONLY this by default): -- Parses your CSV safely. -- Writes a new CSV where EVERY FIELD is QUOTED and any inner " become "". -- Keeps the exact text of every field (no trimming, no subject/scripture tweaking, no punctuation edits). -- Keeps the exact column order expected by your importer. +What it does: +- Reads your CSV safely (commas/newlines/quotes inside fields supported). +- For every field value, replaces inner *double quotes* with a single quote: + " “ ” « » „ --> ' + (Existing single quotes like Jehovah's are preserved.) +- Writes a new CSV with QUOTE_ALL so commas/newlines never break columns. +- Optional: --normalize-dates converts Date / Date Edited to ISO YYYY-MM-DD. +- Prints a short summary and verifies the output column count. -Optional flag --normalize-dates will convert Date and Date Edited to ISO YYYY-MM-DD. +Usage: + python3 cleaner.py + python3 cleaner.py --in my_in.csv --out my_out.csv --normalize-dates """ import argparse @@ -29,42 +34,60 @@ EXPECTED_HEADER = [ "Date Edited", ] +# Map various double-quote characters to a single straight apostrophe +DOUBLE_QUOTES_TO_SINGLE = { + '"': "'", # U+0022 + '“': "'", # U+201C + '”': "'", # U+201D + '„': "'", # U+201E + '«': "'", # U+00AB + '»': "'", # U+00BB +} + +def to_single_quotes(s: str) -> str: + """Replace inner double-quote characters with a single quote, leave existing single quotes as-is.""" + if not isinstance(s, str): + return s + out = s + for dq, sq in DOUBLE_QUOTES_TO_SINGLE.items(): + out = out.replace(dq, sq) + return out + def parse_date_flex(v: str) -> str: - """Very permissive date parser; returns ISO or original string if parsing fails.""" + """Parse common US formats and return ISO YYYY-MM-DD. If unclear, leave original.""" v = (v or "").strip() if not v: return "" - fmts = ["%m/%d/%y", "%m/%d/%Y", "%-m/%-d/%y", "%-m/%-d/%Y"] - for fmt in fmts: + fmts = ["%m/%d/%y", "%m/%d/%Y"] + # Some systems support %-m; try them but ignore if unsupported + fmts_platform = fmts + ["%-m/%-d/%y", "%-m/%-d/%Y"] + for fmt in fmts_platform: try: return datetime.strptime(v, fmt).strftime("%Y-%m-%d") except Exception: - pass - # last-ditch: split by / or - + continue + # last-ditch tolerant parse (handles 1/2/25, 01-02-2025, etc.) try: - parts = [p.strip() for p in v.replace("-", "/").split("/")] - if len(parts) == 3: - m, d, y = parts - y = int(y) - m = int(m) - d = int(d) - return datetime(y if y > 99 else (2000 + y if y <= 69 else 1900 + y), m, d).strftime("%Y-%m-%d") + m, d, y = [p.strip() for p in v.replace("-", "/").split("/")] + m, d, y = int(m), int(d), int(y) + if y < 100: + y = 2000 + y if y <= 69 else 1900 + y + return datetime(y, m, d).strftime("%Y-%m-%d") except Exception: - pass - return v # preserve original if we can't confidently parse + return v # preserve original if not confidently parsed def main(): ap = argparse.ArgumentParser() - ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv") - ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv") + ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv", help="Input CSV path") + ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv", help="Output CSV path") ap.add_argument("--normalize-dates", action="store_true", - help="If set, convert Date and Date Edited to ISO YYYY-MM-DD. Otherwise leave as-is.") + help="Convert Date and Date Edited to ISO YYYY-MM-DD.") args = ap.parse_args() if not os.path.exists(args.in_path): raise SystemExit(f"Input file not found: {args.in_path}") - # Read using DictReader; accept whatever header is present but verify shape. + # Read input with a tolerant DictReader with open(args.in_path, "r", encoding="utf-8-sig", newline="") as f: reader = csv.DictReader( f, @@ -77,26 +100,30 @@ def main(): input_header = reader.fieldnames or [] missing = [h for h in EXPECTED_HEADER if h not in input_header] if missing: - print(f"[WARN] Input CSV missing columns: {missing}") + print(f"[WARN] Missing expected columns: {missing}") print(f" Found columns: {input_header}") rows_out = [] total = 0 for row in reader: total += 1 - # Build output row strictly in EXPECTED_HEADER order, preserving raw strings. + # Build row strictly in EXPECTED_HEADER order; default to "" out = {h: (row.get(h, "") if row.get(h, "") is not None else "") for h in EXPECTED_HEADER} - # Optional date normalization (ONLY dates; no commas involved) + # Replace inner double quotes with single quotes for every field + for k, v in out.items(): + out[k] = to_single_quotes(v) + + # Optional: normalize date fields (safe; doesn't affect commas/quotes) if args.normalize_dates: - for dh in ("Date", "Date Edited"): - out[dh] = parse_date_flex(out.get(dh, "")) + out["Date"] = parse_date_flex(out.get("Date", "")) + out["Date Edited"] = parse_date_flex(out.get("Date Edited", "")) rows_out.append(out) - # Write with QUOTE_ALL so commas/newlines/quotes never break columns. + # Write output with QUOTE_ALL so commas/newlines remain safe with open(args.out_path, "w", encoding="utf-8", newline="") as f: - w = csv.DictWriter( + writer = csv.DictWriter( f, fieldnames=EXPECTED_HEADER, delimiter=",", @@ -106,11 +133,11 @@ def main(): escapechar="\\", lineterminator="\n", ) - w.writeheader() + writer.writeheader() for r in rows_out: - w.writerow(r) + writer.writerow(r) - # Quick self-check: re-read output and ensure fixed column count + # Self-check the written CSV for a stable column count problems = [] with open(args.out_path, "r", encoding="utf-8", newline="") as f: rdr = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)