#!/usr/bin/env python3
"""
Clean and normalize `illustrations_seed.csv` -> `illustrations_clean.csv`.

Key behaviors:
- Robustly parse CSV with many commas and embedded quotes/newlines.
- Prepass to normalize smart quotes/non-breaking spaces before parsing.
- Quote ALL fields on output to guarantee importer-friendly CSV.
- Subject: trim parts, drop empties, remove trailing comma, rejoin with ", ".
- Scripture: remove trailing semicolons, normalize common book abbreviations.
- Dates: accept flexible M/D/YY, M/D/YYYY, etc.; output ISO YYYY-MM-DD.
- Talk Number / Code: numeric if possible; blank if invalid.
- Write rejects to `illustrations_rejects.csv` with a reason, if any.
- Post-write self-check: verifies column count of every row.

Usage:
    python3 clean_illustrations_csv.py \
        --in illustrations_seed.csv \
        --out illustrations_clean.csv \
        --rejects illustrations_rejects.csv
"""

import argparse
import csv
import io
import os
import re
from datetime import datetime

# Output column order; also used to validate the input header.
HEADER = [
    "Subject",
    "Illustration",
    "Application",
    "Scripture",
    "Source",
    "Talk Title",
    "Talk Number",
    "Code",
    "Date",
    "Date Edited",
]

# Common scripture book abbreviation normalization.
BOOK_MAP = {
    # New Testament (common abbreviations)
    "rom": "Romans",
    "romans": "Romans",
    "eph": "Ephesians",
    "ephesians": "Ephesians",
    "col": "Colossians",
    "colossians": "Colossians",
    "1 cor": "1 Corinthians",
    "2 cor": "2 Corinthians",
    "1 thess": "1 Thessalonians",
    "2 thess": "2 Thessalonians",
    "1 tim": "1 Timothy",
    "2 tim": "2 Timothy",
    "1 pet": "1 Peter",
    "2 pet": "2 Peter",
    "1 john": "1 John",
    "2 john": "2 John",
    "3 john": "3 John",
    "heb": "Hebrews",
    "rev": "Revelation",
    "revelation": "Revelation",
    "acts": "Acts",
    "matt": "Matthew",
    "mt": "Matthew",
    "mark": "Mark",
    "mk": "Mark",
    "luke": "Luke",
    "lk": "Luke",
    "john": "John",
    "jn": "John",
    "jude": "Jude",
    "phil": "Philippians",
    "php": "Philippians",
    "philem": "Philemon",
    "titus": "Titus",
    "gal": "Galatians",
    "galatians": "Galatians",
    "james": "James",
    "jas": "James",

    # Old Testament (examples + the ones in your sample)
    "eccl": "Ecclesiastes",
    "eccles": "Ecclesiastes",
    "ecclesiastes": "Ecclesiastes",
    "ps": "Psalms",
    "psalm": "Psalms",
    "psalms": "Psalms",
    "prov": "Proverbs",
    "proverbs": "Proverbs",
    "gen": "Genesis",
    "genesis": "Genesis",
    "ex": "Exodus",
    "exod": "Exodus",
    "exodus": "Exodus",
    "isa": "Isaiah",
    "isaiah": "Isaiah",
    "jer": "Jeremiah",
    "jeremiah": "Jeremiah",
    "dan": "Daniel",
    "daniel": "Daniel",
}

# NOTE: strptime's %m/%d already accept non-zero-padded values ("3/5/07"),
# and the regex fallback in parse_date() covers dash separators, so the
# non-portable glibc-only "%-m"/"%-d" directives (which raise ValueError on
# Windows) are intentionally NOT listed here.
DATE_FORMATS = [
    "%m/%d/%y",
    "%m/%d/%Y",
]

NBSP = "\u00A0"
SMARTS = {
    "\u201C": '"',  # left double
    "\u201D": '"',  # right double
    "\u2018": "'",  # left single
    "\u2019": "'",  # right single / apostrophe
    "\u00AB": '"',  # «
    "\u00BB": '"',  # »
}


def pre_normalize_text(raw: str) -> str:
    """Prepass to remove non-breaking spaces and normalize smart quotes.

    Also normalizes CR/CRLF line endings to LF. Returns "" for None.
    """
    if raw is None:
        return ""
    s = str(raw).replace(NBSP, " ")
    for k, v in SMARTS.items():
        s = s.replace(k, v)
    # Normalize any stray CR-only line endings
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    return s


def parse_date(value: str) -> str:
    """Parse flexible US-style dates; return ISO YYYY-MM-DD or '' if empty/invalid."""
    if not value:
        return ""
    v = value.strip()
    if not v:
        return ""
    # Try the strict slash-separated formats first.
    for fmt in DATE_FORMATS:
        try:
            dt = datetime.strptime(v, fmt)
            # strptime's own 2-digit-year pivot (69/70) is acceptable here;
            # the sample data is all 2000s.
            return dt.strftime("%Y-%m-%d")
        except Exception:
            pass
    # Fallback: M/D/YY or M/D/YYYY with '/' or '-' separators and loose spacing.
    m = re.match(r"^\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\s*$", v)
    if m:
        mth, day, yr = int(m.group(1)), int(m.group(2)), m.group(3)
        if len(yr) == 2:
            # Assume 20xx for 00-69, 19xx for 70-99 (matches strptime's pivot)
            year = 2000 + int(yr) if int(yr) <= 69 else 1900 + int(yr)
        else:
            year = int(yr)
        try:
            return datetime(year, mth, day).strftime("%Y-%m-%d")
        except Exception:
            return ""
    return ""


# Leading book token: optional ordinal (1-3) plus letters/periods, then the rest.
BOOK_RE = re.compile(r"^\s*([1-3]?\s*[A-Za-z\.]+)\s*(.*)$")


def normalize_scripture(value: str) -> str:
    """Normalize scripture strings: strip trailing semicolons/spaces, normalize book name if easily detectable."""
    if not value:
        return ""
    s = value.strip()
    # Remove trailing semicolons and excess punctuation/spaces.
    s = re.sub(r"[;,\s]+$", "", s)

    # Try to normalize the *first* book token if identifiable.
    m = BOOK_RE.match(s)
    if not m:
        return s
    book_raw, rest = m.group(1), m.group(2)

    # normalize book key
    key = book_raw.lower().replace(".", "")
    key = re.sub(r"\s+", " ", key).strip()
    # normalize ordinal spacing: "1cor" -> "1 cor"
    key = re.sub(r"^([1-3])([a-z])", r"\1 \2", key)

    book = BOOK_MAP.get(key, None)
    if not book:
        # Title-case fallback (basic)
        book = " ".join(w.capitalize() for w in key.split())

    rest = rest.strip()
    # Normalize spacing in the chapter/verse segment (e.g., "14:13, 19")
    rest = re.sub(r"\s*,\s*", ", ", rest)
    rest = re.sub(r"\s*;\s*", "; ", rest)
    rest = re.sub(r"\s+", " ", rest)

    out = (book + (" " + rest if rest else "")).strip()
    return out


def clean_subject(value: str) -> str:
    """Trim parts, drop empty entries, remove trailing commas/spaces, re-join with ', '."""
    if not value:
        return ""
    # Outer quoting is handled by csv; here we just process content.
    s = value.strip()
    # Split by comma, trim each token, drop empties
    parts = [p.strip() for p in s.split(",")]
    parts = [p for p in parts if p]  # drop empty tokens
    return ", ".join(parts)


def to_int_or_blank(value: str) -> str:
    """Return the digits of *value* as a string, or '' when no usable number remains."""
    if value is None:
        return ""
    v = str(value).strip()
    if v == "":
        return ""
    # Strip non-digit chars (minus kept defensively, though not expected here)
    v2 = re.sub(r"[^0-9-]+", "", v)
    if v2 in ("", "-", "--"):
        return ""
    try:
        int(v2)
        return v2
    except Exception:
        return ""


def normalize_row(row_dict, stats, rownum):
    """Clean one parsed CSV row.

    Args:
        row_dict: mapping of header name -> raw field value.
        stats: mutable counters dict (scripture_changed, invalid_ints,
            invalid_dates, dates_normalized are incremented here).
        rownum: 1-based input line number (kept for interface compatibility).

    Returns:
        (cleaned_row_dict, reject_reason_or_None).
    """
    clean = {}

    # Subject (pre-normalized like every other field so smart quotes/NBSP
    # inside subject tokens are cleaned too)
    subject = clean_subject(pre_normalize_text(row_dict.get("Subject", "")))
    clean["Subject"] = subject

    # Illustration
    ill = pre_normalize_text(row_dict.get("Illustration", "")).strip()
    clean["Illustration"] = ill

    # Application
    app = pre_normalize_text(row_dict.get("Application", "")).strip()
    clean["Application"] = app

    # Scripture
    scr = pre_normalize_text(row_dict.get("Scripture", "")).strip()
    scr_norm = normalize_scripture(scr)
    if scr and scr != scr_norm:
        stats["scripture_changed"] += 1
    clean["Scripture"] = scr_norm

    # Source
    src = pre_normalize_text(row_dict.get("Source", "")).strip()
    clean["Source"] = src

    # Talk Title
    ttitle = pre_normalize_text(row_dict.get("Talk Title", "")).strip()
    clean["Talk Title"] = ttitle

    # Talk Number: blank out non-numeric values, counting them as invalid
    tnum = to_int_or_blank(row_dict.get("Talk Number", ""))
    if tnum == "" and str(row_dict.get("Talk Number", "")).strip() != "":
        stats["invalid_ints"] += 1
    clean["Talk Number"] = tnum

    # Code: same treatment as Talk Number
    code = to_int_or_blank(row_dict.get("Code", ""))
    if code == "" and str(row_dict.get("Code", "")).strip() != "":
        stats["invalid_ints"] += 1
    clean["Code"] = code

    # Date
    date_raw = pre_normalize_text(row_dict.get("Date", "")).strip()
    date_norm = parse_date(date_raw)
    if date_raw and not date_norm:
        stats["invalid_dates"] += 1
    elif date_norm:
        stats["dates_normalized"] += 1
    clean["Date"] = date_norm

    # Date Edited
    datee_raw = pre_normalize_text(row_dict.get("Date Edited", "")).strip()
    datee_norm = parse_date(datee_raw)
    if datee_raw and not datee_norm:
        stats["invalid_dates"] += 1
    elif datee_norm:
        stats["dates_normalized"] += 1
    clean["Date Edited"] = datee_norm

    # Reject logic: if the row is completely empty across all known fields, skip.
    if not any(clean.get(h, "").strip() for h in HEADER):
        return clean, "empty_row"

    return clean, None


def read_with_prepass(path):
    """Read entire file, pre-normalize text, then parse CSV via csv.DictReader.

    No escapechar is configured: doublequote=True already handles embedded
    quotes, and an escapechar would silently swallow literal backslashes.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        raw = f.read()
    normalized = pre_normalize_text(raw)
    buf = io.StringIO(normalized)
    reader = csv.DictReader(buf, delimiter=",", quotechar='"', doublequote=True, strict=False)
    return reader


def write_csv(rows, out_path):
    """Write rows (list of dicts) with QUOTE_ALL to ensure commas/newlines are safe."""
    with open(out_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=HEADER,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_ALL,
            doublequote=True,
            lineterminator="\n",
        )
        writer.writeheader()
        for r in rows:
            writer.writerow({k: r.get(k, "") for k in HEADER})


def self_check_csv(path):
    """Verify column count on every row equals header length.

    Returns a list of (line_number, got_columns, expected_columns) problems.
    """
    problems = []
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, strict=False)
        expected = None
        rownum = 0
        for row in reader:
            rownum += 1
            if rownum == 1:
                expected = len(row)
                continue
            if len(row) != expected:
                problems.append((rownum, len(row), expected))
    return problems


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
    ap.add_argument("--rejects", dest="rejects_path", default="illustrations_rejects.csv")
    args = ap.parse_args()

    in_path = args.in_path
    out_path = args.out_path
    rejects_path = args.rejects_path

    if not os.path.exists(in_path):
        raise SystemExit(f"Input file not found: {in_path}")

    stats = {
        "total_rows": 0,
        "written_rows": 0,
        "reject_rows": 0,
        "scripture_changed": 0,
        "invalid_ints": 0,
        "invalid_dates": 0,
        "dates_normalized": 0,
    }

    rejects = []
    cleaned_rows = []

    reader = read_with_prepass(in_path)

    # Validate header presence/shape (fieldnames is None for an empty file)
    if not reader.fieldnames:
        raise SystemExit(f"Input CSV is empty or has no header row: {in_path}")
    missing = [h for h in HEADER if h not in reader.fieldnames]
    if missing:
        raise SystemExit(f"Input CSV missing expected headers: {missing}\nFound headers: {reader.fieldnames}")

    for idx, row in enumerate(reader, start=2):  # start=2 because header is line 1
        stats["total_rows"] += 1
        try:
            cleaned, reason = normalize_row(row, stats, idx)
            if reason:
                stats["reject_rows"] += 1
                r = {k: row.get(k, "") for k in HEADER}
                r["reason"] = reason
                rejects.append(r)
                continue
            cleaned_rows.append(cleaned)
        except Exception as e:
            # Best-effort: a single bad row becomes a reject, not a crash.
            stats["reject_rows"] += 1
            r = {k: row.get(k, "") for k in HEADER}
            r["reason"] = f"exception@row {idx}: {e}"
            rejects.append(r)

    # Write outputs
    write_csv(cleaned_rows, out_path)
    stats["written_rows"] = len(cleaned_rows)

    # Write rejects if any
    if rejects:
        # Ensure 'reason' is the last column for readability
        rej_header = HEADER + ["reason"]
        with open(rejects_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=rej_header,
                delimiter=",",
                quotechar='"',
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                lineterminator="\n",
            )
            writer.writeheader()
            for r in rejects:
                writer.writerow({k: r.get(k, "") for k in rej_header})

    # Self check the written CSV
    problems = self_check_csv(out_path)

    # Summary
    print("=== Clean Summary ===")
    print(f"Input rows (excluding header): {stats['total_rows']}")
    print(f"Written rows: {stats['written_rows']}")
    print(f"Reject rows: {stats['reject_rows']}")
    print(f"Scripture normalized: {stats['scripture_changed']}")
    print(f"Dates normalized: {stats['dates_normalized']}")
    print(f"Invalid ints blanked: {stats['invalid_ints']}")
    print(f"Invalid dates (left blank): {stats['invalid_dates']}")
    if problems:
        print("\nSelf-check found column count issues on these line numbers in the OUTPUT (1-based):")
        for line_no, got, exp in problems:
            print(f"  line {line_no}: columns={got}, expected={exp}")
    else:
        print("\nSelf-check: all rows have the expected column count.")


if __name__ == "__main__":
    main()