Add imports/cleaner.py
This commit is contained in:
parent
edaa463377
commit
8dc4a5c52b
437
imports/cleaner.py
Normal file
437
imports/cleaner.py
Normal file
@ -0,0 +1,437 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Clean and normalize `illustrations_seed.csv` -> `illustrations_clean.csv`.
|
||||||
|
|
||||||
|
Key behaviors:
|
||||||
|
- Robustly parse CSV with many commas and embedded quotes/newlines.
|
||||||
|
- Prepass to normalize smart quotes/non-breaking spaces before parsing.
|
||||||
|
- Quote ALL fields on output to guarantee importer-friendly CSV.
|
||||||
|
- Subject: trim parts, drop empties, remove trailing comma, rejoin with ", ".
|
||||||
|
- Scripture: remove trailing semicolons, normalize common book abbreviations.
|
||||||
|
- Dates: accept flexible M/D/YY, M/D/YYYY, etc.; output ISO YYYY-MM-DD.
|
||||||
|
- Talk Number / Code: numeric if possible; blank if invalid.
|
||||||
|
- Write rejects to `illustrations_rejects.csv` with a reason, if any.
|
||||||
|
- Post-write self-check: verifies column count of every row.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 clean_illustrations_csv.py \
|
||||||
|
--in illustrations_seed.csv \
|
||||||
|
--out illustrations_clean.csv \
|
||||||
|
--rejects illustrations_rejects.csv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Canonical column order for the cleaned CSV. Doubles as the validation
# list for the input header and as the DictWriter fieldnames on output.
HEADER = [
    "Subject",
    "Illustration",
    "Application",
    "Scripture",
    "Source",
    "Talk Title",
    "Talk Number",
    "Code",
    "Date",
    "Date Edited",
]
|
||||||
|
|
||||||
|
# Common scripture book abbreviation normalization.
|
||||||
|
# Common scripture book abbreviation normalization.
# Keys are lookup keys as produced by normalize_scripture(): lowercased,
# periods stripped, whitespace collapsed, and a space forced between a
# leading ordinal and the book name ("1cor" -> "1 cor"). Values are the
# canonical book names emitted in the cleaned output.
BOOK_MAP = {
    # New Testament (common abbreviations)
    "rom": "Romans",
    "romans": "Romans",
    "eph": "Ephesians",
    "ephesians": "Ephesians",
    "col": "Colossians",
    "colossians": "Colossians",
    "1 cor": "1 Corinthians",
    "2 cor": "2 Corinthians",
    "1 thess": "1 Thessalonians",
    "2 thess": "2 Thessalonians",
    "1 tim": "1 Timothy",
    "2 tim": "2 Timothy",
    "1 pet": "1 Peter",
    "2 pet": "2 Peter",
    "1 john": "1 John",
    "2 john": "2 John",
    "3 john": "3 John",
    "heb": "Hebrews",
    "rev": "Revelation",
    "revelation": "Revelation",
    "acts": "Acts",
    "matt": "Matthew",
    "mt": "Matthew",
    "mark": "Mark",
    "mk": "Mark",
    "luke": "Luke",
    "lk": "Luke",
    "john": "John",
    "jn": "John",
    "jude": "Jude",
    "phil": "Philippians",
    "php": "Philippians",
    "philem": "Philemon",
    "titus": "Titus",
    "gal": "Galatians",
    "galatians": "Galatians",
    "james": "James",
    "jas": "James",

    # Old Testament (examples + the ones in your sample)
    "eccl": "Ecclesiastes",
    "eccles": "Ecclesiastes",
    "ecclesiastes": "Ecclesiastes",
    "ps": "Psalms",
    "psalm": "Psalms",
    "psalms": "Psalms",
    "prov": "Proverbs",
    "proverbs": "Proverbs",
    "gen": "Genesis",
    "genesis": "Genesis",
    "ex": "Exodus",
    "exod": "Exodus",
    "exodus": "Exodus",
    "isa": "Isaiah",
    "isaiah": "Isaiah",
    "jer": "Jeremiah",
    "jeremiah": "Jeremiah",
    "dan": "Daniel",
    "daniel": "Daniel",
}
|
||||||
|
|
||||||
|
# strptime formats tried, in order, by parse_date().
#
# NOTE: the previous list also contained "%-m"/"%-d" variants. Those are
# glibc strftime-only extensions; datetime.strptime raises
# ValueError("'-' is a bad directive ...") for every one of them, so they
# could never match and only cost a swallowed exception per call. They are
# redundant anyway: "%m"/"%d"/"%y" already accept non-zero-padded numbers
# (e.g. "3/5/07" parses fine with "%m/%d/%y").
DATE_FORMATS = [
    "%m/%d/%y",
    "%m/%d/%Y",
]
|
||||||
|
|
||||||
|
# Non-breaking space; replaced with a plain ASCII space in the prepass.
NBSP = "\u00A0"
# Single-character "smart" punctuation -> ASCII replacements applied by
# pre_normalize_text() BEFORE csv parsing, so curly quotes cannot be
# mistaken for CSV quoting characters.
SMARTS = {
    "\u201C": '"',  # left double
    "\u201D": '"',  # right double
    "\u2018": "'",  # left single
    "\u2019": "'",  # right single / apostrophe
    "\u00AB": '"',  # «
    "\u00BB": '"',  # »
}
|
||||||
|
|
||||||
|
def pre_normalize_text(raw: str) -> str:
    """Prepass normalization: NBSP -> space, smart quotes -> ASCII, CR/CRLF -> LF.

    Accepts any value (None becomes ""); always returns a str.
    """
    if raw is None:
        return ""
    # All replacements are single characters, so one translate() pass is
    # equivalent to the chain of per-character str.replace() calls.
    table = str.maketrans({NBSP: " ", **SMARTS})
    text = str(raw).translate(table)
    # Unify stray CRLF / lone-CR line endings to LF.
    return text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
|
|
||||||
|
def parse_date(value: str) -> str:
    """Parse flexible US-style dates; return ISO ``YYYY-MM-DD`` or '' if empty/invalid.

    Accepts M/D/YY and M/D/YYYY (slash-separated, non-padded OK) via strptime,
    then falls back to a regex that also tolerates '-' separators.
    Invalid calendar dates (e.g. 2/30/2020) yield ''.
    """
    if not value:
        return ""
    v = value.strip()
    if not v:
        return ""

    # Only valid strptime directives here. (The old list also tried "%-m"
    # style formats, which strptime rejects with ValueError on every call —
    # pure dead weight. "%m"/"%d"/"%y" already accept non-padded numbers.)
    for fmt in ("%m/%d/%y", "%m/%d/%Y"):
        try:
            return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback: M/D/YY or M/D/YYYY with '/' or '-' separators.
    m = re.match(r"^\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\s*$", v)
    if not m:
        return ""
    mth, day, yr = int(m.group(1)), int(m.group(2)), m.group(3)
    if len(yr) == 2:
        # Assume 20xx for 00-69, 19xx for 70-99 (mirrors strptime's %y pivot).
        year = 2000 + int(yr) if int(yr) <= 69 else 1900 + int(yr)
    else:
        year = int(yr)
    try:
        return datetime(year, mth, day).strftime("%Y-%m-%d")
    except ValueError:
        # Out-of-range month/day — treat as unparseable.
        return ""
|
||||||
|
|
||||||
|
# Leading book token (optional 1-3 ordinal + letters/periods), then the rest.
BOOK_RE = re.compile(r"^\s*([1-3]?\s*[A-Za-z\.]+)\s*(.*)$")


def normalize_scripture(value: str) -> str:
    """Normalize a scripture reference string.

    Strips trailing semicolons/commas/spaces, canonicalizes the leading book
    name via BOOK_MAP when recognizable (title-cases it otherwise), and tidies
    spacing in the chapter/verse tail. Returns '' for empty input.
    """
    if not value:
        return ""
    ref = re.sub(r"[;,\s]+$", "", value.strip())

    match = BOOK_RE.match(ref)
    if match is None:
        # No recognizable book token — return the trimmed string untouched.
        return ref
    raw_book, tail = match.group(1), match.group(2)

    # Build the lookup key: lowercase, drop periods, collapse whitespace,
    # and force a space after a leading ordinal ("1cor" -> "1 cor").
    key = re.sub(r"\s+", " ", raw_book.lower().replace(".", "")).strip()
    key = re.sub(r"^([1-3])([a-z])", r"\1 \2", key)

    canonical = BOOK_MAP.get(key)
    if not canonical:
        # Unknown book: basic title-case fallback.
        canonical = " ".join(word.capitalize() for word in key.split())

    # Normalize spacing in the chapter/verse segment (e.g. "14:13,19" -> "14:13, 19").
    tail = tail.strip()
    tail = re.sub(r"\s*,\s*", ", ", tail)
    tail = re.sub(r"\s*;\s*", "; ", tail)
    tail = re.sub(r"\s+", " ", tail)

    return f"{canonical} {tail}".strip() if tail else canonical
|
||||||
|
|
||||||
|
def clean_subject(value: str) -> str:
    """Canonicalize a comma-separated Subject list.

    Splits on commas, trims each token, drops empty tokens (which also removes
    any trailing comma), and rejoins with ', '. Returns '' for empty input.
    """
    if not value:
        return ""
    # External CSV quoting was already handled by the parser; only the
    # content needs tidying here.
    tokens = (token.strip() for token in value.strip().split(","))
    return ", ".join(token for token in tokens if token)
|
||||||
|
|
||||||
|
def to_int_or_blank(value: str) -> str:
    """Coerce *value* to an integer string, or '' when that is impossible.

    Non-digit characters (except '-') are stripped first; if the residue does
    not form a valid int, '' is returned. None and blank inputs yield ''.
    """
    if value is None:
        return ""
    text = str(value).strip()
    if not text:
        return ""
    # Keep digits and minus signs only (no other characters matter here).
    residue = re.sub(r"[^0-9-]+", "", text)
    if residue in ("", "-", "--"):
        return ""
    try:
        int(residue)
    except ValueError:
        # e.g. an embedded '-' like "1-2" — not a valid integer.
        return ""
    return residue
|
||||||
|
|
||||||
|
def normalize_row(row_dict: dict, stats: dict, rownum: int) -> tuple:
    """Clean a single parsed CSV row.

    Args:
        row_dict: raw row from csv.DictReader, keyed by HEADER names.
        stats: mutable counters dict; this function increments
            "scripture_changed", "invalid_ints", "invalid_dates" and
            "dates_normalized" in place.
        rownum: 1-based input line number (currently unused in the body;
            kept for interface stability with the caller).

    Returns:
        (cleaned_row_dict, reject_reason_or_None) — reason is "empty_row"
        when every cleaned field is blank, else None.
    """
    clean = {}

    # Subject: tokenized/trimmed comma list (no prepass needed — the whole
    # file text was already normalized in read_with_prepass()).
    subject = row_dict.get("Subject", "")
    subject = clean_subject(subject)
    clean["Subject"] = subject

    # Illustration
    ill = pre_normalize_text(row_dict.get("Illustration", "")).strip()
    clean["Illustration"] = ill

    # Application
    app = pre_normalize_text(row_dict.get("Application", "")).strip()
    clean["Application"] = app

    # Scripture: count only rows where normalization actually changed text.
    scr = pre_normalize_text(row_dict.get("Scripture", "")).strip()
    scr_norm = normalize_scripture(scr)
    if scr and scr != scr_norm:
        stats["scripture_changed"] += 1
    clean["Scripture"] = scr_norm

    # Source
    src = pre_normalize_text(row_dict.get("Source", "")).strip()
    clean["Source"] = src

    # Talk Title
    ttitle = pre_normalize_text(row_dict.get("Talk Title", "")).strip()
    clean["Talk Title"] = ttitle

    # Talk Number: blank result from a non-blank input means it was invalid.
    tnum = to_int_or_blank(row_dict.get("Talk Number", ""))
    if tnum == "" and str(row_dict.get("Talk Number", "")).strip() not in ("",):
        stats["invalid_ints"] += 1
    clean["Talk Number"] = tnum

    # Code: same numeric-or-blank rule as Talk Number.
    code = to_int_or_blank(row_dict.get("Code", ""))
    if code == "" and str(row_dict.get("Code", "")).strip() not in ("",):
        stats["invalid_ints"] += 1
    clean["Code"] = code

    # Date: non-blank input that fails to parse counts as invalid;
    # successful parses count as normalized.
    date_raw = pre_normalize_text(row_dict.get("Date", "")).strip()
    date_norm = parse_date(date_raw)
    if date_raw and not date_norm:
        stats["invalid_dates"] += 1
    elif date_norm:
        stats["dates_normalized"] += 1
    clean["Date"] = date_norm

    # Date Edited: same rules as Date.
    datee_raw = pre_normalize_text(row_dict.get("Date Edited", "")).strip()
    datee_norm = parse_date(datee_raw)
    if datee_raw and not datee_norm:
        stats["invalid_dates"] += 1
    elif datee_norm:
        stats["dates_normalized"] += 1
    clean["Date Edited"] = datee_norm

    # Reject logic: If the row is completely empty across all known fields, skip.
    if not any(clean.get(h, "").strip() for h in HEADER):
        return clean, "empty_row"

    return clean, None
|
||||||
|
|
||||||
|
def read_with_prepass(path):
    """Read *path* fully, run the text prepass, and return a csv.DictReader.

    The whole file is normalized (smart quotes, NBSP, CR endings) BEFORE
    parsing so that curly quotes cannot confuse the CSV quoting rules.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as src:
        content = src.read()
    cleaned = pre_normalize_text(content)
    return csv.DictReader(
        io.StringIO(cleaned),
        delimiter=",",
        quotechar='"',
        doublequote=True,
        escapechar="\\",
        strict=False,
    )
|
||||||
|
|
||||||
|
def write_csv(rows, out_path):
    """Write *rows* (list of dicts) to *out_path* in HEADER order.

    Every field is quoted (QUOTE_ALL) so embedded commas, quotes and
    newlines survive downstream importers; missing keys become ''.
    """
    with open(out_path, "w", encoding="utf-8", newline="") as sink:
        writer = csv.DictWriter(
            sink,
            fieldnames=HEADER,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_ALL,
            doublequote=True,
            escapechar="\\",
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(
            {column: record.get(column, "") for column in HEADER}
            for record in rows
        )
|
||||||
|
|
||||||
|
def self_check_csv(path):
    """Re-read *path* and report rows whose column count differs from the header.

    Returns a list of (line_number, actual_count, expected_count) tuples,
    1-based on the output file; an empty list means the file is consistent.
    """
    problems = []
    with open(path, "r", encoding="utf-8", newline="") as fh:
        records = csv.reader(
            fh, delimiter=",", quotechar='"', doublequote=True,
            escapechar="\\", strict=False,
        )
        expected = None
        for lineno, record in enumerate(records, start=1):
            if expected is None:
                # First (header) row fixes the expected width.
                expected = len(record)
            elif len(record) != expected:
                problems.append((lineno, len(record), expected))
    return problems
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: clean the seed CSV, write clean + reject files,
    self-check the output's column counts, and print a summary."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
    ap.add_argument("--rejects", dest="rejects_path", default="illustrations_rejects.csv")
    args = ap.parse_args()

    in_path = args.in_path
    out_path = args.out_path
    rejects_path = args.rejects_path

    if not os.path.exists(in_path):
        raise SystemExit(f"Input file not found: {in_path}")

    # Counters mutated here and inside normalize_row().
    stats = {
        "total_rows": 0,
        "written_rows": 0,
        "reject_rows": 0,
        "scripture_changed": 0,
        "invalid_ints": 0,
        "invalid_dates": 0,
        "dates_normalized": 0,
    }

    rejects = []
    cleaned_rows = []

    reader = read_with_prepass(in_path)

    # Validate header presence/shape. DictReader.fieldnames is None for an
    # empty file; guard with `or []` so we exit with a clear message instead
    # of a TypeError on the membership test.
    found_headers = reader.fieldnames or []
    missing = [h for h in HEADER if h not in found_headers]
    if missing:
        raise SystemExit(
            f"Input CSV missing expected headers: {missing}\nFound headers: {reader.fieldnames}"
        )

    for idx, row in enumerate(reader, start=2):  # start=2 because header is line 1
        stats["total_rows"] += 1
        try:
            cleaned, reason = normalize_row(row, stats, idx)
            if reason:
                stats["reject_rows"] += 1
                r = {k: row.get(k, "") for k in HEADER}
                r["reason"] = reason
                rejects.append(r)
                continue
            cleaned_rows.append(cleaned)
        except Exception as e:
            # Never abort the whole run for one bad row; record it as a reject.
            stats["reject_rows"] += 1
            r = {k: row.get(k, "") for k in HEADER}
            r["reason"] = f"exception@row {idx}: {e}"
            rejects.append(r)

    # Write outputs
    write_csv(cleaned_rows, out_path)
    # Fix: this counter was previously never updated, and the summary worked
    # around it by printing `written_rows + len(cleaned_rows)`.
    stats["written_rows"] = len(cleaned_rows)

    # Write rejects if any
    if rejects:
        # Ensure 'reason' is the last column for readability
        rej_header = HEADER + ["reason"]
        with open(rejects_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=rej_header,
                delimiter=",",
                quotechar='"',
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                escapechar="\\",
                lineterminator="\n",
            )
            writer.writeheader()
            for r in rejects:
                writer.writerow({k: r.get(k, "") for k in rej_header})

    # Self check the written CSV
    problems = self_check_csv(out_path)

    # Summary
    print("=== Clean Summary ===")
    print(f"Input rows (excluding header): {stats['total_rows']}")
    print(f"Written rows: {stats['written_rows']}")
    print(f"Reject rows: {stats['reject_rows']}")
    print(f"Scripture normalized: {stats['scripture_changed']}")
    print(f"Dates normalized: {stats['dates_normalized']}")
    print(f"Invalid ints blanked: {stats['invalid_ints']}")
    print(f"Invalid dates (left blank): {stats['invalid_dates']}")
    if problems:
        print("\nSelf-check found column count issues on these line numbers in the OUTPUT (1-based):")
        for line_no, got, exp in problems:
            print(f"  line {line_no}: columns={got}, expected={exp}")
    else:
        print("\nSelf-check: all rows have the expected column count.")
|
||||||
|
|
||||||
|
# Script entry point: run the cleaner only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user