Update imports/cleaner.py

This commit is contained in:
Joshua Laymon 2025-08-13 16:52:11 +00:00
parent bd9af4fd6b
commit 54a5ecaf12

View File

@ -1,33 +1,22 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Clean and normalize `illustrations_seed.csv` -> `illustrations_clean.csv`. Content-preserving CSV sanitizer for 'illustrations_seed.csv' -> 'illustrations_clean.csv'.
Key behaviors: What it does (and ONLY this by default):
- Robustly parse CSV with many commas and embedded quotes/newlines. - Parses your CSV safely.
- Prepass to normalize smart quotes/non-breaking spaces before parsing. - Writes a new CSV where EVERY FIELD is QUOTED and any inner " become "".
- Quote ALL fields on output to guarantee importer-friendly CSV. - Keeps the exact text of every field (no trimming, no subject/scripture tweaking, no punctuation edits).
- Subject: trim parts, drop empties, remove trailing comma, rejoin with ", ". - Keeps the exact column order expected by your importer.
- Scripture: remove trailing semicolons, normalize common book abbreviations.
- Dates: accept flexible M/D/YY, M/D/YYYY, etc.; output ISO YYYY-MM-DD.
- Talk Number / Code: numeric if possible; blank if invalid.
- Write rejects to `illustrations_rejects.csv` with a reason, if any.
- Post-write self-check: verifies column count of every row.
Usage: Optional flag --normalize-dates will convert Date and Date Edited to ISO YYYY-MM-DD.
python3 clean_illustrations_csv.py \
--in illustrations_seed.csv \
--out illustrations_clean.csv \
--rejects illustrations_rejects.csv
""" """
import argparse import argparse
import csv import csv
import io
import os import os
import re
from datetime import datetime from datetime import datetime
HEADER = [ EXPECTED_HEADER = [
"Subject", "Subject",
"Illustration", "Illustration",
"Application", "Application",
@ -40,369 +29,76 @@ HEADER = [
"Date Edited", "Date Edited",
] ]
# Common scripture book abbreviation normalization. def parse_date_flex(v: str) -> str:
BOOK_MAP = { """Very permissive date parser; returns ISO or original string if parsing fails."""
# New Testament (common abbreviations) v = (v or "").strip()
"rom": "Romans",
"romans": "Romans",
"eph": "Ephesians",
"ephesians": "Ephesians",
"col": "Colossians",
"colossians": "Colossians",
"1 cor": "1 Corinthians",
"2 cor": "2 Corinthians",
"1 thess": "1 Thessalonians",
"2 thess": "2 Thessalonians",
"1 tim": "1 Timothy",
"2 tim": "2 Timothy",
"1 pet": "1 Peter",
"2 pet": "2 Peter",
"1 john": "1 John",
"2 john": "2 John",
"3 john": "3 John",
"heb": "Hebrews",
"rev": "Revelation",
"revelation": "Revelation",
"acts": "Acts",
"matt": "Matthew",
"mt": "Matthew",
"mark": "Mark",
"mk": "Mark",
"luke": "Luke",
"lk": "Luke",
"john": "John",
"jn": "John",
"jude": "Jude",
"phil": "Philippians",
"php": "Philippians",
"philem": "Philemon",
"titus": "Titus",
"gal": "Galatians",
"galatians": "Galatians",
"james": "James",
"jas": "James",
# Old Testament (examples + the ones in your sample)
"eccl": "Ecclesiastes",
"eccles": "Ecclesiastes",
"ecclesiastes": "Ecclesiastes",
"ps": "Psalms",
"psalm": "Psalms",
"psalms": "Psalms",
"prov": "Proverbs",
"proverbs": "Proverbs",
"gen": "Genesis",
"genesis": "Genesis",
"ex": "Exodus",
"exod": "Exodus",
"exodus": "Exodus",
"isa": "Isaiah",
"isaiah": "Isaiah",
"jer": "Jeremiah",
"jeremiah": "Jeremiah",
"dan": "Daniel",
"daniel": "Daniel",
}
# Accepted input date formats, tried in order by parse_date().
#
# Only portable strptime directives are listed. The previous "%-m"/"%-d"
# variants are a glibc strftime extension that CPython's strptime rejects
# outright ("'-' is a bad directive"), so they could never match and were
# silently skipped by the caller's try/except. They are also unnecessary:
# strptime's "%m"/"%d" already accept non-zero-padded numbers, e.g.
# "3/5/07" parses fine with "%m/%d/%y".
DATE_FORMATS = [
    "%m/%d/%y",  # M/D/YY   (zero padding optional)
    "%m/%d/%Y",  # M/D/YYYY (zero padding optional)
]
NBSP = "\u00A0"  # non-breaking space

# Typographic ("smart") quotation characters mapped to plain ASCII quotes.
SMARTS = {
    "\u201C": '"',  # left double quotation mark
    "\u201D": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark / apostrophe
    "\u00AB": '"',  # left-pointing guillemet
    "\u00BB": '"',  # right-pointing guillemet
}


def pre_normalize_text(raw: str) -> str:
    """Prepass normalization applied before CSV parsing.

    Replaces non-breaking spaces with plain spaces, smart quotes with their
    ASCII equivalents, and CRLF / lone CR line endings with LF.
    Returns "" when *raw* is None.
    """
    if raw is None:
        return ""
    # Single-pass character substitution via str.translate; equivalent to the
    # chained .replace() calls since every source is one character and no
    # replacement reintroduces a source character.
    table = {ord(NBSP): " "}
    table.update({ord(smart): plain for smart, plain in SMARTS.items()})
    text = str(raw).translate(table)
    return text.replace("\r\n", "\n").replace("\r", "\n")
def parse_date(value: str) -> str:
"""Parse flexible US-style dates; return ISO YYYY-MM-DD or '' if empty/invalid."""
if not value:
return ""
v = value.strip()
if not v: if not v:
return "" return ""
# Common separators already handled; try multiple formats. fmts = ["%m/%d/%y", "%m/%d/%Y", "%-m/%-d/%y", "%-m/%-d/%Y"]
for fmt in DATE_FORMATS: for fmt in fmts:
try: try:
dt = datetime.strptime(v, fmt) return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
# Heuristic for 2-digit years if needed (datetime handles 1900s/2000s defaults)
# We trust strptime here; user examples are 2000s.
return dt.strftime("%Y-%m-%d")
except Exception: except Exception:
pass pass
# Try to interpret like M/D/YY or M/D/YYYY with flexible spacing # last-ditch: split by / or -
m = re.match(r"^\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\s*$", v)
if m:
mth, day, yr = int(m.group(1)), int(m.group(2)), m.group(3)
if len(yr) == 2:
# Assume 20xx for 00-69, 19xx for 70-99 (datetime default logic)
year = 2000 + int(yr) if int(yr) <= 69 else 1900 + int(yr)
else:
year = int(yr)
try: try:
return datetime(year, mth, day).strftime("%Y-%m-%d") parts = [p.strip() for p in v.replace("-", "/").split("/")]
if len(parts) == 3:
m, d, y = parts
y = int(y)
m = int(m)
d = int(d)
return datetime(y if y > 99 else (2000 + y if y <= 69 else 1900 + y), m, d).strftime("%Y-%m-%d")
except Exception: except Exception:
return "" pass
return "" return v # preserve original if we can't confidently parse
# Leading book token: optional ordinal 1-3, then letters/periods; rest follows.
BOOK_RE = re.compile(r"^\s*([1-3]?\s*[A-Za-z\.]+)\s*(.*)$")


def normalize_scripture(value: str) -> str:
    """Tidy a scripture reference string.

    Strips trailing semicolons/commas/whitespace, canonicalizes the leading
    book name through BOOK_MAP when recognized (title-casing it otherwise),
    and normalizes spacing around ','/';' in the chapter/verse portion.
    Returns "" for falsy input.
    """
    if not value:
        return ""
    ref = re.sub(r"[;,\s]+$", "", value.strip())
    match = BOOK_RE.match(ref)
    if match is None:
        # No identifiable book token; leave the trimmed string untouched.
        return ref
    raw_book, remainder = match.groups()
    # Canonical lookup key: lowercase, periods removed, whitespace collapsed,
    # and a space inserted between ordinal and name ("1cor" -> "1 cor").
    key = re.sub(r"\s+", " ", raw_book.lower().replace(".", "")).strip()
    key = re.sub(r"^([1-3])([a-z])", r"\1 \2", key)
    # Fall back to basic title-casing when the key isn't in BOOK_MAP.
    book = BOOK_MAP.get(key) or " ".join(word.capitalize() for word in key.split())
    remainder = remainder.strip()
    remainder = re.sub(r"\s*,\s*", ", ", remainder)
    remainder = re.sub(r"\s*;\s*", "; ", remainder)
    remainder = re.sub(r"\s+", " ", remainder)
    return f"{book} {remainder}".strip() if remainder else book
def clean_subject(value: str) -> str:
    """Normalize a comma-separated subject list.

    Each comma-delimited token is trimmed; empty tokens (including any
    produced by a trailing comma) are dropped; survivors are rejoined
    with ', '. Returns "" for falsy input.
    """
    if not value:
        return ""
    tokens = (piece.strip() for piece in value.strip().split(","))
    return ", ".join(token for token in tokens if token)
def to_int_or_blank(value: str) -> str:
    """Extract an integer (as a string) from *value*, or return "".

    Non-digit characters other than '-' are stripped first; if what remains
    does not parse as an int (or the input is None/blank), "" is returned.
    """
    if value is None:
        return ""
    text = str(value).strip()
    if not text:
        return ""
    # Keep digits and minus signs only (e.g. "#12" -> "12").
    digits = re.sub(r"[^0-9-]+", "", text)
    if digits in ("", "-", "--"):
        return ""
    try:
        int(digits)
    except ValueError:
        # Leftovers like "1-2" are not a valid integer.
        return ""
    return digits
def normalize_row(row_dict, stats, rownum):
    """Clean one parsed CSV row.

    Applies the per-field normalizers (subject splitting, text prepass,
    scripture/book normalization, int coercion, date parsing), tallying
    counters in *stats* as it goes. Returns (clean_row_dict, reject_reason)
    where reject_reason is None for a keepable row or "empty_row" when every
    field is blank. *rownum* is accepted for interface parity with callers.
    """
    out = {}

    # Subject: trim/split/drop-empties/rejoin.
    out["Subject"] = clean_subject(row_dict.get("Subject", ""))

    # Free-text fields: smart-quote/NBSP prepass plus outer trim.
    for field in ("Illustration", "Application"):
        out[field] = pre_normalize_text(row_dict.get(field, "")).strip()

    # Scripture: prepass, then book/verse normalization; count real changes.
    raw_scripture = pre_normalize_text(row_dict.get("Scripture", "")).strip()
    fixed_scripture = normalize_scripture(raw_scripture)
    if raw_scripture and raw_scripture != fixed_scripture:
        stats["scripture_changed"] += 1
    out["Scripture"] = fixed_scripture

    for field in ("Source", "Talk Title"):
        out[field] = pre_normalize_text(row_dict.get(field, "")).strip()

    # Numeric fields: blank when unparseable, counting non-blank failures.
    for field in ("Talk Number", "Code"):
        as_int = to_int_or_blank(row_dict.get(field, ""))
        if as_int == "" and str(row_dict.get(field, "")).strip() != "":
            stats["invalid_ints"] += 1
        out[field] = as_int

    # Date fields: parse to ISO; count invalid vs. normalized outcomes.
    for field in ("Date", "Date Edited"):
        raw_date = pre_normalize_text(row_dict.get(field, "")).strip()
        iso_date = parse_date(raw_date)
        if raw_date and not iso_date:
            stats["invalid_dates"] += 1
        elif iso_date:
            stats["dates_normalized"] += 1
        out[field] = iso_date

    # Reject rows that are blank across every known column.
    if not any(out.get(h, "").strip() for h in HEADER):
        return out, "empty_row"
    return out, None
def read_with_prepass(path):
    """Return a csv.DictReader over the pre-normalized contents of *path*.

    The whole file is read into memory, run through pre_normalize_text()
    (smart quotes, NBSP, line endings), and re-parsed from a StringIO so
    the CSV parser sees only normalized text.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as fh:
        contents = fh.read()
    cleaned = io.StringIO(pre_normalize_text(contents))
    return csv.DictReader(
        cleaned,
        delimiter=",",
        quotechar='"',
        doublequote=True,
        escapechar="\\",
        strict=False,
    )
def write_csv(rows, out_path):
    """Write *rows* (list of dicts) to *out_path* in HEADER column order.

    Every field is quoted (QUOTE_ALL) so embedded commas, quotes, and
    newlines can never break column alignment for downstream importers.
    Missing keys are emitted as "".
    """
    with open(out_path, "w", encoding="utf-8", newline="") as fh:
        writer = csv.DictWriter(
            fh,
            fieldnames=HEADER,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_ALL,
            doublequote=True,
            escapechar="\\",
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(
            {column: record.get(column, "") for column in HEADER} for record in rows
        )
def self_check_csv(path):
    """Re-read a written CSV and verify every row's column count.

    The first row (header) sets the expected width; any later row with a
    different width is reported as a (line_number, got, expected) tuple.
    Returns an empty list when the file is consistent (or empty).
    """
    issues = []
    with open(path, "r", encoding="utf-8", newline="") as fh:
        parsed = csv.reader(
            fh, delimiter=",", quotechar='"', doublequote=True,
            escapechar="\\", strict=False,
        )
        width = None
        for line_no, fields in enumerate(parsed, start=1):
            if width is None:
                width = len(fields)
            elif len(fields) != width:
                issues.append((line_no, len(fields), width))
    return issues
def main(): def main():
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv") ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv") ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
ap.add_argument("--rejects", dest="rejects_path", default="illustrations_rejects.csv") ap.add_argument("--normalize-dates", action="store_true",
help="If set, convert Date and Date Edited to ISO YYYY-MM-DD. Otherwise leave as-is.")
args = ap.parse_args() args = ap.parse_args()
in_path = args.in_path if not os.path.exists(args.in_path):
out_path = args.out_path raise SystemExit(f"Input file not found: {args.in_path}")
rejects_path = args.rejects_path
if not os.path.exists(in_path): # Read using DictReader; accept whatever header is present but verify shape.
raise SystemExit(f"Input file not found: {in_path}") with open(args.in_path, "r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(
stats = {
"total_rows": 0,
"written_rows": 0,
"reject_rows": 0,
"scripture_changed": 0,
"invalid_ints": 0,
"invalid_dates": 0,
"dates_normalized": 0,
}
rejects = []
cleaned_rows = []
reader = read_with_prepass(in_path)
# Validate header presence/shape
missing = [h for h in HEADER if h not in reader.fieldnames]
if missing:
raise SystemExit(f"Input CSV missing expected headers: {missing}\nFound headers: {reader.fieldnames}")
for idx, row in enumerate(reader, start=2): # start=2 because header is line 1
stats["total_rows"] += 1
try:
cleaned, reason = normalize_row(row, stats, idx)
if reason:
stats["reject_rows"] += 1
r = {k: row.get(k, "") for k in HEADER}
r["reason"] = reason
rejects.append(r)
continue
cleaned_rows.append(cleaned)
except Exception as e:
stats["reject_rows"] += 1
r = {k: row.get(k, "") for k in HEADER}
r["reason"] = f"exception@row {idx}: {e}"
rejects.append(r)
# Write outputs
write_csv(cleaned_rows, out_path)
# Write rejects if any
if rejects:
# Ensure 'reason' is the last column for readability
rej_header = HEADER + ["reason"]
with open(rejects_path, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(
f, f,
fieldnames=rej_header, delimiter=",",
quotechar='"',
doublequote=True,
escapechar="\\",
strict=False,
)
input_header = reader.fieldnames or []
missing = [h for h in EXPECTED_HEADER if h not in input_header]
if missing:
print(f"[WARN] Input CSV missing columns: {missing}")
print(f" Found columns: {input_header}")
rows_out = []
total = 0
for row in reader:
total += 1
# Build output row strictly in EXPECTED_HEADER order, preserving raw strings.
out = {h: (row.get(h, "") if row.get(h, "") is not None else "") for h in EXPECTED_HEADER}
# Optional date normalization (ONLY dates; no commas involved)
if args.normalize_dates:
for dh in ("Date", "Date Edited"):
out[dh] = parse_date_flex(out.get(dh, ""))
rows_out.append(out)
# Write with QUOTE_ALL so commas/newlines/quotes never break columns.
with open(args.out_path, "w", encoding="utf-8", newline="") as f:
w = csv.DictWriter(
f,
fieldnames=EXPECTED_HEADER,
delimiter=",", delimiter=",",
quotechar='"', quotechar='"',
quoting=csv.QUOTE_ALL, quoting=csv.QUOTE_ALL,
@ -410,24 +106,27 @@ def main():
escapechar="\\", escapechar="\\",
lineterminator="\n", lineterminator="\n",
) )
writer.writeheader() w.writeheader()
for r in rejects: for r in rows_out:
writer.writerow({k: r.get(k, "") for k in rej_header}) w.writerow(r)
# Self check the written CSV # Quick self-check: re-read output and ensure fixed column count
problems = self_check_csv(out_path) problems = []
with open(args.out_path, "r", encoding="utf-8", newline="") as f:
rdr = csv.reader(f, delimiter=",", quotechar='"', doublequote=True, escapechar="\\", strict=False)
expected_cols = None
for i, row in enumerate(rdr, start=1):
if i == 1:
expected_cols = len(row)
continue
if len(row) != expected_cols:
problems.append((i, len(row), expected_cols))
# Summary
print("=== Clean Summary ===") print("=== Clean Summary ===")
print(f"Input rows (excluding header): {stats['total_rows']}") print(f"Input rows (excluding header): {total}")
print(f"Written rows: {stats['written_rows'] + len(cleaned_rows)}") print(f"Written rows: {len(rows_out)}")
print(f"Reject rows: {stats['reject_rows']}")
print(f"Scripture normalized: {stats['scripture_changed']}")
print(f"Dates normalized: {stats['dates_normalized']}")
print(f"Invalid ints blanked: {stats['invalid_ints']}")
print(f"Invalid dates (left blank): {stats['invalid_dates']}")
if problems: if problems:
print("\nSelf-check found column count issues on these line numbers in the OUTPUT (1-based):") print("\n[WARNING] Column count issues detected in OUTPUT:")
for line_no, got, exp in problems: for line_no, got, exp in problems:
print(f" line {line_no}: columns={got}, expected={exp}") print(f" line {line_no}: columns={got}, expected={exp}")
else: else: