Add imports/cleaner.py
This commit is contained in:
parent
edaa463377
commit
8dc4a5c52b
437
imports/cleaner.py
Normal file
437
imports/cleaner.py
Normal file
@ -0,0 +1,437 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Clean and normalize `illustrations_seed.csv` -> `illustrations_clean.csv`.
|
||||||
|
|
||||||
|
Key behaviors:
|
||||||
|
- Robustly parse CSV with many commas and embedded quotes/newlines.
|
||||||
|
- Prepass to normalize smart quotes/non-breaking spaces before parsing.
|
||||||
|
- Quote ALL fields on output to guarantee importer-friendly CSV.
|
||||||
|
- Subject: trim parts, drop empties, remove trailing comma, rejoin with ", ".
|
||||||
|
- Scripture: remove trailing semicolons, normalize common book abbreviations.
|
||||||
|
- Dates: accept flexible M/D/YY, M/D/YYYY, etc.; output ISO YYYY-MM-DD.
|
||||||
|
- Talk Number / Code: numeric if possible; blank if invalid.
|
||||||
|
- Write rejects to `illustrations_rejects.csv` with a reason, if any.
|
||||||
|
- Post-write self-check: verifies column count of every row.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 clean_illustrations_csv.py \
|
||||||
|
--in illustrations_seed.csv \
|
||||||
|
--out illustrations_clean.csv \
|
||||||
|
--rejects illustrations_rejects.csv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Canonical column order for the cleaned CSV. Doubles as the validation
# list for the input header and as the DictWriter fieldnames on output.
HEADER = [
    "Subject",
    "Illustration",
    "Application",
    "Scripture",
    "Source",
    "Talk Title",
    "Talk Number",
    "Code",
    "Date",
    "Date Edited",
]
|
||||||
|
|
||||||
|
# Common scripture book abbreviation normalization.
|
||||||
|
# Common scripture book abbreviation normalization.
# Keys are lookup keys as produced by normalize_scripture(): lowercased,
# periods stripped, whitespace collapsed, and a space forced between a
# leading ordinal and the book name ("1cor" -> "1 cor"). Values are the
# canonical book names emitted in the cleaned output.
BOOK_MAP = {
    # New Testament (common abbreviations)
    "rom": "Romans",
    "romans": "Romans",
    "eph": "Ephesians",
    "ephesians": "Ephesians",
    "col": "Colossians",
    "colossians": "Colossians",
    "1 cor": "1 Corinthians",
    "2 cor": "2 Corinthians",
    "1 thess": "1 Thessalonians",
    "2 thess": "2 Thessalonians",
    "1 tim": "1 Timothy",
    "2 tim": "2 Timothy",
    "1 pet": "1 Peter",
    "2 pet": "2 Peter",
    "1 john": "1 John",
    "2 john": "2 John",
    "3 john": "3 John",
    "heb": "Hebrews",
    "rev": "Revelation",
    "revelation": "Revelation",
    "acts": "Acts",
    "matt": "Matthew",
    "mt": "Matthew",
    "mark": "Mark",
    "mk": "Mark",
    "luke": "Luke",
    "lk": "Luke",
    "john": "John",
    "jn": "John",
    "jude": "Jude",
    "phil": "Philippians",
    "php": "Philippians",
    "philem": "Philemon",
    "titus": "Titus",
    "gal": "Galatians",
    "galatians": "Galatians",
    "james": "James",
    "jas": "James",

    # Old Testament (examples + the ones in your sample)
    "eccl": "Ecclesiastes",
    "eccles": "Ecclesiastes",
    "ecclesiastes": "Ecclesiastes",
    "ps": "Psalms",
    "psalm": "Psalms",
    "psalms": "Psalms",
    "prov": "Proverbs",
    "proverbs": "Proverbs",
    "gen": "Genesis",
    "genesis": "Genesis",
    "ex": "Exodus",
    "exod": "Exodus",
    "exodus": "Exodus",
    "isa": "Isaiah",
    "isaiah": "Isaiah",
    "jer": "Jeremiah",
    "jeremiah": "Jeremiah",
    "dan": "Daniel",
    "daniel": "Daniel",
}
|
||||||
|
|
||||||
|
# strptime formats tried, in order, by parse_date().
#
# NOTE: the previous list also contained "%-m"/"%-d" variants. Those are
# glibc strftime-only extensions; datetime.strptime raises
# ValueError("'-' is a bad directive ...") for every one of them, so they
# could never match and only cost a swallowed exception per call. They are
# redundant anyway: "%m"/"%d"/"%y" already accept non-zero-padded numbers
# (e.g. "3/5/07" parses fine with "%m/%d/%y").
DATE_FORMATS = [
    "%m/%d/%y",
    "%m/%d/%Y",
]
|
||||||
|
|
||||||
|
# Non-breaking space; replaced with a plain ASCII space in the prepass.
NBSP = "\u00A0"
# Single-character "smart" punctuation -> ASCII replacements applied by
# pre_normalize_text() BEFORE csv parsing, so curly quotes cannot be
# mistaken for CSV quoting characters.
SMARTS = {
    "\u201C": '"',  # left double
    "\u201D": '"',  # right double
    "\u2018": "'",  # left single
    "\u2019": "'",  # right single / apostrophe
    "\u00AB": '"',  # «
    "\u00BB": '"',  # »
}
|
||||||
|
|
||||||
|
def pre_normalize_text(raw: str) -> str:
    """Prepass normalization: NBSP -> space, smart quotes -> ASCII, CR/CRLF -> LF.

    Accepts any value (None becomes ""); always returns a str.
    """
    if raw is None:
        return ""
    # All replacements are single characters, so one translate() pass is
    # equivalent to the chain of per-character str.replace() calls.
    table = str.maketrans({NBSP: " ", **SMARTS})
    text = str(raw).translate(table)
    # Unify stray CRLF / lone-CR line endings to LF.
    return text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
|
|
||||||
|
def parse_date(value: str) -> str:
    """Parse flexible US-style dates; return ISO ``YYYY-MM-DD`` or '' if empty/invalid.

    Accepts M/D/YY and M/D/YYYY (slash-separated, non-padded OK) via strptime,
    then falls back to a regex that also tolerates '-' separators.
    Invalid calendar dates (e.g. 2/30/2020) yield ''.
    """
    if not value:
        return ""
    v = value.strip()
    if not v:
        return ""

    # Only valid strptime directives here. (The old list also tried "%-m"
    # style formats, which strptime rejects with ValueError on every call —
    # pure dead weight. "%m"/"%d"/"%y" already accept non-padded numbers.)
    for fmt in ("%m/%d/%y", "%m/%d/%Y"):
        try:
            return datetime.strptime(v, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback: M/D/YY or M/D/YYYY with '/' or '-' separators.
    m = re.match(r"^\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})\s*$", v)
    if not m:
        return ""
    mth, day, yr = int(m.group(1)), int(m.group(2)), m.group(3)
    if len(yr) == 2:
        # Assume 20xx for 00-69, 19xx for 70-99 (mirrors strptime's %y pivot).
        year = 2000 + int(yr) if int(yr) <= 69 else 1900 + int(yr)
    else:
        year = int(yr)
    try:
        return datetime(year, mth, day).strftime("%Y-%m-%d")
    except ValueError:
        # Out-of-range month/day — treat as unparseable.
        return ""
|
||||||
|
|
||||||
|
# Leading book token (optional 1-3 ordinal + letters/periods), then the rest.
BOOK_RE = re.compile(r"^\s*([1-3]?\s*[A-Za-z\.]+)\s*(.*)$")


def normalize_scripture(value: str) -> str:
    """Normalize a scripture reference string.

    Strips trailing semicolons/commas/spaces, canonicalizes the leading book
    name via BOOK_MAP when recognizable (title-cases it otherwise), and tidies
    spacing in the chapter/verse tail. Returns '' for empty input.
    """
    if not value:
        return ""
    ref = re.sub(r"[;,\s]+$", "", value.strip())

    match = BOOK_RE.match(ref)
    if match is None:
        # No recognizable book token — return the trimmed string untouched.
        return ref
    raw_book, tail = match.group(1), match.group(2)

    # Build the lookup key: lowercase, drop periods, collapse whitespace,
    # and force a space after a leading ordinal ("1cor" -> "1 cor").
    key = re.sub(r"\s+", " ", raw_book.lower().replace(".", "")).strip()
    key = re.sub(r"^([1-3])([a-z])", r"\1 \2", key)

    canonical = BOOK_MAP.get(key)
    if not canonical:
        # Unknown book: basic title-case fallback.
        canonical = " ".join(word.capitalize() for word in key.split())

    # Normalize spacing in the chapter/verse segment (e.g. "14:13,19" -> "14:13, 19").
    tail = tail.strip()
    tail = re.sub(r"\s*,\s*", ", ", tail)
    tail = re.sub(r"\s*;\s*", "; ", tail)
    tail = re.sub(r"\s+", " ", tail)

    return f"{canonical} {tail}".strip() if tail else canonical
|
||||||
|
|
||||||
|
def clean_subject(value: str) -> str:
    """Canonicalize a comma-separated Subject list.

    Splits on commas, trims each token, drops empty tokens (which also removes
    any trailing comma), and rejoins with ', '. Returns '' for empty input.
    """
    if not value:
        return ""
    # External CSV quoting was already handled by the parser; only the
    # content needs tidying here.
    tokens = (token.strip() for token in value.strip().split(","))
    return ", ".join(token for token in tokens if token)
|
||||||
|
|
||||||
|
def to_int_or_blank(value: str) -> str:
    """Coerce *value* to an integer string, or '' when that is impossible.

    Non-digit characters (except '-') are stripped first; if the residue does
    not form a valid int, '' is returned. None and blank inputs yield ''.
    """
    if value is None:
        return ""
    text = str(value).strip()
    if not text:
        return ""
    # Keep digits and minus signs only (no other characters matter here).
    residue = re.sub(r"[^0-9-]+", "", text)
    if residue in ("", "-", "--"):
        return ""
    try:
        int(residue)
    except ValueError:
        # e.g. an embedded '-' like "1-2" — not a valid integer.
        return ""
    return residue
|
||||||
|
|
||||||
|
def normalize_row(row_dict: dict, stats: dict, rownum: int) -> tuple:
    """Clean a single parsed CSV row.

    Args:
        row_dict: raw row from csv.DictReader, keyed by HEADER names.
        stats: mutable counters dict; this function increments
            "scripture_changed", "invalid_ints", "invalid_dates" and
            "dates_normalized" in place.
        rownum: 1-based input line number (currently unused in the body;
            kept for interface stability with the caller).

    Returns:
        (cleaned_row_dict, reject_reason_or_None) — reason is "empty_row"
        when every cleaned field is blank, else None.
    """
    clean = {}

    # Subject: tokenized/trimmed comma list (no prepass needed — the whole
    # file text was already normalized in read_with_prepass()).
    subject = row_dict.get("Subject", "")
    subject = clean_subject(subject)
    clean["Subject"] = subject

    # Illustration
    ill = pre_normalize_text(row_dict.get("Illustration", "")).strip()
    clean["Illustration"] = ill

    # Application
    app = pre_normalize_text(row_dict.get("Application", "")).strip()
    clean["Application"] = app

    # Scripture: count only rows where normalization actually changed text.
    scr = pre_normalize_text(row_dict.get("Scripture", "")).strip()
    scr_norm = normalize_scripture(scr)
    if scr and scr != scr_norm:
        stats["scripture_changed"] += 1
    clean["Scripture"] = scr_norm

    # Source
    src = pre_normalize_text(row_dict.get("Source", "")).strip()
    clean["Source"] = src

    # Talk Title
    ttitle = pre_normalize_text(row_dict.get("Talk Title", "")).strip()
    clean["Talk Title"] = ttitle

    # Talk Number: blank result from a non-blank input means it was invalid.
    tnum = to_int_or_blank(row_dict.get("Talk Number", ""))
    if tnum == "" and str(row_dict.get("Talk Number", "")).strip() not in ("",):
        stats["invalid_ints"] += 1
    clean["Talk Number"] = tnum

    # Code: same numeric-or-blank rule as Talk Number.
    code = to_int_or_blank(row_dict.get("Code", ""))
    if code == "" and str(row_dict.get("Code", "")).strip() not in ("",):
        stats["invalid_ints"] += 1
    clean["Code"] = code

    # Date: non-blank input that fails to parse counts as invalid;
    # successful parses count as normalized.
    date_raw = pre_normalize_text(row_dict.get("Date", "")).strip()
    date_norm = parse_date(date_raw)
    if date_raw and not date_norm:
        stats["invalid_dates"] += 1
    elif date_norm:
        stats["dates_normalized"] += 1
    clean["Date"] = date_norm

    # Date Edited: same rules as Date.
    datee_raw = pre_normalize_text(row_dict.get("Date Edited", "")).strip()
    datee_norm = parse_date(datee_raw)
    if datee_raw and not datee_norm:
        stats["invalid_dates"] += 1
    elif datee_norm:
        stats["dates_normalized"] += 1
    clean["Date Edited"] = datee_norm

    # Reject logic: If the row is completely empty across all known fields, skip.
    if not any(clean.get(h, "").strip() for h in HEADER):
        return clean, "empty_row"

    return clean, None
|
||||||
|
|
||||||
|
def read_with_prepass(path):
    """Read *path* fully, run the text prepass, and return a csv.DictReader.

    The whole file is normalized (smart quotes, NBSP, CR endings) BEFORE
    parsing so that curly quotes cannot confuse the CSV quoting rules.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as src:
        content = src.read()
    cleaned = pre_normalize_text(content)
    return csv.DictReader(
        io.StringIO(cleaned),
        delimiter=",",
        quotechar='"',
        doublequote=True,
        escapechar="\\",
        strict=False,
    )
|
||||||
|
|
||||||
|
def write_csv(rows, out_path):
    """Write *rows* (list of dicts) to *out_path* in HEADER order.

    Every field is quoted (QUOTE_ALL) so embedded commas, quotes and
    newlines survive downstream importers; missing keys become ''.
    """
    with open(out_path, "w", encoding="utf-8", newline="") as sink:
        writer = csv.DictWriter(
            sink,
            fieldnames=HEADER,
            delimiter=",",
            quotechar='"',
            quoting=csv.QUOTE_ALL,
            doublequote=True,
            escapechar="\\",
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(
            {column: record.get(column, "") for column in HEADER}
            for record in rows
        )
|
||||||
|
|
||||||
|
def self_check_csv(path):
    """Re-read *path* and report rows whose column count differs from the header.

    Returns a list of (line_number, actual_count, expected_count) tuples,
    1-based on the output file; an empty list means the file is consistent.
    """
    problems = []
    with open(path, "r", encoding="utf-8", newline="") as fh:
        records = csv.reader(
            fh, delimiter=",", quotechar='"', doublequote=True,
            escapechar="\\", strict=False,
        )
        expected = None
        for lineno, record in enumerate(records, start=1):
            if expected is None:
                # First (header) row fixes the expected width.
                expected = len(record)
            elif len(record) != expected:
                problems.append((lineno, len(record), expected))
    return problems
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: clean the seed CSV, write clean + reject files,
    self-check the output's column counts, and print a summary."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="in_path", default="illustrations_seed.csv")
    ap.add_argument("--out", dest="out_path", default="illustrations_clean.csv")
    ap.add_argument("--rejects", dest="rejects_path", default="illustrations_rejects.csv")
    args = ap.parse_args()

    in_path = args.in_path
    out_path = args.out_path
    rejects_path = args.rejects_path

    if not os.path.exists(in_path):
        raise SystemExit(f"Input file not found: {in_path}")

    # Counters mutated here and inside normalize_row().
    stats = {
        "total_rows": 0,
        "written_rows": 0,
        "reject_rows": 0,
        "scripture_changed": 0,
        "invalid_ints": 0,
        "invalid_dates": 0,
        "dates_normalized": 0,
    }

    rejects = []
    cleaned_rows = []

    reader = read_with_prepass(in_path)

    # Validate header presence/shape. DictReader.fieldnames is None for an
    # empty file; guard with `or []` so we exit with a clear message instead
    # of a TypeError on the membership test.
    found_headers = reader.fieldnames or []
    missing = [h for h in HEADER if h not in found_headers]
    if missing:
        raise SystemExit(
            f"Input CSV missing expected headers: {missing}\nFound headers: {reader.fieldnames}"
        )

    for idx, row in enumerate(reader, start=2):  # start=2 because header is line 1
        stats["total_rows"] += 1
        try:
            cleaned, reason = normalize_row(row, stats, idx)
            if reason:
                stats["reject_rows"] += 1
                r = {k: row.get(k, "") for k in HEADER}
                r["reason"] = reason
                rejects.append(r)
                continue
            cleaned_rows.append(cleaned)
        except Exception as e:
            # Never abort the whole run for one bad row; record it as a reject.
            stats["reject_rows"] += 1
            r = {k: row.get(k, "") for k in HEADER}
            r["reason"] = f"exception@row {idx}: {e}"
            rejects.append(r)

    # Write outputs
    write_csv(cleaned_rows, out_path)
    # Fix: this counter was previously never updated, and the summary worked
    # around it by printing `written_rows + len(cleaned_rows)`.
    stats["written_rows"] = len(cleaned_rows)

    # Write rejects if any
    if rejects:
        # Ensure 'reason' is the last column for readability
        rej_header = HEADER + ["reason"]
        with open(rejects_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=rej_header,
                delimiter=",",
                quotechar='"',
                quoting=csv.QUOTE_ALL,
                doublequote=True,
                escapechar="\\",
                lineterminator="\n",
            )
            writer.writeheader()
            for r in rejects:
                writer.writerow({k: r.get(k, "") for k in rej_header})

    # Self check the written CSV
    problems = self_check_csv(out_path)

    # Summary
    print("=== Clean Summary ===")
    print(f"Input rows (excluding header): {stats['total_rows']}")
    print(f"Written rows: {stats['written_rows']}")
    print(f"Reject rows: {stats['reject_rows']}")
    print(f"Scripture normalized: {stats['scripture_changed']}")
    print(f"Dates normalized: {stats['dates_normalized']}")
    print(f"Invalid ints blanked: {stats['invalid_ints']}")
    print(f"Invalid dates (left blank): {stats['invalid_dates']}")
    if problems:
        print("\nSelf-check found column count issues on these line numbers in the OUTPUT (1-based):")
        for line_no, got, exp in problems:
            print(f"  line {line_no}: columns={got}, expected={exp}")
    else:
        print("\nSelf-check: all rows have the expected column count.")
|
||||||
|
|
||||||
|
# Script entry point: run the cleaner only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user