# ✅ Running this script creates two files in the current working directory:
#   cleaned_blog_posts.json → structured archive
#   cleaned_blog_posts.csv  → spreadsheet-friendly (Excel, Google Sheets, etc.)
import csv
import html
import json
import re

from lxml import etree
# Path to your WordPress export XML (relative paths are resolved against the
# current working directory).
xml_path = "thescottogrottoorg.WordPress.2025-09-04.xml"
def clean_content(html_text):
    """Convert WordPress post HTML into a single line of plain text.

    Processing order:
      1. Drop HTML comments (WordPress block markers such as
         ``<!-- wp:paragraph -->`` are stored as comments).
      2. Drop every remaining HTML tag.
      3. Decode HTML entities (``&amp;``, ``&nbsp;``, ``&#39;``, ...).
      4. Collapse each run of whitespace to one space and trim both ends.

    Args:
        html_text: Raw post body; may be ``None`` or empty.

    Returns:
        The cleaned text, or ``""`` when *html_text* is falsy.
    """
    if not html_text:
        return ""

    # DOTALL lets a single comment span several lines.
    text = re.sub(r"<!--.*?-->", "", html_text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)

    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped.
    text = html.unescape(text)

    # "\s" also matches the non-breaking space that "&nbsp;" decodes to.
    return re.sub(r"\s+", " ", text).strip()
# Parse XML with recovery mode, so lxml tries to continue past malformed
# markup instead of raising on the first error.
parser = etree.XMLParser(recover=True)
tree = etree.parse(xml_path, parser)
root = tree.getroot()

# Clark-notation namespace prefixes used by the export's child elements.
WP_NS = "{http://wordpress.org/export/1.2/}"
CONTENT_NS = "{http://purl.org/rss/1.0/modules/content/}"


def _child_text(parent, tag, default=""):
    """Return the text of *parent*'s first *tag* child.

    Falls back to *default* when the child is missing or has no text, so
    callers can safely call ``str`` methods on the result.
    """
    child = parent.find(tag)
    if child is None or child.text is None:
        return default
    return child.text


# One dict per published post, with keys "title", "date" and "content".
posts = []
for item in root.findall("./channel/item"):
    # Skip every item that is not a published post.
    if _child_text(item, WP_NS + "post_type") != "post":
        continue
    if _child_text(item, WP_NS + "status") != "publish":
        continue

    content = _child_text(item, CONTENT_NS + "encoded")
    if not content.strip():
        # Nothing to archive for posts with an empty body.
        continue

    posts.append({
        "title": _child_text(item, "title", "Untitled").strip(),
        "date": _child_text(item, "pubDate", "Unknown").strip(),
        "content": clean_content(content),
    })
# Save all cleaned posts to JSON.
# ensure_ascii=False keeps non-ASCII characters readable instead of emitting
# \uXXXX escapes; the file is therefore written explicitly as UTF-8.
json_file = "cleaned_blog_posts.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(posts, f, ensure_ascii=False, indent=2)
print(f"Saved {len(posts)} posts to {json_file}")
# Save all cleaned posts to CSV, one row per post under a header row.
# newline="" hands line-ending control to the csv module, as its
# documentation requires for files passed to a writer.
csv_file = "cleaned_blog_posts.csv"
with open(csv_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "date", "content"])
    writer.writeheader()
    writer.writerows(posts)
print(f"Saved {len(posts)} posts to {csv_file}")