WordPress XML cleanup

✅ Running this will create two files in the same folder:

cleaned_blog_posts.json → structured archive

cleaned_blog_posts.csv → spreadsheet-friendly (Excel, Google Sheets, etc.)

import csv
import html
import json
import re

from lxml import etree

# Path to your WordPress export XML (Tools -> Export in the WordPress admin).
# Relative paths are resolved against the current working directory.
xml_path = "thescottogrottoorg.WordPress.2025-09-04.xml"

# Compiled once at import time; clean_content() is called once per post.
_WP_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)  # e.g. <!-- wp:paragraph -->
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_content(html_text):
    """Return the plain text of a WordPress post body.

    Processing order:
      1. Drop HTML comments, which is how WordPress stores block markers
         (``<!-- wp:paragraph -->``); the match may span several lines.
      2. Drop every remaining ``<...>`` tag.
      3. Decode HTML character references (``&amp;``, ``&nbsp;``, ``&#39;``,
         ...) in a single pass.
      4. Collapse each run of whitespace into one space and trim the ends.

    Args:
        html_text: Raw post HTML, or ``None``/empty.

    Returns:
        The cleaned text, or ``""`` when ``html_text`` is falsy.
    """
    if not html_text:
        return ""
    text = _WP_COMMENT_RE.sub("", html_text)
    text = _HTML_TAG_RE.sub("", text)
    # Entities are decoded only after tags are stripped, so an escaped "&lt;b&gt;"
    # in the post survives as literal text instead of being removed as a tag.
    # html.unescape decodes each reference exactly once ("&amp;lt;" -> "&lt;").
    text = html.unescape(text)
    # "\s" also matches the no-break space that "&nbsp;" decodes to.
    return _WHITESPACE_RE.sub(" ", text).strip()

# XML namespaces (in ElementTree "{uri}" form) used by the tags read below.
WP_NS = "{http://wordpress.org/export/1.2/}"
CONTENT_NS = "{http://purl.org/rss/1.0/modules/content/}"

# Parse XML with recovery mode, so the parser tries to continue past
# malformed markup instead of aborting on the first error.
parser = etree.XMLParser(recover=True)
tree = etree.parse(xml_path, parser)
root = tree.getroot()

# Each entry is a dict with the keys "title", "date" and "content".
posts = []
for item in root.findall("./channel/item"):
    # Keep only items whose post_type is "post" and whose status is "publish".
    if item.findtext(WP_NS + "post_type") != "post":
        continue
    if item.findtext(WP_NS + "status") != "publish":
        continue

    # findtext() may give None (element missing) or an empty string (element
    # present but without text); `or` normalises both before .strip() is used.
    content = item.findtext(CONTENT_NS + "encoded") or ""
    if not content.strip():
        continue  # nothing to archive for a post with a blank body

    posts.append({
        "title": (item.findtext("title") or "").strip() or "Untitled",
        "date": (item.findtext("pubDate") or "").strip() or "Unknown",
        "content": clean_content(content),
    })

# Save all cleaned posts to JSON (a list of {"title", "date", "content"} objects).
json_file = "cleaned_blog_posts.json"
with open(json_file, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \uXXXX escapes; indent=2 pretty-prints the archive.
    json.dump(posts, f, ensure_ascii=False, indent=2)

print(f"Saved {len(posts)} posts to {json_file}")

# Save all cleaned posts to CSV (one row per post, with a header row).
csv_file = "cleaned_blog_posts.csv"
# newline="" lets the csv module control line endings itself, as the csv
# documentation requires for files passed to a writer.
with open(csv_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "date", "content"])
    writer.writeheader()
    writer.writerows(posts)

print(f"Saved {len(posts)} posts to {csv_file}")

December 16, 2001 Ebay wackiness. HITLER SPANKED If you had captured Hitler & Hirohito, what would you do with them?
July 2, 2017 July 02, 2017 at 12:04AM Pyewacket is in love with the secret life of pets. via Instagram http://bit.ly/2ucxyPH
October 12, 2022 Liked on YouTube: Search for Lady Hades: a ToyPizza Fan Film Fan film featuring: Action Figures from Toypizza Shot, chopped and scored by saintsyn
August 31, 2000 Up Up and away (In my Beautiful airship) The best UFO story of all is the story of the fabulous airship of 1897. It's got […]
February 13, 2001 Lost Empires ... A great series... I saw the one on Trebuchets last night. 🙂