WordPress XML cleanup



✅ Running this script will create two files in the current working directory:

cleaned_blog_posts.json → structured archive

cleaned_blog_posts.csv → spreadsheet-friendly (Excel, Google Sheets, etc.)

import csv
import html
import json
import re

from lxml import etree

# Path to your WordPress export XML (resolved relative to the current
# working directory when the script runs).
xml_path = "thescottogrottoorg.WordPress.2025-09-04.xml"

def clean_content(html_text):
    """Return the plain text of a WordPress post body.

    Strips HTML comments (which is how WordPress block markers such as
    ``<!-- wp:paragraph -->`` are stored), removes all remaining tags,
    decodes HTML entities, and collapses every run of whitespace into a
    single space.

    Args:
        html_text: Raw post markup, or ``None``/empty.

    Returns:
        The cleaned text; an empty string when ``html_text`` is falsy.
    """
    if not html_text:
        return ""
    # Comments may span several lines, hence DOTALL.
    text = re.sub(r"<!--.*?-->", "", html_text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities only after tags are stripped, so an escaped "&lt;b&gt;"
    # survives as literal text. A single unescape pass also avoids
    # double-decoding input such as "&amp;lt;".
    text = html.unescape(text)
    # "\s" also matches the no-break space produced by "&nbsp;".
    return re.sub(r"\s+", " ", text).strip()

# Parse XML with recovery mode so the parser tries to continue past
# malformed markup instead of aborting.
parser = etree.XMLParser(recover=True)
tree = etree.parse(xml_path, parser)
root = tree.getroot()

# Namespace-qualified tag names used by the export file.
# NOTE(review): the "1.2" namespace is hard-coded; exports that declare a
# different WXR version would match nothing here. Confirm against your file.
WP_NS = "{http://wordpress.org/export/1.2/}"
CONTENT_TAG = "{http://purl.org/rss/1.0/modules/content/}encoded"

# Collect one {"title", "date", "content"} dict per published post that has
# a non-blank body.
posts = []
for item in root.findall("./channel/item"):
    if item.findtext(WP_NS + "post_type") != "post":
        continue
    if item.findtext(WP_NS + "status") != "publish":
        continue

    # findtext() yields None for a missing element and "" for an empty one;
    # "or" maps both to the fallback so .strip() never runs on None.
    title = item.findtext("title") or "Untitled"
    date = item.findtext("pubDate") or "Unknown"
    content = item.findtext(CONTENT_TAG) or ""

    if not content.strip():
        continue

    posts.append({
        "title": title.strip(),
        "date": date.strip(),
        "content": clean_content(content),
    })

# Save all cleaned posts to JSON. ensure_ascii=False keeps non-ASCII
# characters readable in the file instead of \uXXXX escapes.
json_file = "cleaned_blog_posts.json"
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(posts, f, ensure_ascii=False, indent=2)

print(f"Saved {len(posts)} posts to {json_file}")

# Save all cleaned posts to CSV. newline="" lets the csv module control
# line endings itself, as the csv documentation recommends.
csv_file = "cleaned_blog_posts.csv"
with open(csv_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "date", "content"])
    writer.writeheader()
    writer.writerows(posts)

print(f"Saved {len(posts)} posts to {csv_file}")

Related Posts

Leave a Reply