#!/usr/bin/env python3
"""Reset per-event directories whose sitemap ``lastmod`` value has changed.

For every ``<url>`` entry in the sitemap, the event slug is taken from the
URL and the entry's ``<lastmod>`` text is compared with the contents of
``<events_dir>/<slug>/lastmod``.  When they differ (or the file does not
exist) the event directory is deleted, recreated empty, and the new
``lastmod`` value is written into it.
"""

import os
import re
import shutil
import xml.etree.ElementTree as ET
from urllib.parse import unquote

# URLs that appear in the sitemap but are not individual event pages.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]

# Raw string with escaped dots so that "." only matches a literal dot in the
# host name; compiled once instead of being looked up on every iteration.
_EVENT_URL_RE = re.compile(r"https://www\.portugalrunning\.com/eventos/([^/]*)/")


def _required_text(url_element, name):
    """Return the stripped text of child *name*; raise ValueError if absent."""
    child = url_element.find(f"{{*}}{name}")
    if child is None or child.text is None:
        raise ValueError(f"sitemap <url> entry has no <{name}> text")
    return child.text.strip()


def _slug_from_url(url):
    """Return the percent-decoded slug of an event URL.

    Raises:
        ValueError: if *url* is not an event URL, or if the decoded slug
            could not safely be used as a single directory name.
    """
    match = _EVENT_URL_RE.match(url)
    if match is None:
        raise ValueError(f"failed to extract slug from '{url}'")
    slug = unquote(match[1])
    # The slug is joined into a path that is later handed to shutil.rmtree.
    # An empty slug would resolve to the events directory itself, and ".."
    # or an encoded separator ("%2F") would escape it, so refuse them all.
    has_separator = (
        "/" in slug
        or os.sep in slug
        or (os.altsep is not None and os.altsep in slug)
    )
    if slug in ("", ".", "..") or has_separator or "\0" in slug:
        raise ValueError(f"unsafe slug {slug!r} extracted from '{url}'")
    return slug


def _read_text_or_none(path):
    """Return the contents of *path*, or None when the file does not exist."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None


def main(sitemap_path="sitemap.xml", events_dir="events"):
    """Synchronise *events_dir* with the entries of *sitemap_path*.

    Args:
        sitemap_path: path of the sitemap XML file to read.
        events_dir: directory holding one sub-directory per event slug.

    Raises:
        ValueError: if an entry lacks ``<loc>``/``<lastmod>`` text, its URL
            is not a recognised event URL, or its slug is unsafe.
    """
    root = ET.parse(sitemap_path).getroot()

    for url_element in root.findall(".//{*}url"):
        url = _required_text(url_element, "loc")
        lastmod = _required_text(url_element, "lastmod")

        if url in ignored_urls:
            continue

        event_dir = os.path.join(events_dir, _slug_from_url(url))
        lastmod_path = os.path.join(event_dir, "lastmod")

        # Unchanged since the last run: leave the directory untouched.
        if _read_text_or_none(lastmod_path) == lastmod:
            continue

        # New or modified event: start again from an empty directory.
        # NOTE(review): the original also computed "<event_dir>/page.html"
        # without using it; presumably another tool fills that file in after
        # the directory has been reset -- confirm.
        if os.path.exists(event_dir):
            shutil.rmtree(event_dir)
        os.makedirs(event_dir, exist_ok=True)

        with open(lastmod_path, "w", encoding="utf-8") as f:
            f.write(lastmod)


if __name__ == "__main__":
    main()