#!/usr/bin/env python3
import re
import os
import shutil
import xml.etree.ElementTree as ET

from urllib.parse import unquote

# Sitemap URLs that are skipped before any slug extraction is attempted.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]

# Event URL pattern; group 1 is the (still percent-encoded) slug.  The dots
# are escaped so they only match literal dots, and the slug must be
# non-empty so that an URL ending in "eventos//" can never resolve to the
# "events" directory itself.
_EVENT_URL_RE = re.compile(
    r"https://www\.portugalrunning\.com/eventos/([^/]+)/"
)

tree = ET.parse("sitemap.xml")
root = tree.getroot()

# For every <url> entry of the sitemap: compare its <lastmod> with the value
# stored in events/<slug>/lastmod.  When they differ (or nothing is stored
# yet) the event directory is wiped and recreated containing only the new
# lastmod value.
for url_element in root.findall(".//{*}url"):
    loc_element = url_element.find("{*}loc")
    lastmod_element = url_element.find("{*}lastmod")
    # Explicit raises instead of `assert`: asserts are stripped under
    # `python -O`, which would let malformed entries through.
    if loc_element is None or lastmod_element is None:
        raise ValueError("sitemap <url> entry is missing <loc> or <lastmod>")

    url = loc_element.text
    lastmod = lastmod_element.text
    if url is None or lastmod is None:
        raise ValueError("sitemap <url> entry has an empty <loc> or <lastmod>")

    url = url.strip()
    lastmod = lastmod.strip()

    if url in ignored_urls:
        continue

    slug_match = _EVENT_URL_RE.match(url)
    if slug_match is None:
        raise ValueError(f"failed to extract slug from '{url}'")
    slug = unquote(slug_match[1])

    # The slug comes from external data and is percent-decoded, so it may
    # contain path separators or be "." / "..".  Since the resulting path is
    # passed to shutil.rmtree() below, refuse anything that is not a single
    # plain path component.
    if (
        not slug
        or slug in (os.curdir, os.pardir)
        or os.path.basename(slug) != slug
    ):
        raise ValueError(f"unsafe slug {slug!r} extracted from '{url}'")

    event_dir = os.path.join("events", slug)
    # NOTE(review): page_path is not used in the code visible here; kept in
    # case later steps rely on it -- confirm before removing.
    page_path = os.path.join(event_dir, "page.html")
    lastmod_path = os.path.join(event_dir, "lastmod")

    # Read the previously stored lastmod, closing the file deterministically.
    try:
        with open(lastmod_path) as f:
            stored_lastmod = f.read()
    except FileNotFoundError:
        stored_lastmod = None

    # Unchanged since the last run: leave the event directory untouched.
    if stored_lastmod == lastmod:
        continue

    # New or modified entry: drop everything stored for the event and start
    # over with a directory holding just the new lastmod value.
    if os.path.exists(event_dir):
        shutil.rmtree(event_dir)
    os.makedirs(event_dir, exist_ok=True)
    with open(lastmod_path, "w") as f:
        f.write(lastmod)