setup-directories


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

#!/usr/bin/env python3
import re
import os
import shutil
import xml.etree.ElementTree as ET

from urllib.parse import unquote

# Sitemap entries that are listing pages rather than individual events.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]

# Compiled once. Dots are escaped so they only match a literal "." in the
# host name; the first path segment after /eventos/ is captured as the slug.
_EVENT_URL_RE = re.compile(
    r"https://www\.portugalrunning\.com/eventos/([^/]*)/"
)


def _required_text(url_element, tag):
    """Return the stripped text of the child ``tag`` of a sitemap ``<url>``.

    Raises:
        ValueError: if the child element is absent or has no text.
    """
    child = url_element.find("{*}" + tag)
    if child is None or child.text is None:
        raise ValueError(f"sitemap <url> entry is missing <{tag}>")
    return child.text.strip()


def _extract_slug(url):
    """Return the percent-decoded event slug taken from an event page URL.

    Raises:
        ValueError: if ``url`` is not an event page URL, or if the decoded
            slug could not safely be used as a single directory name.
    """
    match = _EVENT_URL_RE.match(url)
    if match is None:
        raise ValueError(f"failed to extract slug from '{url}'")
    slug = unquote(match[1])

    # The slug is joined under the events directory and that path may be
    # handed to shutil.rmtree. An empty slug, "." or ".." or an embedded
    # separator (e.g. a decoded %2F) would make the path resolve to the
    # events root or somewhere outside it, so refuse those outright.
    has_separator = any(
        sep and sep in slug for sep in (os.sep, os.altsep)
    )
    if slug in ("", ".", "..") or has_separator:
        raise ValueError(f"unsafe slug '{slug}' extracted from '{url}'")
    return slug


def _read_lastmod(path):
    """Return the contents of the file at ``path``, or None if it is absent."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None


def sync_event_dirs(sitemap_path="sitemap.xml", events_root="events"):
    """Create or reset one directory per event listed in the sitemap.

    For every ``<url>`` entry whose ``<loc>`` is not in ``ignored_urls``,
    the directory ``<events_root>/<slug>`` is compared against the entry's
    ``<lastmod>``: if the stored ``lastmod`` file already holds the same
    value the directory is left untouched; otherwise the directory is
    removed (if present), recreated empty, and the new ``lastmod`` value is
    written into it.

    Args:
        sitemap_path: path of the sitemap XML file to read.
        events_root: directory under which per-event directories live.

    Raises:
        ValueError: if an entry lacks ``<loc>``/``<lastmod>`` text, if a URL
            is not an event page URL, or if its slug is unsafe as a
            directory name.
    """
    root = ET.parse(sitemap_path).getroot()

    # "{*}" matches any XML namespace, so the lookup works regardless of the
    # namespace the sitemap declares.
    for url_element in root.findall(".//{*}url"):
        url = _required_text(url_element, "loc")
        lastmod = _required_text(url_element, "lastmod")

        if url in ignored_urls:
            continue

        event_dir = os.path.join(events_root, _extract_slug(url))
        lastmod_path = os.path.join(event_dir, "lastmod")

        # Unchanged since the last run: keep whatever is already there.
        if _read_lastmod(lastmod_path) == lastmod:
            continue

        # New or modified event: start from an empty directory so stale
        # content from the previous version cannot survive.
        if os.path.exists(event_dir):
            shutil.rmtree(event_dir)
        os.makedirs(event_dir, exist_ok=True)
        with open(lastmod_path, "w", encoding="utf-8") as f:
            f.write(lastmod)


if __name__ == "__main__":
    sync_event_dirs()