blob: ab94676b12d741dcd9c5e3b24b1a979df4a40fc1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
#!/usr/bin/env python3
import re
import os
import shutil
import xml.etree.ElementTree as ET
from urllib.parse import unquote
# Sitemap entries that are not individual event pages and must be skipped.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]

# Captures the (still percent-encoded) slug of an event page URL.
_EVENT_URL_RE = re.compile(
    r"https://www\.portugalrunning\.com/eventos/([^/]*)/"
)


def _required_text(url_element, tag: str) -> str:
    """Return the stripped text of the ``tag`` child of a sitemap ``<url>``.

    The child is looked up in any XML namespace.

    Raises:
        ValueError: if the child element is absent or has no text.
    """
    child = url_element.find("{*}" + tag)
    if child is None or child.text is None:
        raise ValueError(f"sitemap <url> entry has no usable <{tag}>")
    return child.text.strip()


def _slug_from_url(url: str) -> str:
    """Extract and percent-decode the event slug from ``url``.

    Raises:
        ValueError: if ``url`` is not an event URL, or if the decoded slug
            is not a plain single path component.
    """
    match = _EVENT_URL_RE.match(url)
    if match is None:
        raise ValueError(f"failed to extract slug from '{url}'")
    slug = unquote(match[1])
    # The regex forbids a literal "/", but decoding can still yield "..",
    # an empty name or an embedded separator (e.g. "%2E%2E", "%2F").  The
    # slug becomes a directory that is later removed with rmtree, so
    # anything that could escape the events directory must be refused.
    has_separator = os.sep in slug or bool(os.altsep and os.altsep in slug)
    if slug in ("", os.curdir, os.pardir) or has_separator:
        raise ValueError(f"unsafe slug '{slug}' extracted from '{url}'")
    return slug


def _read_text_or_none(path: str):
    """Return the contents of ``path``, or ``None`` if it does not exist."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None


def main(sitemap_path: str = "sitemap.xml", events_root: str = "events") -> list:
    """Invalidate cached event directories whose sitemap ``lastmod`` changed.

    For every ``<url>`` in the sitemap that is not listed in
    ``ignored_urls``, the event directory ``<events_root>/<slug>`` is
    compared against the sitemap: if its ``lastmod`` file already holds the
    sitemap's value the directory is left alone; otherwise the directory is
    deleted, recreated empty, and the new ``lastmod`` value is written.

    Args:
        sitemap_path: path of the sitemap XML file to read.
        events_root: directory holding one sub-directory per event slug.

    Returns:
        The slugs whose directories were reset, in sitemap order.

    Raises:
        ValueError: on a ``<url>`` without ``<loc>``/``<lastmod>`` text, a
            URL that is not an event URL, or an unsafe slug.
    """
    refreshed = []
    root = ET.parse(sitemap_path).getroot()
    for url_element in root.findall(".//{*}url"):
        # Both fields are validated before the ignore check so that a
        # malformed entry is reported even when its URL is ignored.
        url = _required_text(url_element, "loc")
        lastmod = _required_text(url_element, "lastmod")
        if url in ignored_urls:
            continue

        slug = _slug_from_url(url)
        event_dir = os.path.join(events_root, slug)
        lastmod_path = os.path.join(event_dir, "lastmod")

        if _read_text_or_none(lastmod_path) == lastmod:
            continue  # cached copy is still current

        # Drop everything cached for this event and record the new stamp.
        if os.path.exists(event_dir):
            shutil.rmtree(event_dir)
        os.makedirs(event_dir, exist_ok=True)
        with open(lastmod_path, "w", encoding="utf-8") as f:
            f.write(lastmod)
        refreshed.append(slug)
    return refreshed


if __name__ == "__main__":
    main()
|