aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordiogo464 <[email protected]>2025-07-21 15:02:48 +0100
committerdiogo464 <[email protected]>2025-07-21 15:02:48 +0100
commit8c8dabd0ed20679a2dad43a5c239f9fcfe1c1ad7 (patch)
tree55abbcfbbff19efa3aaf6cf36540ac7651c54973
init
-rw-r--r--.envrc5
-rw-r--r--Justfile26
-rw-r--r--README.md40
-rwxr-xr-xextract-categories74
-rwxr-xr-xextract-circuits85
-rwxr-xr-xextract-date25
-rwxr-xr-xextract-id17
-rwxr-xr-xextract-organizer83
-rwxr-xr-xfetch-data16
-rwxr-xr-xfetch-ics17
-rwxr-xr-xfetch-image20
-rwxr-xr-xfetch-location243
-rwxr-xr-xfetch-oneline-description73
-rwxr-xr-xfetch-page15
-rwxr-xr-xfetch-sitemap2
-rwxr-xr-xlist-slugs2
-rwxr-xr-xsetup-directories45
17 files changed, 788 insertions, 0 deletions
diff --git a/.envrc b/.envrc
new file mode 100644
index 000000000..755bb3cb5
--- /dev/null
+++ b/.envrc
@@ -0,0 +1,5 @@
# Loaded by direnv; every secret is pulled from pass(1) so nothing
# sensitive is committed to the repository.
export GOOGLE_MAPS_API_KEY=$(pass google-geocoding-api-key)  # read by ./fetch-location for geocoding
export ANTHROPIC_API_KEY=$(pass api/anthropic)               # NOTE(review): not referenced by scripts here -- presumably used by the `llm` CLI; confirm
export OPENROUTER_KEY=$(pass api/openrouter)                 # NOTE(review): likely consumed by the `llm` CLI used in ./fetch-oneline-description; confirm
export PUSHOVER_USERKEY=$(pass api/pushover/user-key)        # NOTE(review): not referenced by any script in this commit
export PUSHOVER_KEY=$(pass api/pushover/production)          # NOTE(review): not referenced by any script in this commit
diff --git a/Justfile b/Justfile
new file mode 100644
index 000000000..f91b708a3
--- /dev/null
+++ b/Justfile
@@ -0,0 +1,26 @@
# Running bare `just` lists the available recipes.
_default:
    just --list

# Full scrape pipeline.  Each stage streams slugs from ./list-slugs into
# xargs: -L 64 passes 64 slugs per invocation, -P N runs N invocations in
# parallel (-P 0 lets xargs choose).  Stages are ordered by data
# dependency: pages -> ids -> ics/data -> derived files.
scrape:
    ./fetch-sitemap
    ./setup-directories
    ./list-slugs | xargs -L 64 -P 40 ./fetch-page
    ./list-slugs | xargs -L 64 -P 0 ./extract-id
    ./list-slugs | xargs -L 64 -P 40 ./fetch-ics
    ./list-slugs | xargs -L 64 -P 40 ./fetch-data
    ./list-slugs | xargs -L 64 -P 40 ./fetch-image
    ./list-slugs | xargs -L 64 -P 40 ./fetch-location
    ./list-slugs | xargs -L 64 -P 20 ./fetch-oneline-description
    ./list-slugs | xargs -L 64 -P 40 ./extract-date
    ./list-slugs | xargs -L 64 -P 40 ./extract-categories
    ./list-slugs | xargs -L 64 -P 40 ./extract-circuits
    ./list-slugs | xargs -L 64 -P 40 ./extract-organizer

# Single-stage re-runs, serialized (-P 1) so the first unknown class
# aborts the whole run (-e).
scrape-categories:
    ./list-slugs | xargs -e -L 64 -P 1 ./extract-categories

scrape-circuits:
    ./list-slugs | xargs -e -L 64 -P 1 ./extract-circuits

scrape-organizer:
    ./list-slugs | xargs -e -L 64 -P 1 ./extract-organizer
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..dc718ceeb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
1# portugal-running-data
2repo with scraper for the portugal running calendar data
3
4| Filename | Source Script | Optional | Description |
5| ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- |
6| `lastmod` | `setup-directories` | no | last modification time extracted from the sitemap file |
7| `page.html` | `fetch-page` | no | event page from portugalrunning.com |
8| `id` | `extract-id` | no | event numeric id from wordpress |
9| `data.json` | `fetch-data` | no | json file with some event data |
10| `ics` | `fetch-ics` | no | calendar file with location, date and other event information |
11| `location` | `fetch-location` | yes | location data for the event |
12| `image` | `fetch-image` | yes | cover image for the event |
13| `date` | `extract-date` | no | event date extracted from the ics file |
14| `oneline-description` | `fetch-oneline-description` | yes | ai generated one line description |
15| `categories` | `extract-categories` | no | event categories |
16| `circuits` | `extract-circuits` | no | event circuits |
17
18## `fetch-sitemap`
19this script fetches the sitemap that contains a list of event page urls and the last modification date
20
## `fetch-page`
22this script will fetch any missing pages or outdated pages by looking at the lastmod file.
23
## `extract-id`
25this script will extract the event ids from the page.html file. this id can be used to later fetch other data related to this event.
26
27## `fetch-ics`
28this script uses the event id and fetches its ics file.
29
30## `fetch-data`
31this script uses the event id to fetch some event data in json format.
32
## `fetch-image`
34some events have a main image in the json data file, this script will fetch that image.
35
36## `extract-organizer`
37this script extracts the organizer from the class list in the json data file, if one exists.
38
39## `extract-categories`
40this script extracts a list of categories from the class list in the json data file.
diff --git a/extract-categories b/extract-categories
new file mode 100755
index 000000000..1683ecb75
--- /dev/null
+++ b/extract-categories
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""Derive normalized category tags for events from their WordPress class list.

Usage: ./extract-categories SLUG [SLUG ...]

For each slug, reads events/<slug>/data.json and writes the sorted,
newline-separated category tags to events/<slug>/categories.
"""
import json
import os
import sys

CATEGORY_RUN = "run"
CATEGORY_WALK = "walk"
CATEGORY_TRAIL = "trail"
CATEGORY_KIDS = "kids"
CATEGORY_SAINT_SILVESTER = "saint-silvester"
CATEGORY_10K = "10k"
CATEGORY_15K = "15k"
CATEGORY_HALF_MARATHON = "half-marathon"
CATEGORY_MARATHON = "marathon"

# Mapping from WordPress `class_list` entries to category tags.  Classes
# that carry no category information map to an empty list so they are
# still recognized -- an unknown class is a hard error (see
# compute_categories) so new site classes get noticed.
CLASS_TO_CATEGORIES = {
    "event_type-corrida": [CATEGORY_RUN],
    "event_type-caminhada": [CATEGORY_WALK],
    "event_type-corrida-10km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-10-km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-de-15-km": [CATEGORY_RUN, CATEGORY_15K],
    "event_type-trail": [CATEGORY_TRAIL],
    "event_type-kids-trail": [CATEGORY_TRAIL, CATEGORY_KIDS],
    "event_type-trail-curto": [CATEGORY_TRAIL],
    "event_type-trail-longo": [CATEGORY_TRAIL],
    "event_type-trail-endurance": [CATEGORY_TRAIL],
    "event_type-trail-ultra": [CATEGORY_TRAIL],
    "event_type-sao-silvestre": [CATEGORY_SAINT_SILVESTER],
    "event_type-outras": [],
    "event_type-obstaculos": [CATEGORY_RUN],
    "event_type-corta-mato": [CATEGORY_RUN],
    "event_type-backyard-2": [CATEGORY_RUN],
    "event_type-meiamaratona": [CATEGORY_RUN, CATEGORY_HALF_MARATHON],
    "event_type-maratona": [CATEGORY_RUN, CATEGORY_MARATHON],
    "event_type-skyrunning": [CATEGORY_RUN],
    "event_type-corridas-inferior-10": [CATEGORY_RUN],
    "event_type-kids": [CATEGORY_KIDS],
    # ignored
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry category information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_3-",
    "event_type_4-",
    "event_type_5-",
]


def compute_categories(classes):
    """Return the sorted list of category tags for a WordPress class list.

    Raises:
        Exception: for a class that is neither ignorable nor known, so
            new classes on the site fail loudly instead of being dropped.
    """
    categories = set()
    for class_ in classes:
        if any(class_.startswith(prefix) for prefix in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CATEGORIES:
            raise Exception(f"unknown class: {class_}")
        categories.update(CLASS_TO_CATEGORIES[class_])
    return sorted(categories)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        categories = compute_categories(data["class_list"])
        categories_path = os.path.join("events", slug, "categories")
        with open(categories_path, "w") as f:
            # One category per line, no trailing newline (matches the
            # original enumerate/write loop).
            f.write("\n".join(categories))


if __name__ == "__main__":
    main()
diff --git a/extract-circuits b/extract-circuits
new file mode 100755
index 000000000..e7b9a5564
--- /dev/null
+++ b/extract-circuits
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Derive circuit tags for events from their WordPress class list.

Usage: ./extract-circuits SLUG [SLUG ...]

For each slug, reads events/<slug>/data.json and writes the sorted,
newline-separated circuit tags to events/<slug>/circuits.
"""
import json
import os
import sys

CIRCUIT_ATRP = "atrp"
CIRCUIT_MAJORS = "majors"
CIRCUIT_RIOS_TRAIL_TROPHY = "rios-trail-trophy"
CIRCUIT_SUPER_HALFS = "super-halfs"
CIRCUIT_ESTRELAS_DE_PORTUGAL = "estrelas-de-portugal"
CIRCUIT_TROFEU_ATLETISMO_ALMADA = "trofeu-atletismo-almada"
CIRCUIT_TROFEU_ALMADA = "trofeu-almada"
CIRCUIT_ATLETISMO_BARREIRO = "atletismo-barreiro"
# NOTE(review): value keeps the historical "circuit-" prefix; changing it
# would change already-written circuit files.
CIRCUIT_MADEIRA_TRAIL = "circuit-madeira-trail"
CIRCUITO_4_ESTACOES = "quatro-estacoes"

# Mapping from WordPress `class_list` entries to circuit tags.
# BUGFIX: "event_type_5-circuito-4-estacoes" appeared twice, first mapped
# to CIRCUIT_MAJORS and then to CIRCUITO_4_ESTACOES; Python keeps the last
# duplicate key, so the MAJORS mapping was dead code and is removed here.
CLASS_TO_CIRCUITS = {
    "event_type_5-circuito-atrp": [CIRCUIT_ATRP],
    "event_type_5-superhalfs": [CIRCUIT_SUPER_HALFS],
    "event_type_5-trofeu-de-almada": [CIRCUIT_TROFEU_ALMADA],
    "event_type_5-trofeu-atletismo-de-almada": [CIRCUIT_TROFEU_ATLETISMO_ALMADA],
    "event_type_5-circuito-trail-madeira": [CIRCUIT_MADEIRA_TRAIL],
    "event_type_5-circuito-estrelas-de-portugal": [CIRCUIT_ESTRELAS_DE_PORTUGAL],
    "event_type_5-circuito-de-atletismo-do-barreiro": [CIRCUIT_ATLETISMO_BARREIRO],
    "event_type_5-majors": [CIRCUIT_MAJORS],
    "event_type_5-3-rios-trail-trophy": [CIRCUIT_RIOS_TRAIL_TROPHY],
    "event_type_5-circuito-4-estacoes": [CIRCUITO_4_ESTACOES],
    # ignored
    "event_type-corrida-10km": [],
    "event_type-corrida-10-km": [],
    "event_type-corrida-de-15-km": [],
    "event_type-meiamaratona": [],
    "event_type-maratona": [],
    "event_type-trail": [],
    "event_type-kids-trail": [],
    "event_type-trail-curto": [],
    "event_type-trail-longo": [],
    "event_type-trail-endurance": [],
    "event_type-trail-ultra": [],
    "event_type-sao-silvestre": [],
    "event_type-outras": [],
    "event_type-obstaculos": [],
    "event_type-corta-mato": [],
    "event_type-backyard-2": [],
    "event_type-skyrunning": [],
    "event_type-corridas-inferior-10": [],
    "event_type-corrida": [],
    "event_type-kids": [],
    "event_type-caminhada": [],
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry circuit information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_4-",
    "event_type_3-sim",
]


def compute_circuits(classes):
    """Return the sorted list of circuit tags for a WordPress class list.

    Raises:
        Exception: for a class that is neither ignorable nor known.
    """
    circuits = set()
    for class_ in classes:
        if any(class_.startswith(prefix) for prefix in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CIRCUITS:
            raise Exception(f"unknown class: {class_}")
        circuits.update(CLASS_TO_CIRCUITS[class_])
    return sorted(circuits)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        circuits = compute_circuits(data["class_list"])
        circuits_path = os.path.join("events", slug, "circuits")
        with open(circuits_path, "w") as f:
            # One circuit per line, no trailing newline.
            f.write("\n".join(circuits))


if __name__ == "__main__":
    main()
diff --git a/extract-date b/extract-date
new file mode 100755
index 000000000..727b77b1e
--- /dev/null
+++ b/extract-date
@@ -0,0 +1,25 @@
#!/usr/bin/env python3
"""Extract each event's start date from its ics file.

Usage: ./extract-date SLUG [SLUG ...]

Reads events/<slug>/ics and writes the date as ISO "YYYY-MM-DD" to
events/<slug>/date.  BUGFIX: the original wrote `date_path` (the output
*filename*) into the file and discarded the parsed date entirely.
"""
import os
import re
import sys

from datetime import datetime


def parse_ics_date(ics_content):
    """Return the DTSTART date (e.g. "DTSTART:20251130") as "YYYY-MM-DD".

    Raises:
        ValueError: if no DTSTART line is present or the digits do not
            form a valid YYYYMMDD date (strptime validates).
    """
    match = re.search(r"DTSTART:(\d+)", ics_content)
    if match is None:
        raise ValueError("no DTSTART found in ics content")
    return datetime.strptime(match[1], "%Y%m%d").date().isoformat()


def main():
    for slug in sys.argv[1:]:
        ics_path = os.path.join("events", slug, "ics")
        date_path = os.path.join("events", slug, "date")
        # Check before reading the ics so unchanged events cost nothing.
        if os.path.exists(date_path):
            continue
        # ics files occasionally contain stray bytes; decode best-effort.
        ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
        with open(date_path, "w") as f:
            f.write(parse_ics_date(ics_content))


if __name__ == "__main__":
    main()
diff --git a/extract-id b/extract-id
new file mode 100755
index 000000000..698b6ae08
--- /dev/null
+++ b/extract-id
@@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""Extract the numeric WordPress event id from each event's page.html.

Usage: ./extract-id SLUG [SLUG ...]

Reads events/<slug>/page.html and writes the id to events/<slug>/id.
"""
import os
import re
import sys

# The event page links to its own REST resource, e.g.
#   href="https://www.portugalrunning.com/wp-json/wp/v2/ajde_events/12345"
# Dots are escaped (the original pattern let them match any character),
# and the pattern is compiled once instead of per page.
_EVENT_ID_RE = re.compile(
    r'href="https://www\.portugalrunning\.com/wp-json/wp/v2/ajde_events/(\d+)"'
)


def extract_event_id(page_content):
    """Return the event id string found in the page HTML.

    Raises:
        ValueError: if no REST self-link is present.  (The original used
            `assert`, which disappears under `python -O`.)
    """
    match = _EVENT_ID_RE.search(page_content)
    if match is None:
        raise ValueError("no event id link found in page")
    return match[1]


def main():
    for slug in sys.argv[1:]:
        page_path = os.path.join("events", slug, "page.html")
        with open(page_path) as f:
            page_content = f.read()
        try:
            event_id = extract_event_id(page_content)
        except ValueError as e:
            raise Exception(f"failed to extract event id from {slug}") from e
        event_id_path = os.path.join("events", slug, "id")
        with open(event_id_path, "w") as f:
            f.write(event_id)


if __name__ == "__main__":
    main()
diff --git a/extract-organizer b/extract-organizer
new file mode 100755
index 000000000..3739f4ead
--- /dev/null
+++ b/extract-organizer
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""Extract the organizer slug for each event from its WordPress class list.

Usage: ./extract-organizer SLUG [SLUG ...]

Reads events/<slug>/data.json and, when an organizer class is present,
writes its slug to events/<slug>/organizer.
"""
import json
import os
import sys

ORGANIZER_XISTARCA = "xistarca"
ORGANIZER_WERUN = "werun"
ORGANIZER_HMS_SPORTS = "hms-sports"
ORGANIZER_CPA = "cpa"
ORGANIZER_TRILHO_PERDIDO = "trilho-perdido"
ORGANIZER_EVENT_SPORT = "event-sport"
ORGANIZER_RUN_PORTO = "run-porto"
# NOTE(review): "grafondo" looks like a typo for "granfondo" (the class
# name spells it "granfondo"), but the value is emitted data -- fixing it
# would change already-written organizer files.  Confirm before changing.
ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK = "braganca-grafondo-by-trek"
ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA = "vila-franca-de-xira-lisboa"
ORGANIZER_SINTRA_LISBOA = "sintra-lisboa"
ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS = "camara-municipal-de-oeiras"
ORGANIZER_MARCO_DE_CANAVESES_PORTO = "marco-de-canaveses-porto"
ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA = "estadio-universitario-lisboa"
ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA = "clube-do-pessoal-aguas-de-gaia"
ORGANIZER_AARAM = "aaram"
ORGANIZER_SANTA_CRUZTORRES_VEDRAS = "santa-cruztorres-vedras"
ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA = "clube-associacao-desportiva-e-recreativa-da-mata"
ORGANIZER_TURRES_EVENTS = "turres-events"
ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ = "camara-municipal-de-estremoz"
ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES = "grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines"

# Mapping from WordPress `class_list` entries to an organizer slug.
# Entries mapping to None are recognized-but-ignored classes.
CLASS_TO_ORGANIZER = {
    "event_organizer-xistarca": ORGANIZER_XISTARCA,
    "event_organizer-we-run": ORGANIZER_WERUN,
    "event_organizer-cpa": ORGANIZER_CPA,
    "event_organizer-eventsport": ORGANIZER_EVENT_SPORT,
    "event_organizer-trilho-perdido": ORGANIZER_TRILHO_PERDIDO,
    "event_organizer-runporto": ORGANIZER_RUN_PORTO,
    "event_organizer-hms-sports": ORGANIZER_HMS_SPORTS,
    "event_organizer-vila-franca-de-xira-lisboa": ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA,
    "event_organizer-braganca-granfondo-by-trek": ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK,
    "event_organizer-sintra-lisboa": ORGANIZER_SINTRA_LISBOA,
    "event_organizer-camara-municipal-de-oeiras": ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS,
    "event_organizer-marco-de-canaveses-porto": ORGANIZER_MARCO_DE_CANAVESES_PORTO,
    "event_organizer-estadio-universitario-lisboa": ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA,
    "event_organizer-clube-do-pessoal-aguas-de-gaia": ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA,
    "event_organizer-aaram": ORGANIZER_AARAM,
    "event_organizer-santa-cruztorres-vedras": ORGANIZER_SANTA_CRUZTORRES_VEDRAS,
    "event_organizer-clube-associacao-desportiva-e-recreativa-da-mata": ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA,
    "event_organizer-turres-events": ORGANIZER_TURRES_EVENTS,
    "event_organizer-camara-municipal-de-estremoz": ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ,
    "event_organizer-grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines": ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES,
    # ignored
    "ajde_events": None,
    "type-ajde_events": None,
    "status-publish": None,
    "has-post-thumbnail": None,
    "hentry": None,
}
# Class prefixes that never carry organizer information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_type-",
    "event_type_2",
    "event_type_3-sim",
    "event_type_4-",
    "event_type_5-",
]


def find_organizer(classes):
    """Return the organizer slug for a class list, or None if absent.

    Raises:
        Exception: on an unknown class, or on two *different* organizer
            classes for the same event.

    BUGFIX: the original raised "duplicate organizer" whenever any
    recognized class followed a real organizer -- including the
    None-mapped ignored classes like "hentry"/"status-publish" -- so any
    event whose organizer class preceded those would crash.  Only a
    second non-None mapping is a genuine duplicate.
    """
    organizer = None
    for class_ in classes:
        if any(class_.startswith(prefix) for prefix in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_ORGANIZER:
            raise Exception(f"unknown class: {class_}")
        mapped = CLASS_TO_ORGANIZER[class_]
        if mapped is None:
            continue
        if organizer is not None:
            raise Exception(f"duplicate organizer: {organizer} and {mapped}")
        organizer = mapped
    return organizer


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        organizer = find_organizer(data["class_list"])
        if organizer is not None:
            organizer_path = os.path.join("events", slug, "organizer")
            with open(organizer_path, "w") as f:
                f.write(organizer)


if __name__ == "__main__":
    main()
diff --git a/fetch-data b/fetch-data
new file mode 100755
index 000000000..b6fc8c661
--- /dev/null
+++ b/fetch-data
@@ -0,0 +1,16 @@
#!/usr/bin/env python3
# Fetch the WordPress REST JSON for each event slug given on argv and
# store it as events/<slug>/data.json.  Skips events that already have
# the file, so re-runs are idempotent.
import os
import sys
import requests

for slug in sys.argv[1:]:
    # Event id previously written by ./extract-id.
    # NOTE(review): `id` shadows the builtin; harmless here but worth renaming.
    id = open(os.path.join("events", slug, "id"), "r").read()
    data_path = os.path.join("events", slug, "data.json")
    if os.path.exists(data_path):
        continue
    data_url = f"https://portugalrunning.com/wp-json/wp/v2/ajde_events/{id}"
    response = requests.get(data_url)
    # Fail the whole run on HTTP errors rather than writing garbage.
    response.raise_for_status()
    with open(data_path, "w") as f:
        f.write(response.text)
diff --git a/fetch-ics b/fetch-ics
new file mode 100755
index 000000000..0b06ddac3
--- /dev/null
+++ b/fetch-ics
@@ -0,0 +1,17 @@
#!/usr/bin/env python3
# Fetch the calendar (ics) export for each event slug given on argv and
# store it as events/<slug>/ics.  Skips events that already have the
# file, so re-runs are idempotent.
import os
import sys
import requests

for slug in sys.argv[1:]:
    # Event id previously written by ./extract-id.
    id_path = os.path.join("events", slug, "id")
    id = open(id_path, "r").read()
    ics_path = os.path.join("events", slug, "ics")
    if os.path.exists(ics_path):
        continue
    # NOTE(review): plain http, and the trailing "_0" suffix presumably
    # selects the first occurrence of a recurring event -- confirm.
    ics_url = f"http://www.portugalrunning.com/export-events/{id}_0/"
    response = requests.get(ics_url)
    response.raise_for_status()
    # Written as raw bytes: ics content is not guaranteed to be UTF-8.
    with open(ics_path, "wb") as f:
        f.write(response.content)
diff --git a/fetch-image b/fetch-image
new file mode 100755
index 000000000..70758c1d9
--- /dev/null
+++ b/fetch-image
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# Download each event's cover image (if it has one) to events/<slug>/image.
# The image URL comes from the "featured_image_src" field of data.json.
import os
import sys
import json
import requests

for slug in sys.argv[1:]:
    data_path = os.path.join("events", slug, "data.json")
    # NOTE(review): data.json is read before the existence check below,
    # so already-fetched events still pay the parse cost.
    data = json.loads(open(data_path).read())
    image_url = data["featured_image_src"]
    image_path = os.path.join("events", slug, "image")
    if os.path.exists(image_path):
        continue
    # Events without a cover image have an empty featured_image_src.
    if image_url == "":
        continue
    response = requests.get(image_url)
    response.raise_for_status()
    with open(image_path, "wb") as f:
        f.write(response.content)
diff --git a/fetch-location b/fetch-location
new file mode 100755
index 000000000..47bd064f0
--- /dev/null
+++ b/fetch-location
@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""Geocode each event's ICS LOCATION via the Google Maps Geocoding API
and write the structured result as JSON to events/<slug>/location."""
import os
import re
import sys
import json
import requests
import urllib.parse

from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any

# Name of the environment variable holding the Google Maps API key
# (set in .envrc).
ENV_GOOGLE_MAPS_API_KEY = "GOOGLE_MAPS_API_KEY"

# Portugal district codes mapping (ISO 3166-2:PT)
PORTUGAL_DISTRICT_CODES = {
    # Mainland Districts
    "Aveiro": 1,
    "Beja": 2,
    "Braga": 3,
    "Bragança": 4,
    "Castelo Branco": 5,
    "Coimbra": 6,
    "Évora": 7,
    "Faro": 8,
    "Guarda": 9,
    "Leiria": 10,
    "Lisboa": 11,
    "Portalegre": 12,
    "Porto": 13,
    "Santarém": 14,
    "Setúbal": 15,
    "Viana do Castelo": 16,
    "Vila Real": 17,
    "Viseu": 18,
    # Autonomous Regions -- several spellings map to the same code so
    # whatever name Google returns still resolves.
    "Região Autónoma dos Açores": 20,
    "Açores": 20,
    "Azores": 20,
    "Região Autónoma da Madeira": 30,
    "Madeira": 30,
}

43
def get_district_code(district_name: Optional[str]) -> Optional[int]:
    """Resolve a district name to its ISO 3166-2:PT numeric code.

    Tries, in order: an exact table lookup, the long autonomous-region
    spellings, and finally a case-insensitive substring match in either
    direction.  Returns None when nothing matches.
    """
    if not district_name:
        return None

    # Exact match on the raw name.
    code = PORTUGAL_DISTRICT_CODES.get(district_name)
    if code is not None:
        return code

    normalized = district_name.strip()

    # Long official names for the autonomous regions.
    alias = {
        "Região Autónoma da Madeira": "Madeira",
        "Região Autónoma dos Açores": "Açores",
    }.get(normalized)
    if alias is not None:
        return PORTUGAL_DISTRICT_CODES[alias]

    # Last resort: substring match in either direction, ignoring case.
    lowered = normalized.lower()
    for name, district_code in PORTUGAL_DISTRICT_CODES.items():
        if name.lower() in lowered or lowered in name.lower():
            return district_code

    return None
74
75
@dataclass
class Coordinates:
    """A latitude/longitude pair."""

    lat: float
    lon: float

    def to_dict(self) -> Dict[str, float]:
        """Serialize to a plain {"lat": ..., "lon": ...} mapping."""
        return {"lat": self.lat, "lon": self.lon}
85
86
@dataclass
class EventLocation:
    """Location information for an event."""

    name: str
    country: str
    locality: str
    coordinates: Optional[Coordinates] = None
    administrative_area_level_1: Optional[str] = None  # District
    administrative_area_level_2: Optional[str] = None  # Municipality
    administrative_area_level_3: Optional[str] = None  # Parish
    district_code: Optional[int] = None  # Portuguese district code

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, omitting every unset optional field."""
        result: Dict[str, Any] = {
            "name": self.name,
            "country": self.country,
            "locality": self.locality,
        }
        if self.coordinates:
            result["coordinates"] = self.coordinates.to_dict()
        # Truthiness (not an `is None` test) mirrors the original: empty
        # strings are dropped too, and real district codes start at 1.
        for attr in (
            "administrative_area_level_1",
            "administrative_area_level_2",
            "administrative_area_level_3",
            "district_code",
        ):
            value = getattr(self, attr)
            if value:
                result[attr] = value
        return result
116
117
class GoogleGeocodingClient:
    """Google Maps Geocoding API client.

    Only `geocode` performs network I/O; response parsing lives in
    `_parse_google_response` so it can be exercised without the API.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    def _parse_google_response(
        self, location: str, google_result: dict
    ) -> EventLocation:
        """Parse a single Google Maps geocoding result into an EventLocation."""
        location_data = {
            "name": location,
            "lat": google_result["geometry"]["location"]["lat"],
            "lon": google_result["geometry"]["location"]["lng"],
            "country": "Portugal",
            # Default locality: the first comma-separated segment of the query.
            "locality": location.split(",")[0].strip(),
            "administrative_area_level_1": None,
            "administrative_area_level_2": None,
            "administrative_area_level_3": None,
            "district_code": None,
        }

        # Extract all administrative levels from address components.
        for component in google_result["address_components"]:
            types = component["types"]
            if "country" in types:
                location_data["country"] = component["long_name"]
            elif "administrative_area_level_1" in types:
                location_data["administrative_area_level_1"] = component["long_name"]
                # Use district as locality for Portugal.
                location_data["locality"] = component["long_name"]
            elif "administrative_area_level_2" in types:
                location_data["administrative_area_level_2"] = component["long_name"]
            elif "administrative_area_level_3" in types:
                location_data["administrative_area_level_3"] = component["long_name"]

        # Calculate district code from administrative_area_level_1 (district).
        location_data["district_code"] = get_district_code(
            location_data["administrative_area_level_1"]
        )

        return EventLocation(
            name=location,
            country=location_data["country"],
            locality=location_data["locality"],
            coordinates=Coordinates(lat=location_data["lat"], lon=location_data["lon"]),
            administrative_area_level_1=location_data["administrative_area_level_1"],
            administrative_area_level_2=location_data["administrative_area_level_2"],
            administrative_area_level_3=location_data["administrative_area_level_3"],
            district_code=location_data["district_code"],
        )

    def geocode(self, location: str) -> Optional[EventLocation]:
        """Geocode a location string; returns None on any expected failure."""
        params = {
            "address": location,
            "key": self.api_key,
            "region": "pt",  # bias results towards Portugal
            "language": "pt",
        }

        try:
            url = f"{self.base_url}?{urllib.parse.urlencode(params)}"
            # Timeout added: requests has no default and would hang forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            result = data["results"][0]
            return self._parse_google_response(location, result)
        except (requests.RequestException, ValueError, KeyError, IndexError):
            # Narrowed from a bare `except Exception` so programming errors
            # are no longer silently swallowed.  Covers: network/HTTP errors,
            # malformed JSON (ValueError), missing keys, and empty results;
            # the caller treats None as "geocoding failed".
            return None
194
195
def clean_ics_location(loc: str) -> str:
    """Clean a raw ICS LOCATION value.

    Removes stray backslashes and collapses duplicated text the feed
    sometimes produces, e.g.
        "Madeira Madeira"                               -> "Madeira"
        "Alcaria da Serra, Beja Alcaria da Serra, Beja" -> "Alcaria da Serra, Beja"
    """

    def dedupe_segment(segment: str) -> str:
        # A segment whose two halves are identical word-for-word
        # ("Santa Maria da Cruz Santa Maria da Cruz") keeps one half.
        words = segment.strip().split()
        half = len(words) // 2
        if len(words) % 2 == 0 and words[:half] == words[half:]:
            return " ".join(words[:half])
        return " ".join(words)

    cleaned = loc.strip().replace("\\", "")
    segments = [dedupe_segment(part) for part in cleaned.split(",")]

    # Three segments where the middle one is "<third> <first>" is the
    # duplicated form of "<first>, <third>" -- collapse it.
    if len(segments) == 3 and segments[1] == f"{segments[2]} {segments[0]}":
        segments = [segments[0], segments[2]]

    return ", ".join(segments)
217
218
# Manual corrections for inputs the cleanup above cannot fix.
FIXUP_TABLE = {"Alcaria da Serra, Beja": "Alcaria da Serra"}

key = os.environ[ENV_GOOGLE_MAPS_API_KEY]
client = GoogleGeocodingClient(key)
for slug in sys.argv[1:]:
    ics_path = os.path.join("events", slug, "ics")
    location_path = os.path.join("events", slug, "location")
    # Already geocoded: skip, keeping re-runs idempotent and avoiding
    # repeated paid API calls.
    if os.path.exists(location_path):
        continue
    # ics files occasionally contain stray bytes; decode best-effort.
    ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
    ics_location_match = re.search(r"LOCATION:(.*)", ics_content)
    assert ics_location_match is not None
    ics_location = ics_location_match[1]
    ics_location = clean_ics_location(ics_location)
    # Events without a usable location are simply skipped (optional file).
    if ics_location == "":
        continue
    if ics_location in FIXUP_TABLE:
        ics_location = FIXUP_TABLE[ics_location]
    location = client.geocode(ics_location)
    # Geocoding failure aborts the run so it gets investigated.
    if location is None:
        print(f"failed to obtain location from {slug} '{ics_location}'")
        sys.exit(1)
    with open(location_path, "w") as f:
        json.dump(location.to_dict(), f)
243
diff --git a/fetch-oneline-description b/fetch-oneline-description
new file mode 100755
index 000000000..6f592e681
--- /dev/null
+++ b/fetch-oneline-description
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# Generate a one-line Portuguese description for each event by piping its
# rendered description through the external `llm` CLI tool.
import os
import sys
import json
import subprocess

# Model passed to `llm -m`; routed through OpenRouter.
MODEL = "openrouter/anthropic/claude-3.5-haiku"


class LLMClient:
    """Client for LLM description generation via the `llm` CLI."""

    def __init__(self, model: str):
        self.model = model

    def llm_call(
        self,
        system_prompt: str,
        user_prompt: str,
    ) -> str:
        """Run one `llm` invocation and return its trimmed stdout.

        Raises on non-zero exit (check=True) or if the call takes longer
        than 30 seconds.
        """
        proc = subprocess.run(
            ["llm", "-m", self.model, "-s", system_prompt, user_prompt],
            timeout=30,
            check=True,
            capture_output=True,
            text=True,
        )
        stdout = proc.stdout.strip()
        return stdout

    def generate_description(self, text: str) -> str:
        """Generate short description using LLM."""
        # The system prompt (European Portuguese) instructs the model to
        # condense a running-event description into a single line; it is
        # runtime data and must stay exactly as written.
        system_prompt = """És um assistente especializado em condensar descrições de eventos de corrida em resumos de uma linha em português de Portugal. Deves extrair e resumir apenas a informação mais importante e relevante da descrição fornecida.

Exemplos de resumos que deves gerar:
+ Corrida histórica pelas ruas de Lisboa com vista para o Tejo
+ Trail desafiante pela Serra da Estrela
+ São Silvestre tradicional no centro histórico do Porto
+ Meia maratona costeira com paisagens do Atlântico
+ Corrida solidária organizada pela câmara municipal
+ Prova de montanha com subidas técnicas
+ Corrida de Natal pela zona ribeirinha
+ Trail nocturno por caminhos antigos

IMPORTANTE:
- Responde APENAS com a descrição de uma linha em português de Portugal
- Usa apenas informação presente na descrição original
- Destaca características especiais do percurso, localização ou organização
- Não menciones distâncias se já estão implícitas no tipo de evento
- Foca-te no que torna este evento único ou interessante"""

        return self.llm_call(
            system_prompt,
            text,
        )

57
client = LLMClient(MODEL)
for slug in sys.argv[1:]:
    data_path = os.path.join("events", slug, "data.json")
    # data.json is optional at this stage; skip events without it.
    if not os.path.exists(data_path):
        continue
    data = json.load(open(data_path, "r"))
    description = data["content"]["rendered"].strip()
    # Nothing to summarize for events with an empty description.
    if description == "":
        continue
    oneline_path = os.path.join("events", slug, "oneline-description")
    # Already generated: skip to keep re-runs cheap (LLM calls cost money).
    if os.path.exists(oneline_path):
        continue
    oneline = client.generate_description(description)
    with open(oneline_path, "w") as f:
        f.write(oneline)
73
diff --git a/fetch-page b/fetch-page
new file mode 100755
index 000000000..085b54e7b
--- /dev/null
+++ b/fetch-page
@@ -0,0 +1,15 @@
#!/usr/bin/env python3
# Fetch each event's HTML page into events/<slug>/page.html.  Existing
# pages are skipped; ./setup-directories wipes the directory when the
# sitemap reports a newer lastmod, which forces a refetch here.
import os
import sys
import requests

for slug in sys.argv[1:]:
    url = f"https://www.portugalrunning.com/eventos/{slug}"
    page_path = os.path.join("events", slug, "page.html")
    if os.path.exists(page_path):
        continue
    response = requests.get(url)
    response.raise_for_status()
    with open(page_path, "w") as f:
        f.write(response.text)
diff --git a/fetch-sitemap b/fetch-sitemap
new file mode 100755
index 000000000..b952ab529
--- /dev/null
+++ b/fetch-sitemap
@@ -0,0 +1,2 @@
#!/usr/bin/env sh
# Download the events sitemap used by ./setup-directories.
# -f: fail with a non-zero exit on HTTP errors (the original silently
#     saved the error page as sitemap.xml and exited 0)
# -sS: quiet progress output but still print errors
# -L: follow redirects
curl -fsSL https://www.portugalrunning.com/ajde_events-sitemap.xml -o sitemap.xml
diff --git a/list-slugs b/list-slugs
new file mode 100755
index 000000000..a409e5d1e
--- /dev/null
+++ b/list-slugs
@@ -0,0 +1,2 @@
#!/usr/bin/env sh
# Print one event slug per line; each events/ subdirectory (created by
# ./setup-directories) is named after its slug.
ls -1 events
diff --git a/setup-directories b/setup-directories
new file mode 100755
index 000000000..d3548b14f
--- /dev/null
+++ b/setup-directories
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""Create events/<slug>/ directories from sitemap.xml.

For every event URL in the sitemap: if the stored lastmod matches, the
directory is left untouched; otherwise it is wiped and recreated with a
fresh lastmod file so every later pipeline stage refetches its data.
"""
import os
import re
import shutil
import xml.etree.ElementTree as ET

# The bare /eventos/ index page also appears in the sitemap; skip it.
IGNORED_URLS = ["https://www.portugalrunning.com/eventos/"]

# Compiled once; dots escaped (the original raw pattern let them match
# any character).
_SLUG_RE = re.compile(r"https://www\.portugalrunning\.com/eventos/([^/]*)/")


def extract_slug(url):
    """Return the event slug from an /eventos/<slug>/ URL.

    Raises:
        ValueError: if the URL does not match.  (The original used
            `assert`, which disappears under `python -O`.)
    """
    match = _SLUG_RE.match(url)
    if match is None:
        raise ValueError(f"failed to extract slug from '{url}'")
    return match[1]


def main():
    tree = ET.parse("sitemap.xml")
    root = tree.getroot()

    # {*} wildcards the sitemap XML namespace.
    for url_element in root.findall(".//{*}url"):
        loc = url_element.find("{*}loc")
        lastmod = url_element.find("{*}lastmod")
        assert loc is not None
        assert lastmod is not None

        url = loc.text
        lastmod = lastmod.text
        assert url is not None
        assert lastmod is not None

        url = url.strip()
        lastmod = lastmod.strip()

        if url in IGNORED_URLS:
            continue

        slug = extract_slug(url)
        event_dir = os.path.join("events", slug)
        lastmod_path = os.path.join(event_dir, "lastmod")

        # Unchanged since the last run: keep the directory and all
        # previously fetched files.
        if os.path.exists(lastmod_path) and open(lastmod_path).read() == lastmod:
            continue

        # Modified (or new) event: wipe the directory so every stage
        # refetches from scratch.
        if os.path.exists(event_dir):
            shutil.rmtree(event_dir)
        os.makedirs(event_dir, exist_ok=True)
        with open(lastmod_path, "w") as f:
            f.write(lastmod)


if __name__ == "__main__":
    main()
45