diff options
| author | diogo464 <[email protected]> | 2025-07-21 15:02:48 +0100 |
|---|---|---|
| committer | diogo464 <[email protected]> | 2025-07-21 15:02:48 +0100 |
| commit | 8c8dabd0ed20679a2dad43a5c239f9fcfe1c1ad7 (patch) | |
| tree | 55abbcfbbff19efa3aaf6cf36540ac7651c54973 | |
init
| -rw-r--r-- | .envrc | 5 | ||||
| -rw-r--r-- | Justfile | 26 | ||||
| -rw-r--r-- | README.md | 40 | ||||
| -rwxr-xr-x | extract-categories | 74 | ||||
| -rwxr-xr-x | extract-circuits | 85 | ||||
| -rwxr-xr-x | extract-date | 25 | ||||
| -rwxr-xr-x | extract-id | 17 | ||||
| -rwxr-xr-x | extract-organizer | 83 | ||||
| -rwxr-xr-x | fetch-data | 16 | ||||
| -rwxr-xr-x | fetch-ics | 17 | ||||
| -rwxr-xr-x | fetch-image | 20 | ||||
| -rwxr-xr-x | fetch-location | 243 | ||||
| -rwxr-xr-x | fetch-oneline-description | 73 | ||||
| -rwxr-xr-x | fetch-page | 15 | ||||
| -rwxr-xr-x | fetch-sitemap | 2 | ||||
| -rwxr-xr-x | list-slugs | 2 | ||||
| -rwxr-xr-x | setup-directories | 45 |
17 files changed, 788 insertions, 0 deletions
| @@ -0,0 +1,5 @@ | |||
| 1 | export GOOGLE_MAPS_API_KEY=$(pass google-geocoding-api-key) | ||
| 2 | export ANTHROPIC_API_KEY=$(pass api/anthropic) | ||
| 3 | export OPENROUTER_KEY=$(pass api/openrouter) | ||
| 4 | export PUSHOVER_USERKEY=$(pass api/pushover/user-key) | ||
| 5 | export PUSHOVER_KEY=$(pass api/pushover/production) | ||
diff --git a/Justfile b/Justfile new file mode 100644 index 000000000..f91b708a3 --- /dev/null +++ b/Justfile | |||
| @@ -0,0 +1,26 @@ | |||
| 1 | _default: | ||
| 2 | just --list | ||
| 3 | |||
| 4 | scrape: | ||
| 5 | ./fetch-sitemap | ||
| 6 | ./setup-directories | ||
| 7 | ./list-slugs | xargs -L 64 -P 40 ./fetch-page | ||
| 8 | ./list-slugs | xargs -L 64 -P 0 ./extract-id | ||
| 9 | ./list-slugs | xargs -L 64 -P 40 ./fetch-ics | ||
| 10 | ./list-slugs | xargs -L 64 -P 40 ./fetch-data | ||
| 11 | ./list-slugs | xargs -L 64 -P 40 ./fetch-image | ||
| 12 | ./list-slugs | xargs -L 64 -P 40 ./fetch-location | ||
| 13 | ./list-slugs | xargs -L 64 -P 20 ./fetch-oneline-description | ||
| 14 | ./list-slugs | xargs -L 64 -P 40 ./extract-date | ||
| 15 | ./list-slugs | xargs -L 64 -P 40 ./extract-categories | ||
| 16 | ./list-slugs | xargs -L 64 -P 40 ./extract-circuits | ||
| 17 | ./list-slugs | xargs -L 64 -P 40 ./extract-organizer | ||
| 18 | |||
| 19 | scrape-categories: | ||
| 20 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-categories | ||
| 21 | |||
| 22 | scrape-circuits: | ||
| 23 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-circuits | ||
| 24 | |||
| 25 | scrape-organizer: | ||
| 26 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-organizer | ||
diff --git a/README.md b/README.md new file mode 100644 index 000000000..dc718ceeb --- /dev/null +++ b/README.md | |||
| @@ -0,0 +1,40 @@ | |||
| 1 | # portugal-running-data | ||
| 2 | repo with scraper for the portugal running calendar data | ||
| 3 | |||
| 4 | | Filename | Source Script | Optional | Description | | ||
| 5 | | ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- | | ||
| 6 | | `lastmod` | `setup-directories` | no | last modification time extracted from the sitemap file | | ||
| 7 | | `page.html` | `fetch-page` | no | event page from portugalrunning.com | | ||
| 8 | | `id` | `extract-id` | no | event numeric id from wordpress | | ||
| 9 | | `data.json` | `fetch-data` | no | json file with some event data | | ||
| 10 | | `ics` | `fetch-ics` | no | calendar file with location, date and other event information | | ||
| 11 | | `location` | `fetch-location` | yes | location data for the event | | ||
| 12 | | `image` | `fetch-image` | yes | cover image for the event | | ||
| 13 | | `date` | `extract-date` | no | event date extracted from the ics file | | ||
| 14 | | `oneline-description` | `fetch-oneline-description` | yes | ai generated one line description | | ||
| 15 | | `categories` | `extract-categories` | no | event categories | | ||
| 16 | | `circuits` | `extract-circuits` | no | event circuits | | ||
| 17 | |||
| 18 | ## `fetch-sitemap` | ||
| 19 | this script fetches the sitemap that contains a list of event page urls and the last modification date | ||
| 20 | |||
| 21 | ## `fetch-page` | ||
| 22 | this script will fetch any missing pages or outdated pages by looking at the lastmod file. | ||
| 23 | |||
| 24 | ## `extract-id` | ||
| 25 | this script will extract the event ids from the page.html file. this id can be used to later fetch other data related to this event. | ||
| 26 | |||
| 27 | ## `fetch-ics` | ||
| 28 | this script uses the event id and fetches its ics file. | ||
| 29 | |||
| 30 | ## `fetch-data` | ||
| 31 | this script uses the event id to fetch some event data in json format. | ||
| 32 | |||
| 33 | ## `fetch-image` | ||
| 34 | some events have a main image in the json data file, this script will fetch that image. | ||
| 35 | |||
| 36 | ## `extract-organizer` | ||
| 37 | this script extracts the organizer from the class list in the json data file, if one exists. | ||
| 38 | |||
| 39 | ## `extract-categories` | ||
| 40 | this script extracts a list of categories from the class list in the json data file. | ||
diff --git a/extract-categories b/extract-categories new file mode 100755 index 000000000..1683ecb75 --- /dev/null +++ b/extract-categories | |||
| @@ -0,0 +1,74 @@ | |||
#!/usr/bin/env python3
"""Extract category tags for the event slugs given on the command line.

For each slug this reads events/<slug>/data.json, maps the WordPress
class list onto normalized category tags, and writes the sorted tags,
one per line, to events/<slug>/categories.
"""
import os
import sys
import json

# Normalized category identifiers used in the output files.
CATEGORY_RUN = "run"
CATEGORY_WALK = "walk"
CATEGORY_TRAIL = "trail"
CATEGORY_KIDS = "kids"
CATEGORY_SAINT_SILVESTER = "saint-silvester"
CATEGORY_10K = "10k"
CATEGORY_15K = "15k"
CATEGORY_HALF_MARATHON = "half-marathon"
CATEGORY_MARATHON = "marathon"

# WordPress event_type class -> list of categories it implies.
CLASS_TO_CATEGORIES = {
    "event_type-corrida": [CATEGORY_RUN],
    "event_type-caminhada": [CATEGORY_WALK],
    "event_type-corrida-10km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-10-km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-de-15-km": [CATEGORY_RUN, CATEGORY_15K],
    "event_type-trail": [CATEGORY_TRAIL],
    "event_type-kids-trail": [CATEGORY_TRAIL, CATEGORY_KIDS],
    "event_type-trail-curto": [CATEGORY_TRAIL],
    "event_type-trail-longo": [CATEGORY_TRAIL],
    "event_type-trail-endurance": [CATEGORY_TRAIL],
    "event_type-trail-ultra": [CATEGORY_TRAIL],
    "event_type-sao-silvestre": [CATEGORY_SAINT_SILVESTER],
    "event_type-outras": [],
    "event_type-obstaculos": [CATEGORY_RUN],
    "event_type-corta-mato": [CATEGORY_RUN],
    "event_type-backyard-2": [CATEGORY_RUN],
    "event_type-meiamaratona": [CATEGORY_RUN, CATEGORY_HALF_MARATHON],
    "event_type-maratona": [CATEGORY_RUN, CATEGORY_MARATHON],
    "event_type-skyrunning": [CATEGORY_RUN],
    "event_type-corridas-inferior-10": [CATEGORY_RUN],
    "event_type-kids": [CATEGORY_KIDS],
    # ignored: classes that carry no category information
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry category information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_3-",
    "event_type_4-",
    "event_type_5-",
]


def categories_for_classes(classes):
    """Return the sorted list of categories implied by a class list.

    Raises on a class that is neither ignored nor mapped, so that new
    classes appearing on the site are noticed instead of silently lost.
    """
    categories = set()
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CATEGORIES:
            raise Exception(f"unknown class: {class_}")
        categories.update(CLASS_TO_CATEGORIES[class_])
    return sorted(categories)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        categories = categories_for_classes(data["class_list"])
        categories_path = os.path.join("events", slug, "categories")
        with open(categories_path, "w") as f:
            # one category per line, no trailing newline
            f.write("\n".join(categories))


if __name__ == "__main__":
    main()
diff --git a/extract-circuits b/extract-circuits new file mode 100755 index 000000000..e7b9a5564 --- /dev/null +++ b/extract-circuits | |||
| @@ -0,0 +1,85 @@ | |||
#!/usr/bin/env python3
"""Extract circuit memberships for the event slugs given on the command line.

For each slug this reads events/<slug>/data.json, maps the WordPress
class list onto normalized circuit identifiers, and writes the sorted
result, one per line, to events/<slug>/circuits.
"""
import os
import sys
import json

# Normalized circuit identifiers used in the output files.
CIRCUIT_ATRP = "atrp"
CIRCUIT_MAJORS = "majors"
CIRCUIT_RIOS_TRAIL_TROPHY = "rios-trail-trophy"
CIRCUIT_SUPER_HALFS = "super-halfs"
CIRCUIT_ESTRELAS_DE_PORTUGAL = "estrelas-de-portugal"
CIRCUIT_TROFEU_ATLETISMO_ALMADA = "trofeu-atletismo-almada"
CIRCUIT_TROFEU_ALMADA = "trofeu-almada"
CIRCUIT_ATLETISMO_BARREIRO = "atletismo-barreiro"
CIRCUIT_MADEIRA_TRAIL = "circuit-madeira-trail"
CIRCUITO_4_ESTACOES = "quatro-estacoes"

# WordPress event_type_5 class -> list of circuits it implies.
CLASS_TO_CIRCUITS = {
    "event_type_5-circuito-atrp": [CIRCUIT_ATRP],
    "event_type_5-superhalfs": [CIRCUIT_SUPER_HALFS],
    "event_type_5-trofeu-de-almada": [CIRCUIT_TROFEU_ALMADA],
    "event_type_5-trofeu-atletismo-de-almada": [CIRCUIT_TROFEU_ATLETISMO_ALMADA],
    "event_type_5-circuito-trail-madeira": [CIRCUIT_MADEIRA_TRAIL],
    "event_type_5-circuito-estrelas-de-portugal": [CIRCUIT_ESTRELAS_DE_PORTUGAL],
    "event_type_5-circuito-de-atletismo-do-barreiro": [CIRCUIT_ATLETISMO_BARREIRO],
    "event_type_5-majors": [CIRCUIT_MAJORS],
    "event_type_5-3-rios-trail-trophy": [CIRCUIT_RIOS_TRAIL_TROPHY],
    # BUG FIX: this key appeared twice, first mapped to [CIRCUIT_MAJORS].
    # Duplicate dict keys silently keep only the last value, so the
    # effective mapping below is kept and the dead entry removed.
    "event_type_5-circuito-4-estacoes": [CIRCUITO_4_ESTACOES],
    # ignored: event_type classes carry categories, not circuits
    "event_type-corrida-10km": [],
    "event_type-corrida-10-km": [],
    "event_type-corrida-de-15-km": [],
    "event_type-meiamaratona": [],
    "event_type-maratona": [],
    "event_type-trail": [],
    "event_type-kids-trail": [],
    "event_type-trail-curto": [],
    "event_type-trail-longo": [],
    "event_type-trail-endurance": [],
    "event_type-trail-ultra": [],
    "event_type-sao-silvestre": [],
    "event_type-outras": [],
    "event_type-obstaculos": [],
    "event_type-corta-mato": [],
    "event_type-backyard-2": [],
    "event_type-skyrunning": [],
    "event_type-corridas-inferior-10": [],
    "event_type-corrida": [],
    "event_type-kids": [],
    "event_type-caminhada": [],
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry circuit information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_4-",
    "event_type_3-sim",
]


def circuits_for_classes(classes):
    """Return the sorted list of circuits implied by a class list.

    Raises on a class that is neither ignored nor mapped, so that new
    classes appearing on the site are noticed instead of silently lost.
    """
    circuits = set()
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CIRCUITS:
            raise Exception(f"unknown class: {class_}")
        circuits.update(CLASS_TO_CIRCUITS[class_])
    return sorted(circuits)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        circuits = circuits_for_classes(data["class_list"])
        circuits_path = os.path.join("events", slug, "circuits")
        with open(circuits_path, "w") as f:
            # one circuit per line, no trailing newline
            f.write("\n".join(circuits))


if __name__ == "__main__":
    main()
diff --git a/extract-date b/extract-date new file mode 100755 index 000000000..727b77b1e --- /dev/null +++ b/extract-date | |||
| @@ -0,0 +1,25 @@ | |||
#!/usr/bin/env python3
"""Extract the event start date from events/<slug>/ics into events/<slug>/date.

The ics files contain a line such as "DTSTART:20251130"; the eight date
digits are validated and written verbatim to the date file.
"""
import os
import re
import sys

from datetime import datetime


def date_from_ics(ics_content):
    """Return the DTSTART date digits (YYYYMMDD) from ics text.

    Raises AssertionError when no DTSTART line is present and ValueError
    when the digits do not form a valid calendar date.
    """
    match = re.search(r"DTSTART:(\d+)", ics_content)
    assert match is not None
    date = match[1]
    # Validate only: strptime raises on an impossible date like 20251399.
    datetime.strptime(date, "%Y%m%d")
    return date


def main():
    for slug in sys.argv[1:]:
        date_path = os.path.join("events", slug, "date")
        # skip already-extracted events before touching the ics file
        if os.path.exists(date_path):
            continue
        ics_path = os.path.join("events", slug, "ics")
        ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
        date = date_from_ics(ics_content)
        with open(date_path, "w") as f:
            # BUG FIX: the original wrote date_path (the output filename)
            # into the file instead of the extracted date.
            f.write(date)


if __name__ == "__main__":
    main()
diff --git a/extract-id b/extract-id new file mode 100755 index 000000000..698b6ae08 --- /dev/null +++ b/extract-id | |||
| @@ -0,0 +1,17 @@ | |||
#!/usr/bin/env python3
"""Extract the numeric WordPress event id from each event's page.html.

The id is taken from the wp-json API link embedded in the page and
written to events/<slug>/id.
"""
import os
import re
import sys

# Dots escaped so the pattern matches the literal host name only; the
# original pattern let "." match any character.
EVENT_ID_RE = re.compile(
    r'href="https://www\.portugalrunning\.com/wp-json/wp/v2/ajde_events/(\d+)"'
)


def main():
    for slug in sys.argv[1:]:
        page_path = os.path.join("events", slug, "page.html")
        page_content = open(page_path).read()
        event_id_match = EVENT_ID_RE.search(page_content)
        assert event_id_match is not None, f"failed to extract event id from {slug}"
        event_id_path = os.path.join("events", slug, "id")
        with open(event_id_path, "w") as f:
            f.write(event_id_match[1])


if __name__ == "__main__":
    main()
diff --git a/extract-organizer b/extract-organizer new file mode 100755 index 000000000..3739f4ead --- /dev/null +++ b/extract-organizer | |||
| @@ -0,0 +1,83 @@ | |||
#!/usr/bin/env python3
"""Extract the event organizer (when present) into events/<slug>/organizer.

The organizer is derived from the event_organizer-* entry of the
WordPress class list in events/<slug>/data.json; events without a known
organizer simply get no organizer file.
"""
import os
import sys
import json

# Normalized organizer identifiers used in the output files.
ORGANIZER_XISTARCA = "xistarca"
ORGANIZER_WERUN = "werun"
ORGANIZER_HMS_SPORTS = "hms-sports"
ORGANIZER_CPA = "cpa"
ORGANIZER_TRILHO_PERDIDO = "trilho-perdido"
ORGANIZER_EVENT_SPORT = "event-sport"
ORGANIZER_RUN_PORTO = "run-porto"
# NOTE(review): value spells "grafondo" while the class spells
# "granfondo" — looks like a typo, but the value is output data; confirm
# no consumer depends on it before changing.
ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK = "braganca-grafondo-by-trek"
ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA = "vila-franca-de-xira-lisboa"
ORGANIZER_SINTRA_LISBOA = "sintra-lisboa"
ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS = "camara-municipal-de-oeiras"
ORGANIZER_MARCO_DE_CANAVESES_PORTO = "marco-de-canaveses-porto"
ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA = "estadio-universitario-lisboa"
ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA = "clube-do-pessoal-aguas-de-gaia"
ORGANIZER_AARAM = "aaram"
ORGANIZER_SANTA_CRUZTORRES_VEDRAS = "santa-cruztorres-vedras"
ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA = "clube-associacao-desportiva-e-recreativa-da-mata"
ORGANIZER_TURRES_EVENTS = "turres-events"
ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ = "camara-municipal-de-estremoz"
ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES = "grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines"

# WordPress class -> organizer identifier (None means "known, no organizer").
CLASS_TO_ORGANIZER = {
    "event_organizer-xistarca": ORGANIZER_XISTARCA,
    "event_organizer-we-run": ORGANIZER_WERUN,
    "event_organizer-cpa": ORGANIZER_CPA,
    "event_organizer-eventsport": ORGANIZER_EVENT_SPORT,
    "event_organizer-trilho-perdido": ORGANIZER_TRILHO_PERDIDO,
    "event_organizer-runporto": ORGANIZER_RUN_PORTO,
    "event_organizer-hms-sports": ORGANIZER_HMS_SPORTS,
    "event_organizer-vila-franca-de-xira-lisboa": ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA,
    "event_organizer-braganca-granfondo-by-trek": ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK,
    "event_organizer-sintra-lisboa": ORGANIZER_SINTRA_LISBOA,
    "event_organizer-camara-municipal-de-oeiras": ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS,
    "event_organizer-marco-de-canaveses-porto": ORGANIZER_MARCO_DE_CANAVESES_PORTO,
    "event_organizer-estadio-universitario-lisboa": ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA,
    "event_organizer-clube-do-pessoal-aguas-de-gaia": ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA,
    "event_organizer-aaram": ORGANIZER_AARAM,
    "event_organizer-santa-cruztorres-vedras": ORGANIZER_SANTA_CRUZTORRES_VEDRAS,
    "event_organizer-clube-associacao-desportiva-e-recreativa-da-mata": ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA,
    "event_organizer-turres-events": ORGANIZER_TURRES_EVENTS,
    "event_organizer-camara-municipal-de-estremoz": ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ,
    "event_organizer-grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines": ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES,
    # ignored
    "ajde_events": None,
    "type-ajde_events": None,
    "status-publish": None,
    "has-post-thumbnail": None,
    "hentry": None,
}
# Class prefixes that never carry organizer information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_type-",
    "event_type_2",
    "event_type_3-sim",
    "event_type_4-",
    "event_type_5-",
]


def organizer_for_classes(classes):
    """Return the single organizer implied by a class list, or None.

    Raises on an unknown class and when two organizer classes are found.
    """
    organizer = None
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_ORGANIZER:
            raise Exception(f"unknown class: {class_}")
        mapped = CLASS_TO_ORGANIZER[class_]
        # BUG FIX: the original raised "duplicate organizer" whenever ANY
        # mapped class — including the ignored ones mapping to None, such
        # as "status-publish" — followed a real organizer class, and a
        # None mapping could clobber an organizer already found. Skip
        # None mappings entirely.
        if mapped is None:
            continue
        if organizer is not None:
            raise Exception(f"duplicate organizer: {organizer} and {mapped}")
        organizer = mapped
    return organizer


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        organizer = organizer_for_classes(data["class_list"])
        if organizer is not None:
            organizer_path = os.path.join("events", slug, "organizer")
            with open(organizer_path, "w") as f:
                f.write(organizer)


if __name__ == "__main__":
    main()
diff --git a/fetch-data b/fetch-data new file mode 100755 index 000000000..b6fc8c661 --- /dev/null +++ b/fetch-data | |||
| @@ -0,0 +1,16 @@ | |||
#!/usr/bin/env python3
"""Fetch the wp-json event data for each slug into events/<slug>/data.json.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import requests


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        # check before reading the id file so skipped events do no work
        if os.path.exists(data_path):
            continue
        # renamed from `id`, which shadowed the builtin
        event_id = open(os.path.join("events", slug, "id"), "r").read()
        data_url = f"https://portugalrunning.com/wp-json/wp/v2/ajde_events/{event_id}"
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(data_url, timeout=30)
        response.raise_for_status()
        with open(data_path, "w") as f:
            f.write(response.text)


if __name__ == "__main__":
    main()
diff --git a/fetch-ics b/fetch-ics new file mode 100755 index 000000000..0b06ddac3 --- /dev/null +++ b/fetch-ics | |||
| @@ -0,0 +1,17 @@ | |||
#!/usr/bin/env python3
"""Fetch the ics calendar file for each slug into events/<slug>/ics.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import requests


def main():
    for slug in sys.argv[1:]:
        ics_path = os.path.join("events", slug, "ics")
        # check before reading the id file so skipped events do no work
        if os.path.exists(ics_path):
            continue
        # renamed from `id`, which shadowed the builtin
        event_id = open(os.path.join("events", slug, "id"), "r").read()
        ics_url = f"http://www.portugalrunning.com/export-events/{event_id}_0/"
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(ics_url, timeout=30)
        response.raise_for_status()
        with open(ics_path, "wb") as f:
            f.write(response.content)


if __name__ == "__main__":
    main()
diff --git a/fetch-image b/fetch-image new file mode 100755 index 000000000..70758c1d9 --- /dev/null +++ b/fetch-image | |||
| @@ -0,0 +1,20 @@ | |||
#!/usr/bin/env python3
"""Fetch each event's cover image (when present) into events/<slug>/image.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import json
import requests


def main():
    for slug in sys.argv[1:]:
        image_path = os.path.join("events", slug, "image")
        # check before parsing data.json so skipped events do no work
        if os.path.exists(image_path):
            continue
        data_path = os.path.join("events", slug, "data.json")
        data = json.loads(open(data_path).read())
        image_url = data["featured_image_src"]
        # not every event has a cover image; an empty url means "none"
        if image_url == "":
            continue
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        with open(image_path, "wb") as f:
            f.write(response.content)


if __name__ == "__main__":
    main()
diff --git a/fetch-location b/fetch-location new file mode 100755 index 000000000..47bd064f0 --- /dev/null +++ b/fetch-location | |||
| @@ -0,0 +1,243 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import re | ||
| 4 | import sys | ||
| 5 | import json | ||
| 6 | import requests | ||
| 7 | import urllib.parse | ||
| 8 | |||
| 9 | from dataclasses import dataclass, asdict | ||
| 10 | from typing import Optional, Dict, Any | ||
| 11 | |||
| 12 | ENV_GOOGLE_MAPS_API_KEY = "GOOGLE_MAPS_API_KEY" | ||
| 13 | |||
| 14 | # Portugal district codes mapping (ISO 3166-2:PT) | ||
# Portugal district codes mapping (ISO 3166-2:PT)
PORTUGAL_DISTRICT_CODES = {
    # Mainland Districts
    "Aveiro": 1,
    "Beja": 2,
    "Braga": 3,
    "Bragança": 4,
    "Castelo Branco": 5,
    "Coimbra": 6,
    "Évora": 7,
    "Faro": 8,
    "Guarda": 9,
    "Leiria": 10,
    "Lisboa": 11,
    "Portalegre": 12,
    "Porto": 13,
    "Santarém": 14,
    "Setúbal": 15,
    "Viana do Castelo": 16,
    "Vila Real": 17,
    "Viseu": 18,
    # Autonomous Regions
    "Região Autónoma dos Açores": 20,
    "Açores": 20,
    "Azores": 20,
    "Região Autónoma da Madeira": 30,
    "Madeira": 30,
}


def get_district_code(district_name: Optional[str]) -> Optional[int]:
    """Resolve a district name to its ISO 3166-2:PT numeric code.

    Tries an exact lookup, then the long-form autonomous-region aliases,
    then a case-insensitive substring match in either direction. Returns
    None for an empty name or when nothing matches.
    """
    if not district_name:
        return None

    exact = PORTUGAL_DISTRICT_CODES.get(district_name)
    if exact is not None:
        return exact

    normalized = district_name.strip()

    # Long-form autonomous region names map to their short aliases.
    aliases = {
        "Região Autónoma da Madeira": "Madeira",
        "Região Autónoma dos Açores": "Açores",
    }
    if normalized in aliases:
        return PORTUGAL_DISTRICT_CODES[aliases[normalized]]

    # Last resort: substring match in either direction; the first entry
    # (in dict order) that matches wins.
    lowered = normalized.lower()
    for district, code in PORTUGAL_DISTRICT_CODES.items():
        known = district.lower()
        if known in lowered or lowered in known:
            return code

    return None
| 74 | |||
| 75 | |||
@dataclass
class Coordinates:
    """Geographic coordinates (latitude/longitude)."""

    lat: float
    lon: float

    def to_dict(self) -> Dict[str, float]:
        return {"lat": self.lat, "lon": self.lon}


@dataclass
class EventLocation:
    """Location information for an event.

    Optional fields that are unset (falsy) are omitted from the
    serialized form produced by to_dict().
    """

    name: str
    country: str
    locality: str
    coordinates: Optional[Coordinates] = None
    administrative_area_level_1: Optional[str] = None  # District
    administrative_area_level_2: Optional[str] = None  # Municipality
    administrative_area_level_3: Optional[str] = None  # Parish
    district_code: Optional[int] = None  # Portuguese district code

    def to_dict(self) -> Dict[str, Any]:
        serialized: Dict[str, Any] = {
            "name": self.name,
            "country": self.country,
            "locality": self.locality,
        }
        if self.coordinates:
            serialized["coordinates"] = self.coordinates.to_dict()
        for field_name in (
            "administrative_area_level_1",
            "administrative_area_level_2",
            "administrative_area_level_3",
            "district_code",
        ):
            value = getattr(self, field_name)
            if value:
                serialized[field_name] = value
        return serialized
| 117 | |||
class GoogleGeocodingClient:
    """Thin client for the Google Maps Geocoding API.

    geocode() returns an EventLocation or None; all request and parse
    errors are deliberately swallowed so callers can treat every failure
    as "no location available".
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    def _parse_google_response(
        self, location: str, google_result: dict
    ) -> EventLocation:
        """Parse one Google Maps geocoding result into an EventLocation."""
        location_data = {
            "name": location,
            "lat": google_result["geometry"]["location"]["lat"],
            "lon": google_result["geometry"]["location"]["lng"],
            "country": "Portugal",
            "locality": location.split(",")[0].strip(),
            "administrative_area_level_1": None,
            "administrative_area_level_2": None,
            "administrative_area_level_3": None,
            "district_code": None,
        }

        # Extract all administrative levels from address components
        for component in google_result["address_components"]:
            types = component["types"]
            if "country" in types:
                location_data["country"] = component["long_name"]
            elif "administrative_area_level_1" in types:
                location_data["administrative_area_level_1"] = component["long_name"]
                # Use district as locality for Portugal
                location_data["locality"] = component["long_name"]
            elif "administrative_area_level_2" in types:
                location_data["administrative_area_level_2"] = component["long_name"]
            elif "administrative_area_level_3" in types:
                location_data["administrative_area_level_3"] = component["long_name"]

        # Derive the numeric district code from the level-1 area name.
        location_data["district_code"] = get_district_code(
            location_data["administrative_area_level_1"]
        )

        return EventLocation(
            name=location,
            country=location_data["country"],
            locality=location_data["locality"],
            coordinates=Coordinates(lat=location_data["lat"], lon=location_data["lon"]),
            administrative_area_level_1=location_data["administrative_area_level_1"],
            administrative_area_level_2=location_data["administrative_area_level_2"],
            administrative_area_level_3=location_data["administrative_area_level_3"],
            district_code=location_data["district_code"],
        )

    def geocode(self, location: str) -> Optional[EventLocation]:
        """Geocode a free-form location string, biased to Portugal (pt)."""
        params = {
            "address": location,
            "key": self.api_key,
            "region": "pt",
            "language": "pt",
        }

        try:
            url = f"{self.base_url}?{urllib.parse.urlencode(params)}"
            # BUG FIX: added a timeout — requests.get without one can
            # block forever and stall an entire xargs worker.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = json.loads(response.content)
            return self._parse_google_response(location, data["results"][0])
        except Exception:
            # Best-effort by design: network errors, quota errors and an
            # empty "results" list all surface to the caller as None.
            return None
| 194 | |||
| 195 | |||
def clean_ics_location(loc: str) -> str:
    """Normalize a LOCATION value taken from a portugalrunning ics file.

    The site frequently duplicates text — a segment repeated word-for-word
    ("Madeira Madeira"), or a middle segment that is the concatenation of
    the last and first segments — and escapes characters with backslashes.
    Both artifacts are stripped here.
    """

    def dedupe_segment(segment: str) -> str:
        words = segment.strip().split()
        half = len(words) // 2
        # "Madeira Madeira", "Santa Maria da Cruz Santa Maria da Cruz"
        if len(words) % 2 == 0 and words[:half] == words[half:]:
            words = words[:half]
        return " ".join(words)

    cleaned = loc.strip().replace("\\", "")
    segments = [dedupe_segment(part) for part in cleaned.split(",")]

    # "Cabeço de Vida, Fontreira Cabeço de Vida, Fontreira":
    # the middle segment is "<last> <first>"; keep only first and last.
    if len(segments) == 3 and segments[1] == f"{segments[2]} {segments[0]}":
        segments = [segments[0], segments[2]]
    return ", ".join(segments)
| 217 | |||
| 218 | |||
# Hand-maintained corrections for locations the geocoder handles poorly.
FIXUP_TABLE = {"Alcaria da Serra, Beja": "Alcaria da Serra"}


def main():
    """Geocode each slug's ics LOCATION into events/<slug>/location (json).

    Exits with status 1 on the first location that fails to geocode so
    the scrape surfaces the failure instead of silently skipping it.
    """
    # Env read moved into main() so importing this file never raises.
    key = os.environ[ENV_GOOGLE_MAPS_API_KEY]
    client = GoogleGeocodingClient(key)
    for slug in sys.argv[1:]:
        location_path = os.path.join("events", slug, "location")
        if os.path.exists(location_path):
            continue
        ics_path = os.path.join("events", slug, "ics")
        ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
        ics_location_match = re.search(r"LOCATION:(.*)", ics_content)
        assert ics_location_match is not None
        ics_location = clean_ics_location(ics_location_match[1])
        # an empty LOCATION means the event has no usable location
        if ics_location == "":
            continue
        ics_location = FIXUP_TABLE.get(ics_location, ics_location)
        location = client.geocode(ics_location)
        if location is None:
            print(f"failed to obtain location from {slug} '{ics_location}'")
            sys.exit(1)
        with open(location_path, "w") as f:
            json.dump(location.to_dict(), f)


if __name__ == "__main__":
    main()
diff --git a/fetch-oneline-description b/fetch-oneline-description new file mode 100755 index 000000000..6f592e681 --- /dev/null +++ b/fetch-oneline-description | |||
| @@ -0,0 +1,73 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import json | ||
| 5 | import subprocess | ||
| 6 | |||
| 7 | MODEL = "openrouter/anthropic/claude-3.5-haiku" | ||
| 8 | |||
| 9 | |||
class LLMClient:
    """Client for LLM description generation.

    Wraps the external ``llm`` command-line tool (invoked via
    subprocess) rather than calling an HTTP API directly, so model and
    API-key configuration is delegated to that tool.
    """

    def __init__(self, model: str):
        # Model identifier passed to `llm -m`, e.g. an openrouter model path.
        self.model = model

    def llm_call(
        self,
        system_prompt: str,
        user_prompt: str,
    ) -> str:
        """Run one completion through the `llm` CLI and return its stdout, stripped.

        Raises subprocess.CalledProcessError on a non-zero exit (check=True)
        and subprocess.TimeoutExpired after 30 seconds.
        """
        proc = subprocess.run(
            ["llm", "-m", self.model, "-s", system_prompt, user_prompt],
            timeout=30,
            check=True,
            capture_output=True,
            text=True,
        )
        stdout = proc.stdout.strip()
        return stdout

    def generate_description(self, text: str) -> str:
        """Generate short description using LLM."""
        # The prompt is deliberately written in European Portuguese: the
        # generated one-line summaries are user-facing text for
        # Portuguese running events.
        system_prompt = """És um assistente especializado em condensar descrições de eventos de corrida em resumos de uma linha em português de Portugal. Deves extrair e resumir apenas a informação mais importante e relevante da descrição fornecida.

Exemplos de resumos que deves gerar:
+ Corrida histórica pelas ruas de Lisboa com vista para o Tejo
+ Trail desafiante pela Serra da Estrela
+ São Silvestre tradicional no centro histórico do Porto
+ Meia maratona costeira com paisagens do Atlântico
+ Corrida solidária organizada pela câmara municipal
+ Prova de montanha com subidas técnicas
+ Corrida de Natal pela zona ribeirinha
+ Trail nocturno por caminhos antigos

IMPORTANTE:
- Responde APENAS com a descrição de uma linha em português de Portugal
- Usa apenas informação presente na descrição original
- Destaca características especiais do percurso, localização ou organização
- Não menciones distâncias se já estão implícitas no tipo de evento
- Foca-te no que torna este evento único ou interessante"""

        return self.llm_call(
            system_prompt,
            text,
        )
| 56 | |||
| 57 | |||
# Generate a one-line description for every event that has scraped data but
# no summary yet.  Safe to re-run: existing summaries are never regenerated.
client = LLMClient(MODEL)
for slug in sys.argv[1:]:
    data_path = os.path.join("events", slug, "data.json")
    oneline_path = os.path.join("events", slug, "oneline-description")
    if not os.path.exists(data_path):
        continue
    # Check for an existing summary before reading the JSON so reruns skip
    # finished events cheaply.
    if os.path.exists(oneline_path):
        continue
    with open(data_path, "r") as data_file:
        data = json.load(data_file)
    description = data["content"]["rendered"].strip()
    if description == "":
        continue
    oneline = client.generate_description(description)
    with open(oneline_path, "w") as f:
        f.write(oneline)
| 73 | |||
diff --git a/fetch-page b/fetch-page new file mode 100755 index 000000000..085b54e7b --- /dev/null +++ b/fetch-page | |||
| @@ -0,0 +1,15 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import requests | ||
| 5 | |||
# Download the public event page for every slug given on the command line.
# Existing pages are kept, so an interrupted scrape can be resumed.
for slug in sys.argv[1:]:
    url = f"https://www.portugalrunning.com/eventos/{slug}"
    page_path = os.path.join("events", slug, "page.html")
    if os.path.exists(page_path):
        continue
    # A timeout keeps one stuck connection from hanging the whole batch:
    # the Justfile runs this script with high parallelism (-P 40).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(page_path, "w") as f:
        f.write(response.text)
| 15 | |||
diff --git a/fetch-sitemap b/fetch-sitemap new file mode 100755 index 000000000..b952ab529 --- /dev/null +++ b/fetch-sitemap | |||
| @@ -0,0 +1,2 @@ | |||
#!/usr/bin/env sh
# Download the events sitemap consumed by setup-directories.
# -f  fail on HTTP errors instead of saving the error page as sitemap.xml
# -sS silence the progress meter but still report errors
# -L  follow redirects
curl -fsSL https://www.portugalrunning.com/ajde_events-sitemap.xml -o sitemap.xml
diff --git a/list-slugs b/list-slugs new file mode 100755 index 000000000..a409e5d1e --- /dev/null +++ b/list-slugs | |||
| @@ -0,0 +1,2 @@ | |||
#!/usr/bin/env sh
# Print one event slug per line: the slugs are the directory names created
# under events/ by setup-directories.
ls -1 events
diff --git a/setup-directories b/setup-directories new file mode 100755 index 000000000..d3548b14f --- /dev/null +++ b/setup-directories | |||
| @@ -0,0 +1,45 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import re | ||
| 3 | import os | ||
| 4 | import shutil | ||
| 5 | import xml.etree.ElementTree as ET | ||
| 6 | |||
# Sitemap URLs that are not individual event pages.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]
tree = ET.parse("sitemap.xml")
root = tree.getroot()

# Compiled once outside the loop.  Dots are escaped so the pattern matches
# the literal hostname rather than any character.
slug_pattern = re.compile(r"https://www\.portugalrunning\.com/eventos/([^/]*)/")

# `{*}` matches any XML namespace, so this works regardless of the exact
# namespace URI the sitemap declares.
for url_element in root.findall(".//{*}url"):
    loc = url_element.find("{*}loc")
    lastmod = url_element.find("{*}lastmod")
    assert loc is not None
    assert lastmod is not None

    url = loc.text
    lastmod = lastmod.text

    assert url is not None
    assert lastmod is not None

    url = url.strip()
    lastmod = lastmod.strip()

    if url in ignored_urls:
        continue

    slug_match = slug_pattern.match(url)
    assert slug_match is not None, f"failed to extract slug from '{url}'"
    slug = slug_match[1]

    event_dir = os.path.join("events", slug)
    lastmod_path = os.path.join(event_dir, "lastmod")

    # Unchanged since the last scrape: keep the directory and any artifacts
    # already fetched into it.
    if os.path.exists(lastmod_path):
        with open(lastmod_path) as f:
            if f.read() == lastmod:
                continue

    # New or modified event: start from a clean directory so every artifact
    # is re-fetched for the updated page.
    if os.path.exists(event_dir):
        shutil.rmtree(event_dir)
    os.makedirs(event_dir, exist_ok=True)
    with open(lastmod_path, "w") as f:
        f.write(lastmod)
| 45 | |||
