diff options
| author | diogo464 <[email protected]> | 2025-07-21 15:02:48 +0100 |
|---|---|---|
| committer | diogo464 <[email protected]> | 2025-07-21 15:02:48 +0100 |
| commit | 8c8dabd0ed20679a2dad43a5c239f9fcfe1c1ad7 (patch) | |
| tree | 55abbcfbbff19efa3aaf6cf36540ac7651c54973 | |
init
| -rw-r--r-- | .envrc | 5 | ||||
| -rw-r--r-- | Justfile | 26 | ||||
| -rw-r--r-- | README.md | 40 | ||||
| -rwxr-xr-x | extract-categories | 74 | ||||
| -rwxr-xr-x | extract-circuits | 85 | ||||
| -rwxr-xr-x | extract-date | 25 | ||||
| -rwxr-xr-x | extract-id | 17 | ||||
| -rwxr-xr-x | extract-organizer | 83 | ||||
| -rwxr-xr-x | fetch-data | 16 | ||||
| -rwxr-xr-x | fetch-ics | 17 | ||||
| -rwxr-xr-x | fetch-image | 20 | ||||
| -rwxr-xr-x | fetch-location | 243 | ||||
| -rwxr-xr-x | fetch-oneline-description | 73 | ||||
| -rwxr-xr-x | fetch-page | 15 | ||||
| -rwxr-xr-x | fetch-sitemap | 2 | ||||
| -rwxr-xr-x | list-slugs | 2 | ||||
| -rwxr-xr-x | setup-directories | 45 |
17 files changed, 788 insertions, 0 deletions
| @@ -0,0 +1,5 @@ | |||
| 1 | export GOOGLE_MAPS_API_KEY=$(pass google-geocoding-api-key) | ||
| 2 | export ANTHROPIC_API_KEY=$(pass api/anthropic) | ||
| 3 | export OPENROUTER_KEY=$(pass api/openrouter) | ||
| 4 | export PUSHOVER_USERKEY=$(pass api/pushover/user-key) | ||
| 5 | export PUSHOVER_KEY=$(pass api/pushover/production) | ||
diff --git a/Justfile b/Justfile new file mode 100644 index 000000000..f91b708a3 --- /dev/null +++ b/Justfile | |||
| @@ -0,0 +1,26 @@ | |||
| 1 | _default: | ||
| 2 | just --list | ||
| 3 | |||
| 4 | scrape: | ||
| 5 | ./fetch-sitemap | ||
| 6 | ./setup-directories | ||
| 7 | ./list-slugs | xargs -L 64 -P 40 ./fetch-page | ||
| 8 | ./list-slugs | xargs -L 64 -P 0 ./extract-id | ||
| 9 | ./list-slugs | xargs -L 64 -P 40 ./fetch-ics | ||
| 10 | ./list-slugs | xargs -L 64 -P 40 ./fetch-data | ||
| 11 | ./list-slugs | xargs -L 64 -P 40 ./fetch-image | ||
| 12 | ./list-slugs | xargs -L 64 -P 40 ./fetch-location | ||
| 13 | ./list-slugs | xargs -L 64 -P 20 ./fetch-oneline-description | ||
| 14 | ./list-slugs | xargs -L 64 -P 40 ./extract-date | ||
| 15 | ./list-slugs | xargs -L 64 -P 40 ./extract-categories | ||
| 16 | ./list-slugs | xargs -L 64 -P 40 ./extract-circuits | ||
| 17 | ./list-slugs | xargs -L 64 -P 40 ./extract-organizer | ||
| 18 | |||
| 19 | scrape-categories: | ||
| 20 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-categories | ||
| 21 | |||
| 22 | scrape-circuits: | ||
| 23 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-circuits | ||
| 24 | |||
| 25 | scrape-organizer: | ||
| 26 | ./list-slugs | xargs -e -L 64 -P 1 ./extract-organizer | ||
diff --git a/README.md b/README.md new file mode 100644 index 000000000..dc718ceeb --- /dev/null +++ b/README.md | |||
| @@ -0,0 +1,40 @@ | |||
| 1 | # portugal-running-data | ||
| 2 | repo with scraper for the portugal running calendar data | ||
| 3 | |||
| 4 | | Filename | Source Script | Optional | Description | | ||
| 5 | | ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------- | | ||
| 6 | | `lastmod` | `setup-directories` | no | last modification time extracted from the sitemap file | | ||
| 7 | | `page.html` | `fetch-page` | no | event page from portugalrunning.com | | ||
| 8 | | `id` | `extract-id` | no | event numeric id from wordpress | | ||
| 9 | | `data.json` | `fetch-data` | no | json file with some event data | | ||
| 10 | | `ics` | `fetch-ics` | no | calendar file with location, date and other event information | | ||
| 11 | | `location` | `fetch-location` | yes | location data for the event | | ||
| 12 | | `image` | `fetch-image` | yes | cover image for the event | | ||
| 13 | | `date` | `extract-date` | no | event date extracted from the ics file | | ||
| 14 | | `oneline-description` | `fetch-oneline-description` | yes | ai generated one line description | | ||
| 15 | | `categories` | `extract-categories` | no | event categories | | ||
| 16 | | `circuits` | `extract-circuits` | no | event circuits | | ||
| 17 | |||
| 18 | ## `fetch-sitemap` | ||
| 19 | this script fetches the sitemap that contains a list of event page urls and the last modification date | ||
| 20 | |||
| 21 | ## `fetch-page` | ||
| 22 | this script will fetch any missing pages or outdated pages by looking at the lastmod file. | ||
| 23 | |||
| 24 | ## `extract-id` | ||
| 25 | this script will extract the event ids from the page.html file. this id can be used to later fetch other data related to this event. | ||
| 26 | |||
| 27 | ## `fetch-ics` | ||
| 28 | this script uses the event id and fetches its ics file. | ||
| 29 | |||
| 30 | ## `fetch-data` | ||
| 31 | this script uses the event id to fetch some event data in json format. | ||
| 32 | |||
| 33 | ## `fetch-image` | ||
| 34 | some events have a main image in the json data file, this script will fetch that image. | ||
| 35 | |||
| 36 | ## `extract-organizer` | ||
| 37 | this script extracts the organizer from the class list in the json data file, if one exists. | ||
| 38 | |||
| 39 | ## `extract-categories` | ||
| 40 | this script extracts a list of categories from the class list in the json data file. | ||
diff --git a/extract-categories b/extract-categories new file mode 100755 index 000000000..1683ecb75 --- /dev/null +++ b/extract-categories | |||
| @@ -0,0 +1,74 @@ | |||
#!/usr/bin/env python3
"""Extract category tags for the event slugs given on the command line.

For each slug this reads events/<slug>/data.json, maps the WordPress
class list onto normalized category tags, and writes the sorted tags,
one per line, to events/<slug>/categories.
"""
import os
import sys
import json

# Normalized category identifiers used in the output files.
CATEGORY_RUN = "run"
CATEGORY_WALK = "walk"
CATEGORY_TRAIL = "trail"
CATEGORY_KIDS = "kids"
CATEGORY_SAINT_SILVESTER = "saint-silvester"
CATEGORY_10K = "10k"
CATEGORY_15K = "15k"
CATEGORY_HALF_MARATHON = "half-marathon"
CATEGORY_MARATHON = "marathon"

# WordPress event_type class -> list of categories it implies.
CLASS_TO_CATEGORIES = {
    "event_type-corrida": [CATEGORY_RUN],
    "event_type-caminhada": [CATEGORY_WALK],
    "event_type-corrida-10km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-10-km": [CATEGORY_RUN, CATEGORY_10K],
    "event_type-corrida-de-15-km": [CATEGORY_RUN, CATEGORY_15K],
    "event_type-trail": [CATEGORY_TRAIL],
    "event_type-kids-trail": [CATEGORY_TRAIL, CATEGORY_KIDS],
    "event_type-trail-curto": [CATEGORY_TRAIL],
    "event_type-trail-longo": [CATEGORY_TRAIL],
    "event_type-trail-endurance": [CATEGORY_TRAIL],
    "event_type-trail-ultra": [CATEGORY_TRAIL],
    "event_type-sao-silvestre": [CATEGORY_SAINT_SILVESTER],
    "event_type-outras": [],
    "event_type-obstaculos": [CATEGORY_RUN],
    "event_type-corta-mato": [CATEGORY_RUN],
    "event_type-backyard-2": [CATEGORY_RUN],
    "event_type-meiamaratona": [CATEGORY_RUN, CATEGORY_HALF_MARATHON],
    "event_type-maratona": [CATEGORY_RUN, CATEGORY_MARATHON],
    "event_type-skyrunning": [CATEGORY_RUN],
    "event_type-corridas-inferior-10": [CATEGORY_RUN],
    "event_type-kids": [CATEGORY_KIDS],
    # ignored: classes that carry no category information
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry category information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_3-",
    "event_type_4-",
    "event_type_5-",
]


def categories_for_classes(classes):
    """Return the sorted list of categories implied by a class list.

    Raises on a class that is neither ignored nor mapped, so that new
    classes appearing on the site are noticed instead of silently lost.
    """
    categories = set()
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CATEGORIES:
            raise Exception(f"unknown class: {class_}")
        categories.update(CLASS_TO_CATEGORIES[class_])
    return sorted(categories)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        categories = categories_for_classes(data["class_list"])
        categories_path = os.path.join("events", slug, "categories")
        with open(categories_path, "w") as f:
            # one category per line, no trailing newline
            f.write("\n".join(categories))


if __name__ == "__main__":
    main()
diff --git a/extract-circuits b/extract-circuits new file mode 100755 index 000000000..e7b9a5564 --- /dev/null +++ b/extract-circuits | |||
| @@ -0,0 +1,85 @@ | |||
#!/usr/bin/env python3
"""Extract circuit memberships for the event slugs given on the command line.

For each slug this reads events/<slug>/data.json, maps the WordPress
class list onto normalized circuit identifiers, and writes the sorted
result, one per line, to events/<slug>/circuits.
"""
import os
import sys
import json

# Normalized circuit identifiers used in the output files.
CIRCUIT_ATRP = "atrp"
CIRCUIT_MAJORS = "majors"
CIRCUIT_RIOS_TRAIL_TROPHY = "rios-trail-trophy"
CIRCUIT_SUPER_HALFS = "super-halfs"
CIRCUIT_ESTRELAS_DE_PORTUGAL = "estrelas-de-portugal"
CIRCUIT_TROFEU_ATLETISMO_ALMADA = "trofeu-atletismo-almada"
CIRCUIT_TROFEU_ALMADA = "trofeu-almada"
CIRCUIT_ATLETISMO_BARREIRO = "atletismo-barreiro"
CIRCUIT_MADEIRA_TRAIL = "circuit-madeira-trail"
CIRCUITO_4_ESTACOES = "quatro-estacoes"

# WordPress event_type_5 class -> list of circuits it implies.
CLASS_TO_CIRCUITS = {
    "event_type_5-circuito-atrp": [CIRCUIT_ATRP],
    "event_type_5-superhalfs": [CIRCUIT_SUPER_HALFS],
    "event_type_5-trofeu-de-almada": [CIRCUIT_TROFEU_ALMADA],
    "event_type_5-trofeu-atletismo-de-almada": [CIRCUIT_TROFEU_ATLETISMO_ALMADA],
    "event_type_5-circuito-trail-madeira": [CIRCUIT_MADEIRA_TRAIL],
    "event_type_5-circuito-estrelas-de-portugal": [CIRCUIT_ESTRELAS_DE_PORTUGAL],
    "event_type_5-circuito-de-atletismo-do-barreiro": [CIRCUIT_ATLETISMO_BARREIRO],
    "event_type_5-majors": [CIRCUIT_MAJORS],
    "event_type_5-3-rios-trail-trophy": [CIRCUIT_RIOS_TRAIL_TROPHY],
    # BUG FIX: this key appeared twice, first mapped to [CIRCUIT_MAJORS].
    # Duplicate dict keys silently keep only the last value, so the
    # effective mapping below is kept and the dead entry removed.
    "event_type_5-circuito-4-estacoes": [CIRCUITO_4_ESTACOES],
    # ignored: event_type classes carry categories, not circuits
    "event_type-corrida-10km": [],
    "event_type-corrida-10-km": [],
    "event_type-corrida-de-15-km": [],
    "event_type-meiamaratona": [],
    "event_type-maratona": [],
    "event_type-trail": [],
    "event_type-kids-trail": [],
    "event_type-trail-curto": [],
    "event_type-trail-longo": [],
    "event_type-trail-endurance": [],
    "event_type-trail-ultra": [],
    "event_type-sao-silvestre": [],
    "event_type-outras": [],
    "event_type-obstaculos": [],
    "event_type-corta-mato": [],
    "event_type-backyard-2": [],
    "event_type-skyrunning": [],
    "event_type-corridas-inferior-10": [],
    "event_type-corrida": [],
    "event_type-kids": [],
    "event_type-caminhada": [],
    "ajde_events": [],
    "type-ajde_events": [],
    "status-publish": [],
    "has-post-thumbnail": [],
    "hentry": [],
}
# Class prefixes that never carry circuit information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_organizer-",
    "event_type_2-",
    "event_type_4-",
    "event_type_3-sim",
]


def circuits_for_classes(classes):
    """Return the sorted list of circuits implied by a class list.

    Raises on a class that is neither ignored nor mapped, so that new
    classes appearing on the site are noticed instead of silently lost.
    """
    circuits = set()
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_CIRCUITS:
            raise Exception(f"unknown class: {class_}")
        circuits.update(CLASS_TO_CIRCUITS[class_])
    return sorted(circuits)


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        circuits = circuits_for_classes(data["class_list"])
        circuits_path = os.path.join("events", slug, "circuits")
        with open(circuits_path, "w") as f:
            # one circuit per line, no trailing newline
            f.write("\n".join(circuits))


if __name__ == "__main__":
    main()
diff --git a/extract-date b/extract-date new file mode 100755 index 000000000..727b77b1e --- /dev/null +++ b/extract-date | |||
| @@ -0,0 +1,25 @@ | |||
#!/usr/bin/env python3
"""Extract the event start date from events/<slug>/ics into events/<slug>/date.

The ics files contain a line such as "DTSTART:20251130"; the eight date
digits are validated and written verbatim to the date file.
"""
import os
import re
import sys

from datetime import datetime


def date_from_ics(ics_content):
    """Return the DTSTART date digits (YYYYMMDD) from ics text.

    Raises AssertionError when no DTSTART line is present and ValueError
    when the digits do not form a valid calendar date.
    """
    match = re.search(r"DTSTART:(\d+)", ics_content)
    assert match is not None
    date = match[1]
    # Validate only: strptime raises on an impossible date like 20251399.
    datetime.strptime(date, "%Y%m%d")
    return date


def main():
    for slug in sys.argv[1:]:
        date_path = os.path.join("events", slug, "date")
        # skip already-extracted events before touching the ics file
        if os.path.exists(date_path):
            continue
        ics_path = os.path.join("events", slug, "ics")
        ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
        date = date_from_ics(ics_content)
        with open(date_path, "w") as f:
            # BUG FIX: the original wrote date_path (the output filename)
            # into the file instead of the extracted date.
            f.write(date)


if __name__ == "__main__":
    main()
diff --git a/extract-id b/extract-id new file mode 100755 index 000000000..698b6ae08 --- /dev/null +++ b/extract-id | |||
| @@ -0,0 +1,17 @@ | |||
#!/usr/bin/env python3
"""Extract the numeric WordPress event id from each event's page.html.

The id is taken from the wp-json API link embedded in the page and
written to events/<slug>/id.
"""
import os
import re
import sys

# Dots escaped so the pattern matches the literal host name only; the
# original pattern let "." match any character.
EVENT_ID_RE = re.compile(
    r'href="https://www\.portugalrunning\.com/wp-json/wp/v2/ajde_events/(\d+)"'
)


def main():
    for slug in sys.argv[1:]:
        page_path = os.path.join("events", slug, "page.html")
        page_content = open(page_path).read()
        event_id_match = EVENT_ID_RE.search(page_content)
        assert event_id_match is not None, f"failed to extract event id from {slug}"
        event_id_path = os.path.join("events", slug, "id")
        with open(event_id_path, "w") as f:
            f.write(event_id_match[1])


if __name__ == "__main__":
    main()
diff --git a/extract-organizer b/extract-organizer new file mode 100755 index 000000000..3739f4ead --- /dev/null +++ b/extract-organizer | |||
| @@ -0,0 +1,83 @@ | |||
#!/usr/bin/env python3
"""Extract the event organizer (when present) into events/<slug>/organizer.

The organizer is derived from the event_organizer-* entry of the
WordPress class list in events/<slug>/data.json; events without a known
organizer simply get no organizer file.
"""
import os
import sys
import json

# Normalized organizer identifiers used in the output files.
ORGANIZER_XISTARCA = "xistarca"
ORGANIZER_WERUN = "werun"
ORGANIZER_HMS_SPORTS = "hms-sports"
ORGANIZER_CPA = "cpa"
ORGANIZER_TRILHO_PERDIDO = "trilho-perdido"
ORGANIZER_EVENT_SPORT = "event-sport"
ORGANIZER_RUN_PORTO = "run-porto"
# NOTE(review): value spells "grafondo" while the class spells
# "granfondo" — looks like a typo, but the value is output data; confirm
# no consumer depends on it before changing.
ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK = "braganca-grafondo-by-trek"
ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA = "vila-franca-de-xira-lisboa"
ORGANIZER_SINTRA_LISBOA = "sintra-lisboa"
ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS = "camara-municipal-de-oeiras"
ORGANIZER_MARCO_DE_CANAVESES_PORTO = "marco-de-canaveses-porto"
ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA = "estadio-universitario-lisboa"
ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA = "clube-do-pessoal-aguas-de-gaia"
ORGANIZER_AARAM = "aaram"
ORGANIZER_SANTA_CRUZTORRES_VEDRAS = "santa-cruztorres-vedras"
ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA = "clube-associacao-desportiva-e-recreativa-da-mata"
ORGANIZER_TURRES_EVENTS = "turres-events"
ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ = "camara-municipal-de-estremoz"
ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES = "grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines"

# WordPress class -> organizer identifier (None means "known, no organizer").
CLASS_TO_ORGANIZER = {
    "event_organizer-xistarca": ORGANIZER_XISTARCA,
    "event_organizer-we-run": ORGANIZER_WERUN,
    "event_organizer-cpa": ORGANIZER_CPA,
    "event_organizer-eventsport": ORGANIZER_EVENT_SPORT,
    "event_organizer-trilho-perdido": ORGANIZER_TRILHO_PERDIDO,
    "event_organizer-runporto": ORGANIZER_RUN_PORTO,
    "event_organizer-hms-sports": ORGANIZER_HMS_SPORTS,
    "event_organizer-vila-franca-de-xira-lisboa": ORGANIZER_VILA_FRANCA_DE_XIRA_LISBOA,
    "event_organizer-braganca-granfondo-by-trek": ORGANIZER_BRAGANCA_GRANFONDO_BY_TREK,
    "event_organizer-sintra-lisboa": ORGANIZER_SINTRA_LISBOA,
    "event_organizer-camara-municipal-de-oeiras": ORGANIZER_CAMARA_MUNICIPAL_DE_OEIRAS,
    "event_organizer-marco-de-canaveses-porto": ORGANIZER_MARCO_DE_CANAVESES_PORTO,
    "event_organizer-estadio-universitario-lisboa": ORGANIZER_ESTADIO_UNIVERSITARIO_LISBOA,
    "event_organizer-clube-do-pessoal-aguas-de-gaia": ORGANIZER_CLUBE_DO_PESSOAL_AGUAS_DE_GAIA,
    "event_organizer-aaram": ORGANIZER_AARAM,
    "event_organizer-santa-cruztorres-vedras": ORGANIZER_SANTA_CRUZTORRES_VEDRAS,
    "event_organizer-clube-associacao-desportiva-e-recreativa-da-mata": ORGANIZER_CLUBE_ASSOCIACAO_DESPORTIVA_E_RECREATIVA_DA_MATA,
    "event_organizer-turres-events": ORGANIZER_TURRES_EVENTS,
    "event_organizer-camara-municipal-de-estremoz": ORGANIZER_CAMARA_MUNICIPAL_DE_ESTREMOZ,
    "event_organizer-grupo-de-cultura-e-desporto-dos-bombeiros-voluntarios-de-s-b-messines": ORGANIZER_GRUPO_DE_CULTURA_E_DESPORTO_DOS_BOMBEIROS_VOLUNTARIOS_DE_S_B_MESSINES,
    # ignored
    "ajde_events": None,
    "type-ajde_events": None,
    "status-publish": None,
    "has-post-thumbnail": None,
    "hentry": None,
}
# Class prefixes that never carry organizer information.
CLASS_IGNORE_PREFIXES = [
    "post-",
    "event_location-",
    "event_type-",
    "event_type_2",
    "event_type_3-sim",
    "event_type_4-",
    "event_type_5-",
]


def organizer_for_classes(classes):
    """Return the single organizer implied by a class list, or None.

    Raises on an unknown class and when two organizer classes are found.
    """
    organizer = None
    for class_ in classes:
        if any(class_.startswith(p) for p in CLASS_IGNORE_PREFIXES):
            continue
        if class_ not in CLASS_TO_ORGANIZER:
            raise Exception(f"unknown class: {class_}")
        mapped = CLASS_TO_ORGANIZER[class_]
        # BUG FIX: the original raised "duplicate organizer" whenever ANY
        # mapped class — including the ignored ones mapping to None, such
        # as "status-publish" — followed a real organizer class, and a
        # None mapping could clobber an organizer already found. Skip
        # None mappings entirely.
        if mapped is None:
            continue
        if organizer is not None:
            raise Exception(f"duplicate organizer: {organizer} and {mapped}")
        organizer = mapped
    return organizer


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        with open(data_path, "r") as f:
            data = json.load(f)
        organizer = organizer_for_classes(data["class_list"])
        if organizer is not None:
            organizer_path = os.path.join("events", slug, "organizer")
            with open(organizer_path, "w") as f:
                f.write(organizer)


if __name__ == "__main__":
    main()
diff --git a/fetch-data b/fetch-data new file mode 100755 index 000000000..b6fc8c661 --- /dev/null +++ b/fetch-data | |||
| @@ -0,0 +1,16 @@ | |||
#!/usr/bin/env python3
"""Fetch the wp-json event data for each slug into events/<slug>/data.json.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import requests


def main():
    for slug in sys.argv[1:]:
        data_path = os.path.join("events", slug, "data.json")
        # check before reading the id file so skipped events do no work
        if os.path.exists(data_path):
            continue
        # renamed from `id`, which shadowed the builtin
        event_id = open(os.path.join("events", slug, "id"), "r").read()
        data_url = f"https://portugalrunning.com/wp-json/wp/v2/ajde_events/{event_id}"
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(data_url, timeout=30)
        response.raise_for_status()
        with open(data_path, "w") as f:
            f.write(response.text)


if __name__ == "__main__":
    main()
diff --git a/fetch-ics b/fetch-ics new file mode 100755 index 000000000..0b06ddac3 --- /dev/null +++ b/fetch-ics | |||
| @@ -0,0 +1,17 @@ | |||
#!/usr/bin/env python3
"""Fetch the ics calendar file for each slug into events/<slug>/ics.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import requests


def main():
    for slug in sys.argv[1:]:
        ics_path = os.path.join("events", slug, "ics")
        # check before reading the id file so skipped events do no work
        if os.path.exists(ics_path):
            continue
        # renamed from `id`, which shadowed the builtin
        event_id = open(os.path.join("events", slug, "id"), "r").read()
        ics_url = f"http://www.portugalrunning.com/export-events/{event_id}_0/"
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(ics_url, timeout=30)
        response.raise_for_status()
        with open(ics_path, "wb") as f:
            f.write(response.content)


if __name__ == "__main__":
    main()
diff --git a/fetch-image b/fetch-image new file mode 100755 index 000000000..70758c1d9 --- /dev/null +++ b/fetch-image | |||
| @@ -0,0 +1,20 @@ | |||
#!/usr/bin/env python3
"""Fetch each event's cover image (when present) into events/<slug>/image.

Already-fetched events are skipped, so the scrape is resumable.
"""
import os
import sys
import json
import requests


def main():
    for slug in sys.argv[1:]:
        image_path = os.path.join("events", slug, "image")
        # check before parsing data.json so skipped events do no work
        if os.path.exists(image_path):
            continue
        data_path = os.path.join("events", slug, "data.json")
        data = json.loads(open(data_path).read())
        image_url = data["featured_image_src"]
        # not every event has a cover image; an empty url means "none"
        if image_url == "":
            continue
        # timeout so one stuck request cannot hang an xargs worker forever
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        with open(image_path, "wb") as f:
            f.write(response.content)


if __name__ == "__main__":
    main()
diff --git a/fetch-location b/fetch-location new file mode 100755 index 000000000..47bd064f0 --- /dev/null +++ b/fetch-location | |||
| @@ -0,0 +1,243 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import re | ||
| 4 | import sys | ||
| 5 | import json | ||
| 6 | import requests | ||
| 7 | import urllib.parse | ||
| 8 | |||
| 9 | from dataclasses import dataclass, asdict | ||
| 10 | from typing import Optional, Dict, Any | ||
| 11 | |||
| 12 | ENV_GOOGLE_MAPS_API_KEY = "GOOGLE_MAPS_API_KEY" | ||
| 13 | |||
| 14 | # Portugal district codes mapping (ISO 3166-2:PT) | ||
# Portugal district codes mapping (ISO 3166-2:PT)
PORTUGAL_DISTRICT_CODES = {
    # Mainland Districts
    "Aveiro": 1,
    "Beja": 2,
    "Braga": 3,
    "Bragança": 4,
    "Castelo Branco": 5,
    "Coimbra": 6,
    "Évora": 7,
    "Faro": 8,
    "Guarda": 9,
    "Leiria": 10,
    "Lisboa": 11,
    "Portalegre": 12,
    "Porto": 13,
    "Santarém": 14,
    "Setúbal": 15,
    "Viana do Castelo": 16,
    "Vila Real": 17,
    "Viseu": 18,
    # Autonomous Regions
    "Região Autónoma dos Açores": 20,
    "Açores": 20,
    "Azores": 20,
    "Região Autónoma da Madeira": 30,
    "Madeira": 30,
}


def get_district_code(district_name: Optional[str]) -> Optional[int]:
    """Resolve a district name to its ISO 3166-2:PT numeric code.

    Tries an exact lookup, then the long-form autonomous-region aliases,
    then a case-insensitive substring match in either direction. Returns
    None for an empty name or when nothing matches.
    """
    if not district_name:
        return None

    exact = PORTUGAL_DISTRICT_CODES.get(district_name)
    if exact is not None:
        return exact

    normalized = district_name.strip()

    # Long-form autonomous region names map to their short aliases.
    aliases = {
        "Região Autónoma da Madeira": "Madeira",
        "Região Autónoma dos Açores": "Açores",
    }
    if normalized in aliases:
        return PORTUGAL_DISTRICT_CODES[aliases[normalized]]

    # Last resort: substring match in either direction; the first entry
    # (in dict order) that matches wins.
    lowered = normalized.lower()
    for district, code in PORTUGAL_DISTRICT_CODES.items():
        known = district.lower()
        if known in lowered or lowered in known:
            return code

    return None
| 74 | |||
| 75 | |||
@dataclass
class Coordinates:
    """Geographic coordinates (latitude/longitude)."""

    lat: float
    lon: float

    def to_dict(self) -> Dict[str, float]:
        return {"lat": self.lat, "lon": self.lon}


@dataclass
class EventLocation:
    """Location information for an event.

    Optional fields that are unset (falsy) are omitted from the
    serialized form produced by to_dict().
    """

    name: str
    country: str
    locality: str
    coordinates: Optional[Coordinates] = None
    administrative_area_level_1: Optional[str] = None  # District
    administrative_area_level_2: Optional[str] = None  # Municipality
    administrative_area_level_3: Optional[str] = None  # Parish
    district_code: Optional[int] = None  # Portuguese district code

    def to_dict(self) -> Dict[str, Any]:
        serialized: Dict[str, Any] = {
            "name": self.name,
            "country": self.country,
            "locality": self.locality,
        }
        if self.coordinates:
            serialized["coordinates"] = self.coordinates.to_dict()
        for field_name in (
            "administrative_area_level_1",
            "administrative_area_level_2",
            "administrative_area_level_3",
            "district_code",
        ):
            value = getattr(self, field_name)
            if value:
                serialized[field_name] = value
        return serialized
| 117 | |||
class GoogleGeocodingClient:
    """Thin client for the Google Maps Geocoding API.

    geocode() returns an EventLocation or None; all request and parse
    errors are deliberately swallowed so callers can treat every failure
    as "no location available".
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    def _parse_google_response(
        self, location: str, google_result: dict
    ) -> EventLocation:
        """Parse one Google Maps geocoding result into an EventLocation."""
        location_data = {
            "name": location,
            "lat": google_result["geometry"]["location"]["lat"],
            "lon": google_result["geometry"]["location"]["lng"],
            "country": "Portugal",
            "locality": location.split(",")[0].strip(),
            "administrative_area_level_1": None,
            "administrative_area_level_2": None,
            "administrative_area_level_3": None,
            "district_code": None,
        }

        # Extract all administrative levels from address components
        for component in google_result["address_components"]:
            types = component["types"]
            if "country" in types:
                location_data["country"] = component["long_name"]
            elif "administrative_area_level_1" in types:
                location_data["administrative_area_level_1"] = component["long_name"]
                # Use district as locality for Portugal
                location_data["locality"] = component["long_name"]
            elif "administrative_area_level_2" in types:
                location_data["administrative_area_level_2"] = component["long_name"]
            elif "administrative_area_level_3" in types:
                location_data["administrative_area_level_3"] = component["long_name"]

        # Derive the numeric district code from the level-1 area name.
        location_data["district_code"] = get_district_code(
            location_data["administrative_area_level_1"]
        )

        return EventLocation(
            name=location,
            country=location_data["country"],
            locality=location_data["locality"],
            coordinates=Coordinates(lat=location_data["lat"], lon=location_data["lon"]),
            administrative_area_level_1=location_data["administrative_area_level_1"],
            administrative_area_level_2=location_data["administrative_area_level_2"],
            administrative_area_level_3=location_data["administrative_area_level_3"],
            district_code=location_data["district_code"],
        )

    def geocode(self, location: str) -> Optional[EventLocation]:
        """Geocode a free-form location string, biased to Portugal (pt)."""
        params = {
            "address": location,
            "key": self.api_key,
            "region": "pt",
            "language": "pt",
        }

        try:
            url = f"{self.base_url}?{urllib.parse.urlencode(params)}"
            # BUG FIX: added a timeout — requests.get without one can
            # block forever and stall an entire xargs worker.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = json.loads(response.content)
            return self._parse_google_response(location, data["results"][0])
        except Exception:
            # Best-effort by design: network errors, quota errors and an
            # empty "results" list all surface to the caller as None.
            return None
| 194 | |||
| 195 | |||
def clean_ics_location(loc: str) -> str:
    """Normalize a LOCATION value taken from a portugalrunning ics file.

    The site frequently duplicates text — a segment repeated word-for-word
    ("Madeira Madeira"), or a middle segment that is the concatenation of
    the last and first segments — and escapes characters with backslashes.
    Both artifacts are stripped here.
    """

    def dedupe_segment(segment: str) -> str:
        words = segment.strip().split()
        half = len(words) // 2
        # "Madeira Madeira", "Santa Maria da Cruz Santa Maria da Cruz"
        if len(words) % 2 == 0 and words[:half] == words[half:]:
            words = words[:half]
        return " ".join(words)

    cleaned = loc.strip().replace("\\", "")
    segments = [dedupe_segment(part) for part in cleaned.split(",")]

    # "Cabeço de Vida, Fontreira Cabeço de Vida, Fontreira":
    # the middle segment is "<last> <first>"; keep only first and last.
    if len(segments) == 3 and segments[1] == f"{segments[2]} {segments[0]}":
        segments = [segments[0], segments[2]]
    return ", ".join(segments)
| 217 | |||
| 218 | |||
# Hand-maintained corrections for locations the geocoder handles poorly.
FIXUP_TABLE = {"Alcaria da Serra, Beja": "Alcaria da Serra"}


def main():
    """Geocode each slug's ics LOCATION into events/<slug>/location (json).

    Exits with status 1 on the first location that fails to geocode so
    the scrape surfaces the failure instead of silently skipping it.
    """
    # Env read moved into main() so importing this file never raises.
    key = os.environ[ENV_GOOGLE_MAPS_API_KEY]
    client = GoogleGeocodingClient(key)
    for slug in sys.argv[1:]:
        location_path = os.path.join("events", slug, "location")
        if os.path.exists(location_path):
            continue
        ics_path = os.path.join("events", slug, "ics")
        ics_content = open(ics_path, "rb").read().decode("utf-8", errors="ignore")
        ics_location_match = re.search(r"LOCATION:(.*)", ics_content)
        assert ics_location_match is not None
        ics_location = clean_ics_location(ics_location_match[1])
        # an empty LOCATION means the event has no usable location
        if ics_location == "":
            continue
        ics_location = FIXUP_TABLE.get(ics_location, ics_location)
        location = client.geocode(ics_location)
        if location is None:
            print(f"failed to obtain location from {slug} '{ics_location}'")
            sys.exit(1)
        with open(location_path, "w") as f:
            json.dump(location.to_dict(), f)


if __name__ == "__main__":
    main()
diff --git a/fetch-oneline-description b/fetch-oneline-description new file mode 100755 index 000000000..6f592e681 --- /dev/null +++ b/fetch-oneline-description | |||
| @@ -0,0 +1,73 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import json | ||
| 5 | import subprocess | ||
| 6 | |||
| 7 | MODEL = "openrouter/anthropic/claude-3.5-haiku" | ||
| 8 | |||
| 9 | |||
class LLMClient:
    """Client for LLM description generation.

    Wraps the external ``llm`` command-line tool (invoked via
    subprocess) rather than calling an HTTP API directly, so model and
    API-key configuration is delegated to that tool.
    """

    def __init__(self, model: str):
        # Model identifier passed to `llm -m`, e.g. an openrouter model path.
        self.model = model

    def llm_call(
        self,
        system_prompt: str,
        user_prompt: str,
    ) -> str:
        """Run one completion through the `llm` CLI and return its stdout, stripped.

        Raises subprocess.CalledProcessError on a non-zero exit (check=True)
        and subprocess.TimeoutExpired after 30 seconds.
        """
        proc = subprocess.run(
            ["llm", "-m", self.model, "-s", system_prompt, user_prompt],
            timeout=30,
            check=True,
            capture_output=True,
            text=True,
        )
        stdout = proc.stdout.strip()
        return stdout

    def generate_description(self, text: str) -> str:
        """Generate short description using LLM."""
        # The prompt is deliberately written in European Portuguese: the
        # generated one-line summaries are user-facing text for
        # Portuguese running events.
        system_prompt = """És um assistente especializado em condensar descrições de eventos de corrida em resumos de uma linha em português de Portugal. Deves extrair e resumir apenas a informação mais importante e relevante da descrição fornecida.

Exemplos de resumos que deves gerar:
+ Corrida histórica pelas ruas de Lisboa com vista para o Tejo
+ Trail desafiante pela Serra da Estrela
+ São Silvestre tradicional no centro histórico do Porto
+ Meia maratona costeira com paisagens do Atlântico
+ Corrida solidária organizada pela câmara municipal
+ Prova de montanha com subidas técnicas
+ Corrida de Natal pela zona ribeirinha
+ Trail nocturno por caminhos antigos

IMPORTANTE:
- Responde APENAS com a descrição de uma linha em português de Portugal
- Usa apenas informação presente na descrição original
- Destaca características especiais do percurso, localização ou organização
- Não menciones distâncias se já estão implícitas no tipo de evento
- Foca-te no que torna este evento único ou interessante"""

        return self.llm_call(
            system_prompt,
            text,
        )
| 56 | |||
| 57 | |||
# Generate a one-line description for every event that has scraped data but
# no summary yet.  Safe to re-run: existing summaries are never regenerated.
client = LLMClient(MODEL)
for slug in sys.argv[1:]:
    data_path = os.path.join("events", slug, "data.json")
    oneline_path = os.path.join("events", slug, "oneline-description")
    if not os.path.exists(data_path):
        continue
    # Check for an existing summary before reading the JSON so reruns skip
    # finished events cheaply.
    if os.path.exists(oneline_path):
        continue
    with open(data_path, "r") as data_file:
        data = json.load(data_file)
    description = data["content"]["rendered"].strip()
    if description == "":
        continue
    oneline = client.generate_description(description)
    with open(oneline_path, "w") as f:
        f.write(oneline)
| 73 | |||
diff --git a/fetch-page b/fetch-page new file mode 100755 index 000000000..085b54e7b --- /dev/null +++ b/fetch-page | |||
| @@ -0,0 +1,15 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import requests | ||
| 5 | |||
# Download the public event page for every slug given on the command line.
# Existing pages are kept, so an interrupted scrape can be resumed.
for slug in sys.argv[1:]:
    url = f"https://www.portugalrunning.com/eventos/{slug}"
    page_path = os.path.join("events", slug, "page.html")
    if os.path.exists(page_path):
        continue
    # A timeout keeps one stuck connection from hanging the whole batch:
    # the Justfile runs this script with high parallelism (-P 40).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(page_path, "w") as f:
        f.write(response.text)
| 15 | |||
diff --git a/fetch-sitemap b/fetch-sitemap new file mode 100755 index 000000000..b952ab529 --- /dev/null +++ b/fetch-sitemap | |||
| @@ -0,0 +1,2 @@ | |||
#!/usr/bin/env sh
# Download the events sitemap consumed by setup-directories.
# -f  fail on HTTP errors instead of saving the error page as sitemap.xml
# -sS silence the progress meter but still report errors
# -L  follow redirects
curl -fsSL https://www.portugalrunning.com/ajde_events-sitemap.xml -o sitemap.xml
diff --git a/list-slugs b/list-slugs new file mode 100755 index 000000000..a409e5d1e --- /dev/null +++ b/list-slugs | |||
| @@ -0,0 +1,2 @@ | |||
#!/usr/bin/env sh
# Print one event slug per line: the slugs are the directory names created
# under events/ by setup-directories.
ls -1 events
diff --git a/setup-directories b/setup-directories new file mode 100755 index 000000000..d3548b14f --- /dev/null +++ b/setup-directories | |||
| @@ -0,0 +1,45 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | import re | ||
| 3 | import os | ||
| 4 | import shutil | ||
| 5 | import xml.etree.ElementTree as ET | ||
| 6 | |||
# Sitemap URLs that are not individual event pages.
ignored_urls = ["https://www.portugalrunning.com/eventos/"]
tree = ET.parse("sitemap.xml")
root = tree.getroot()

# Compiled once outside the loop.  Dots are escaped so the pattern matches
# the literal hostname rather than any character.
slug_pattern = re.compile(r"https://www\.portugalrunning\.com/eventos/([^/]*)/")

# `{*}` matches any XML namespace, so this works regardless of the exact
# namespace URI the sitemap declares.
for url_element in root.findall(".//{*}url"):
    loc = url_element.find("{*}loc")
    lastmod = url_element.find("{*}lastmod")
    assert loc is not None
    assert lastmod is not None

    url = loc.text
    lastmod = lastmod.text

    assert url is not None
    assert lastmod is not None

    url = url.strip()
    lastmod = lastmod.strip()

    if url in ignored_urls:
        continue

    slug_match = slug_pattern.match(url)
    assert slug_match is not None, f"failed to extract slug from '{url}'"
    slug = slug_match[1]

    event_dir = os.path.join("events", slug)
    lastmod_path = os.path.join(event_dir, "lastmod")

    # Unchanged since the last scrape: keep the directory and any artifacts
    # already fetched into it.
    if os.path.exists(lastmod_path):
        with open(lastmod_path) as f:
            if f.read() == lastmod:
                continue

    # New or modified event: start from a clean directory so every artifact
    # is re-fetched for the updated page.
    if os.path.exists(event_dir):
        shutil.rmtree(event_dir)
    os.makedirs(event_dir, exist_ok=True)
    with open(lastmod_path, "w") as f:
        f.write(lastmod)
| 45 | |||
