diff options
| author | diogo464 <[email protected]> | 2025-07-11 20:14:42 +0100 |
|---|---|---|
| committer | diogo464 <[email protected]> | 2025-07-11 20:14:42 +0100 |
| commit | f0f57060a54b5c57608f945fd8f66b030779fd31 (patch) | |
| tree | ad9f341e8a91057f1df9d00027812fee53de117a | |
| parent | 9c57f905f93e381cbb27147f2effc76e127cc646 (diff) | |
feat: add benchmark startup analysis tools and improve demo.sh
- Add generate-schedule.sh script to create container schedules from addresses.txt
- Add benchmark-startup Python script for analyzing container startup times
- Update demo.sh to print timestamps and wait for start signal at /oar-p2p/start
- Add comprehensive statistics including startup, start signal, and waiting times
- Support for synchronized container coordination via start signal file
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | demo.sh | 10 | ||||
| -rwxr-xr-x | scripts/benchmark-startup | 204 | ||||
| -rw-r--r-- | scripts/benchmark.sh | 2 | ||||
| -rwxr-xr-x | scripts/generate-schedule.sh | 49 | ||||
| -rw-r--r-- | src/main.rs | 28 |
6 files changed, 295 insertions, 0 deletions
| @@ -1,5 +1,7 @@ | |||
| 1 | /target | 1 | /target |
| 2 | /logs | 2 | /logs |
| 3 | /benchmark-logs | ||
| 4 | /addresses.txt | ||
| 3 | __pycache__/ | 5 | __pycache__/ |
| 4 | .envrc | 6 | .envrc |
| 5 | latency-matrix-*.txt | 7 | latency-matrix-*.txt |
| @@ -1,5 +1,15 @@ | |||
| 1 | #!/bin/sh | 1 | #!/bin/sh |
| 2 | date -Iseconds | ||
| 2 | echo $MESSAGE | 3 | echo $MESSAGE |
| 4 | |||
| 5 | # Wait for start signal | ||
| 6 | while [ ! -f /oar-p2p/start ]; do | ||
| 7 | sleep 0.25 | ||
| 8 | done | ||
| 9 | |||
| 10 | # Print time when start signal received | ||
| 11 | date -Iseconds | ||
| 12 | |||
| 3 | sleep 2 | 13 | sleep 2 |
| 4 | ping -c 3 -I $ADDRESS $REMOTE | 14 | ping -c 3 -I $ADDRESS $REMOTE |
| 5 | sleep 1 | 15 | sleep 1 |
diff --git a/scripts/benchmark-startup b/scripts/benchmark-startup new file mode 100755 index 0000000..f38d031 --- /dev/null +++ b/scripts/benchmark-startup | |||
| @@ -0,0 +1,204 @@ | |||
| 1 | #!/usr/bin/env python3 | ||
| 2 | |||
| 3 | import os | ||
| 4 | import glob | ||
| 5 | import sys | ||
| 6 | from datetime import datetime | ||
| 7 | import statistics | ||
| 8 | |||
| 9 | def parse_timestamp(timestamp_str): | ||
| 10 | """Parse ISO timestamp from date -Iseconds""" | ||
| 11 | if 'T' in timestamp_str: | ||
| 12 | # Handle timezone formats | ||
| 13 | if timestamp_str.endswith('Z'): | ||
| 14 | return datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) | ||
| 15 | elif '+' in timestamp_str or '-' in timestamp_str[-6:]: | ||
| 16 | return datetime.fromisoformat(timestamp_str) | ||
| 17 | else: | ||
| 18 | # Add timezone if missing | ||
| 19 | return datetime.fromisoformat(timestamp_str + '+00:00') | ||
| 20 | else: | ||
| 21 | # Try other formats | ||
| 22 | return datetime.fromisoformat(timestamp_str) | ||
| 23 | |||
| 24 | def analyze_startup_times(): | ||
| 25 | """Analyze startup times from log files""" | ||
| 26 | print("Analyzing startup times...") | ||
| 27 | |||
| 28 | log_files = glob.glob("benchmark-logs/*.stdout") | ||
| 29 | if not log_files: | ||
| 30 | print("No log files found in benchmark-logs/") | ||
| 31 | return | ||
| 32 | |||
| 33 | startup_times = [] | ||
| 34 | start_signal_times = [] | ||
| 35 | earliest_startup_time = None | ||
| 36 | earliest_start_signal_time = None | ||
| 37 | |||
| 38 | for log_file in log_files: | ||
| 39 | try: | ||
| 40 | with open(log_file, 'r') as f: | ||
| 41 | lines = f.readlines() | ||
| 42 | |||
| 43 | # Parse first line (startup time) | ||
| 44 | if len(lines) > 0: | ||
| 45 | first_line = lines[0].strip() | ||
| 46 | if first_line: | ||
| 47 | try: | ||
| 48 | timestamp = parse_timestamp(first_line) | ||
| 49 | startup_times.append((log_file, timestamp)) | ||
| 50 | |||
| 51 | if earliest_startup_time is None or timestamp < earliest_startup_time: | ||
| 52 | earliest_startup_time = timestamp | ||
| 53 | |||
| 54 | except ValueError as e: | ||
| 55 | print(f"Could not parse startup timestamp in {log_file}: '{first_line}' - {e}") | ||
| 56 | |||
| 57 | # Look for second timestamp (start signal time) | ||
| 58 | for i, line in enumerate(lines): | ||
| 59 | line = line.strip() | ||
| 60 | if 'T' in line and i > 0: # Skip first line, look for second timestamp | ||
| 61 | try: | ||
| 62 | timestamp = parse_timestamp(line) | ||
| 63 | start_signal_times.append((log_file, timestamp)) | ||
| 64 | |||
| 65 | if earliest_start_signal_time is None or timestamp < earliest_start_signal_time: | ||
| 66 | earliest_start_signal_time = timestamp | ||
| 67 | break # Only take the first additional timestamp found | ||
| 68 | |||
| 69 | except ValueError: | ||
| 70 | continue # Skip lines that aren't timestamps | ||
| 71 | |||
| 72 | except Exception as e: | ||
| 73 | print(f"Error reading {log_file}: {e}") | ||
| 74 | |||
| 75 | if not startup_times: | ||
| 76 | print("No valid startup times found") | ||
| 77 | return | ||
| 78 | |||
| 79 | # Calculate relative startup times | ||
| 80 | relative_startup_times = [] | ||
| 81 | for log_file, timestamp in startup_times: | ||
| 82 | relative_time = (timestamp - earliest_startup_time).total_seconds() | ||
| 83 | relative_startup_times.append(relative_time) | ||
| 84 | |||
| 85 | # Print startup statistics | ||
| 86 | print(f"\n=== STARTUP TIME STATISTICS ===") | ||
| 87 | print(f"Total containers: {len(startup_times)}") | ||
| 88 | print(f"Earliest start time: {earliest_startup_time}") | ||
| 89 | print(f"Latest start time: {max(startup_times, key=lambda x: x[1])[1]}") | ||
| 90 | print(f"Total startup window: {max(relative_startup_times):.2f} seconds") | ||
| 91 | |||
| 92 | if len(relative_startup_times) > 1: | ||
| 93 | print(f"Average relative startup time: {statistics.mean(relative_startup_times):.2f} seconds") | ||
| 94 | print(f"Median relative startup time: {statistics.median(relative_startup_times):.2f} seconds") | ||
| 95 | print(f"Standard deviation: {statistics.stdev(relative_startup_times):.2f} seconds") | ||
| 96 | print(f"Min relative startup time: {min(relative_startup_times):.2f} seconds") | ||
| 97 | print(f"Max relative startup time: {max(relative_startup_times):.2f} seconds") | ||
| 98 | |||
| 99 | # Percentiles | ||
| 100 | sorted_times = sorted(relative_startup_times) | ||
| 101 | p50 = sorted_times[int(len(sorted_times) * 0.5)] | ||
| 102 | p90 = sorted_times[int(len(sorted_times) * 0.9)] | ||
| 103 | p95 = sorted_times[int(len(sorted_times) * 0.95)] | ||
| 104 | p99 = sorted_times[int(len(sorted_times) * 0.99)] | ||
| 105 | |||
| 106 | print(f"50th percentile: {p50:.2f} seconds") | ||
| 107 | print(f"90th percentile: {p90:.2f} seconds") | ||
| 108 | print(f"95th percentile: {p95:.2f} seconds") | ||
| 109 | print(f"99th percentile: {p99:.2f} seconds") | ||
| 110 | |||
| 111 | # Show startup distribution | ||
| 112 | print(f"\nStartup time distribution (by second):") | ||
| 113 | time_buckets = {} | ||
| 114 | for relative_time in relative_startup_times: | ||
| 115 | bucket = int(relative_time) | ||
| 116 | time_buckets[bucket] = time_buckets.get(bucket, 0) + 1 | ||
| 117 | |||
| 118 | for bucket in sorted(time_buckets.keys())[:10]: # Show first 10 buckets | ||
| 119 | print(f" {bucket}s: {time_buckets[bucket]} containers") | ||
| 120 | |||
| 121 | # Print per-container details (first 10) | ||
| 122 | print(f"\nFirst 10 containers to start:") | ||
| 123 | sorted_startup_times = sorted(startup_times, key=lambda x: x[1]) | ||
| 124 | for i, (log_file, timestamp) in enumerate(sorted_startup_times[:10]): | ||
| 125 | container_name = os.path.basename(log_file).replace('.stdout', '') | ||
| 126 | relative_time = (timestamp - earliest_startup_time).total_seconds() | ||
| 127 | print(f"{i+1:2d}. {container_name}: +{relative_time:.2f}s") | ||
| 128 | |||
| 129 | if len(sorted_startup_times) > 10: | ||
| 130 | print(f"... and {len(sorted_startup_times) - 10} more containers") | ||
| 131 | |||
| 132 | # Analyze start signal times if available | ||
| 133 | if start_signal_times: | ||
| 134 | print(f"\n=== START SIGNAL TIME STATISTICS ===") | ||
| 135 | print(f"Containers with start signal: {len(start_signal_times)}") | ||
| 136 | |||
| 137 | # Calculate relative start signal times | ||
| 138 | relative_start_signal_times = [] | ||
| 139 | for log_file, timestamp in start_signal_times: | ||
| 140 | relative_time = (timestamp - earliest_start_signal_time).total_seconds() | ||
| 141 | relative_start_signal_times.append(relative_time) | ||
| 142 | |||
| 143 | print(f"Earliest start signal time: {earliest_start_signal_time}") | ||
| 144 | print(f"Latest start signal time: {max(start_signal_times, key=lambda x: x[1])[1]}") | ||
| 145 | print(f"Total start signal window: {max(relative_start_signal_times):.2f} seconds") | ||
| 146 | |||
| 147 | if len(relative_start_signal_times) > 1: | ||
| 148 | print(f"Average relative start signal time: {statistics.mean(relative_start_signal_times):.2f} seconds") | ||
| 149 | print(f"Median relative start signal time: {statistics.median(relative_start_signal_times):.2f} seconds") | ||
| 150 | print(f"Standard deviation: {statistics.stdev(relative_start_signal_times):.2f} seconds") | ||
| 151 | |||
| 152 | # Percentiles | ||
| 153 | sorted_signal_times = sorted(relative_start_signal_times) | ||
| 154 | p50 = sorted_signal_times[int(len(sorted_signal_times) * 0.5)] | ||
| 155 | p90 = sorted_signal_times[int(len(sorted_signal_times) * 0.9)] | ||
| 156 | p95 = sorted_signal_times[int(len(sorted_signal_times) * 0.95)] | ||
| 157 | p99 = sorted_signal_times[int(len(sorted_signal_times) * 0.99)] | ||
| 158 | |||
| 159 | print(f"50th percentile: {p50:.2f} seconds") | ||
| 160 | print(f"90th percentile: {p90:.2f} seconds") | ||
| 161 | print(f"95th percentile: {p95:.2f} seconds") | ||
| 162 | print(f"99th percentile: {p99:.2f} seconds") | ||
| 163 | |||
| 164 | # Show start signal distribution | ||
| 165 | print(f"\nStart signal time distribution (by second):") | ||
| 166 | signal_time_buckets = {} | ||
| 167 | for relative_time in relative_start_signal_times: | ||
| 168 | bucket = int(relative_time) | ||
| 169 | signal_time_buckets[bucket] = signal_time_buckets.get(bucket, 0) + 1 | ||
| 170 | |||
| 171 | for bucket in sorted(signal_time_buckets.keys())[:10]: # Show first 10 buckets | ||
| 172 | print(f" {bucket}s: {signal_time_buckets[bucket]} containers") | ||
| 173 | |||
| 174 | # Calculate waiting times (time between startup and start signal) | ||
| 175 | waiting_times = [] | ||
| 176 | for startup_entry in startup_times: | ||
| 177 | startup_log, startup_time = startup_entry | ||
| 178 | # Find corresponding start signal time | ||
| 179 | for signal_entry in start_signal_times: | ||
| 180 | signal_log, signal_time = signal_entry | ||
| 181 | if startup_log == signal_log: | ||
| 182 | waiting_time = (signal_time - startup_time).total_seconds() | ||
| 183 | waiting_times.append(waiting_time) | ||
| 184 | break | ||
| 185 | |||
| 186 | if waiting_times: | ||
| 187 | print(f"\n=== WAITING TIME STATISTICS ===") | ||
| 188 | print(f"Average waiting time: {statistics.mean(waiting_times):.2f} seconds") | ||
| 189 | print(f"Median waiting time: {statistics.median(waiting_times):.2f} seconds") | ||
| 190 | print(f"Min waiting time: {min(waiting_times):.2f} seconds") | ||
| 191 | print(f"Max waiting time: {max(waiting_times):.2f} seconds") | ||
| 192 | |||
| 193 | if len(waiting_times) > 1: | ||
| 194 | print(f"Standard deviation: {statistics.stdev(waiting_times):.2f} seconds") | ||
| 195 | |||
| 196 | else: | ||
| 197 | print(f"\nNo start signal timestamps found (containers may not have reached start signal yet)") | ||
| 198 | |||
| 199 | def main(): | ||
| 200 | """Main function""" | ||
| 201 | analyze_startup_times() | ||
| 202 | |||
| 203 | if __name__ == "__main__": | ||
| 204 | main() \ No newline at end of file | ||
diff --git a/scripts/benchmark.sh b/scripts/benchmark.sh new file mode 100644 index 0000000..db956bf --- /dev/null +++ b/scripts/benchmark.sh | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | #!/usr/bin/env bash | ||
| 2 | scripts/generate-schedule.sh | oar-p2p run --output-dir benchmark-logs/ && scripts/benchmark-startup | ||
diff --git a/scripts/generate-schedule.sh b/scripts/generate-schedule.sh new file mode 100755 index 0000000..8e8db4e --- /dev/null +++ b/scripts/generate-schedule.sh | |||
| @@ -0,0 +1,49 @@ | |||
| 1 | #!/bin/bash | ||
| 2 | |||
| 3 | # Script to generate a schedule based on `cargo run -- net show` output | ||
| 4 | # Usage: ./generate-schedule.sh [image] | ||
| 5 | |||
| 6 | set -e | ||
| 7 | |||
| 8 | # Default values | ||
| 9 | IMAGE="${1:-ghcr.io/diogo464/oar-p2p/demo:latest}" | ||
| 10 | |||
| 11 | # Generate JSON schedule | ||
| 12 | echo "Getting addresses from addresses.txt..." >&2 | ||
| 13 | if [ ! -f "addresses.txt" ]; then | ||
| 14 | echo "Error: addresses.txt not found" >&2 | ||
| 15 | exit 1 | ||
| 16 | fi | ||
| 17 | addresses_output=$(cat addresses.txt) | ||
| 18 | address_count=$(echo "$addresses_output" | wc -l) | ||
| 19 | |||
| 20 | echo "Generating schedule with $address_count containers..." >&2 | ||
| 21 | echo "Using image: $IMAGE" >&2 | ||
| 22 | |||
| 23 | # Start JSON array | ||
| 24 | echo "[" | ||
| 25 | |||
| 26 | # Process each address | ||
| 27 | first=true | ||
| 28 | while IFS=' ' read -r machine address; do | ||
| 29 | # Skip empty lines | ||
| 30 | if [ -z "$address" ]; then | ||
| 31 | continue | ||
| 32 | fi | ||
| 33 | |||
| 34 | # Add comma separator for all but first entry | ||
| 35 | if [ "$first" = true ]; then | ||
| 36 | first=false | ||
| 37 | else | ||
| 38 | echo "," | ||
| 39 | fi | ||
| 40 | |||
| 41 | # Generate container entry directly with proper escaping | ||
| 42 | printf ' {\n "address": "%s",\n "image": "%s",\n "env": {\n "ADDRESS": "%s",\n "MACHINE": "%s",\n "MESSAGE": "Container on %s with address %s"\n }\n }' \ | ||
| 43 | "$address" "$IMAGE" "$address" "$machine" "$machine" "$address" | ||
| 44 | |||
| 45 | done <<< "$addresses_output" | ||
| 46 | |||
| 47 | # Close JSON array | ||
| 48 | echo "" | ||
| 49 | echo "]" \ No newline at end of file | ||
diff --git a/src/main.rs b/src/main.rs index 06cf322..e2d356e 100644 --- a/src/main.rs +++ b/src/main.rs | |||
| @@ -4,6 +4,7 @@ use std::{ | |||
| 4 | net::Ipv4Addr, | 4 | net::Ipv4Addr, |
| 5 | path::{Path, PathBuf}, | 5 | path::{Path, PathBuf}, |
| 6 | process::Output, | 6 | process::Output, |
| 7 | time::Duration, | ||
| 7 | }; | 8 | }; |
| 8 | 9 | ||
| 9 | use clap::{Args, Parser, Subcommand}; | 10 | use clap::{Args, Parser, Subcommand}; |
| @@ -108,6 +109,9 @@ struct RunArgs { | |||
| 108 | #[clap(long)] | 109 | #[clap(long)] |
| 109 | output_dir: PathBuf, | 110 | output_dir: PathBuf, |
| 110 | 111 | ||
| 112 | #[clap(long, default_value = "10")] | ||
| 113 | signal_delay: u64, | ||
| 114 | |||
| 111 | schedule: Option<PathBuf>, | 115 | schedule: Option<PathBuf>, |
| 112 | } | 116 | } |
| 113 | 117 | ||
| @@ -307,6 +311,17 @@ async fn cmd_run(args: RunArgs) -> Result<()> { | |||
| 307 | ) | 311 | ) |
| 308 | .await?; | 312 | .await?; |
| 309 | 313 | ||
| 314 | tracing::info!("waiting {} seconds before signaling", args.signal_delay); | ||
| 315 | tokio::time::sleep(Duration::from_secs(args.signal_delay)).await; | ||
| 316 | |||
| 317 | machine::for_each( | ||
| 318 | machines | ||
| 319 | .iter() | ||
| 320 | .filter(|&machine| containers.iter().any(|c| c.machine == *machine)), | ||
| 321 | |machine| machine_signal_containers(&ctx, machine), | ||
| 322 | ) | ||
| 323 | .await?; | ||
| 324 | |||
| 310 | tracing::info!("waiting for all containers to exit"); | 325 | tracing::info!("waiting for all containers to exit"); |
| 311 | machine::for_each(&machines, |machine| { | 326 | machine::for_each(&machines, |machine| { |
| 312 | let ctx = ctx.clone(); | 327 | let ctx = ctx.clone(); |
| @@ -350,10 +365,15 @@ async fn cmd_run(args: RunArgs) -> Result<()> { | |||
| 350 | fn machine_containers_create_script(containers: &[ScheduledContainer]) -> String { | 365 | fn machine_containers_create_script(containers: &[ScheduledContainer]) -> String { |
| 351 | let mut script = String::default(); | 366 | let mut script = String::default(); |
| 352 | for (idx, container) in containers.iter().enumerate() { | 367 | for (idx, container) in containers.iter().enumerate() { |
| 368 | // remove the start signal file if it exists | ||
| 369 | script.push_str("mkdir -p /tmp/oar-p2p-signal\n"); | ||
| 370 | script.push_str("rm /tmp/oar-p2p-signal/start 2>/dev/null || true\n"); | ||
| 371 | |||
| 353 | script.push_str("docker create \\\n"); | 372 | script.push_str("docker create \\\n"); |
| 354 | script.push_str("\t--pull=always \\\n"); | 373 | script.push_str("\t--pull=always \\\n"); |
| 355 | script.push_str("\t--network=host \\\n"); | 374 | script.push_str("\t--network=host \\\n"); |
| 356 | script.push_str("\t--restart=no \\\n"); | 375 | script.push_str("\t--restart=no \\\n"); |
| 376 | script.push_str("\t--volume /tmp/oar-p2p-signal:/oar-p2p\\\n"); | ||
| 357 | script.push_str(&format!("\t--name {} \\\n", container.name)); | 377 | script.push_str(&format!("\t--name {} \\\n", container.name)); |
| 358 | for (key, val) in container.variables.iter() { | 378 | for (key, val) in container.variables.iter() { |
| 359 | script.push_str("\t-e "); | 379 | script.push_str("\t-e "); |
| @@ -406,6 +426,14 @@ async fn machine_start_containers(ctx: &Context, machine: Machine) -> Result<()> | |||
| 406 | Ok(()) | 426 | Ok(()) |
| 407 | } | 427 | } |
| 408 | 428 | ||
| 429 | #[tracing::instrument(ret, err, skip(ctx))] | ||
| 430 | async fn machine_signal_containers(ctx: &Context, machine: Machine) -> Result<()> { | ||
| 431 | tracing::info!("signaling containers"); | ||
| 432 | machine_run_script(ctx, machine, "touch /tmp/oar-p2p-signal/start").await?; | ||
| 433 | tracing::info!("containers signaled"); | ||
| 434 | Ok(()) | ||
| 435 | } | ||
| 436 | |||
| 409 | fn machine_containers_wait_script(containers: &[ScheduledContainer]) -> String { | 437 | fn machine_containers_wait_script(containers: &[ScheduledContainer]) -> String { |
| 410 | let mut script = String::default(); | 438 | let mut script = String::default(); |
| 411 | for container in containers { | 439 | for container in containers { |
