From cd946b0a9fd2494bec6f25d81650002206c90cf4 Mon Sep 17 00:00:00 2001 From: Nader Arbabian Date: Tue, 12 Aug 2025 13:31:19 -0700 Subject: [PATCH] update report_addr to use new webserver endpoint with AS fallback --- lib/metrics.py | 13 +++++++++---- start_server.sh | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/metrics.py b/lib/metrics.py index 4bbdabb..166706b 100644 --- a/lib/metrics.py +++ b/lib/metrics.py @@ -114,7 +114,7 @@ class Metrics: url=self.url, ) - def send_data(report_addr: str) -> None: + def send_data(report_addr: str) -> bool: data = compute_autoscaler_data() full_path = report_addr.rstrip("/") + "/worker_status/" log.debug( @@ -129,21 +129,26 @@ class Metrics: ) for attempt in range(1, 4): try: - requests.post(full_path, json=asdict(data), timeout=1) - break + res = requests.post(full_path, json=asdict(data), timeout=1) + res.raise_for_status() + return True except requests.Timeout: log.debug(f"autoscaler status update timed out") except Exception as e: log.debug(f"autoscaler status update failed with error: {e}") time.sleep(2) log.debug(f"retrying autoscaler status update, attempt: {attempt}") + log.debug(f"failed to send update through {report_addr}") + return False ########### self.system_metrics.update_disk_usage() for report_addr in self.report_addr: - send_data(report_addr) + success = send_data(report_addr) + if success is True: + break self.update_pending = False self.model_metrics.reset() self.system_metrics.reset() diff --git a/start_server.sh b/start_server.sh index 6acb8a1..8ef61a7 100755 --- a/start_server.sh +++ b/start_server.sh @@ -9,7 +9,7 @@ ENV_PATH="$WORKSPACE_DIR/worker-env" DEBUG_LOG="$WORKSPACE_DIR/debug.log" PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log" -REPORT_ADDR="${REPORT_ADDR:-https://run.vast.ai}" +REPORT_ADDR="${REPORT_ADDR:-https://cloud.vast.ai/api/v0,https://run.vast.ai}" USE_SSL="${USE_SSL:-true}" WORKER_PORT="${WORKER_PORT:-3000}" mkdir -p "$WORKSPACE_DIR"