Update report_addr to use the new webserver endpoint, with autoscaler (AS) fallback

This commit is contained in:
Nader Arbabian
2025-08-12 13:31:19 -07:00
parent c595b42410
commit cd946b0a9f
2 changed files with 10 additions and 5 deletions
+9 -4
View File
@@ -114,7 +114,7 @@ class Metrics:
url=self.url,
)
def send_data(report_addr: str) -> None:
def send_data(report_addr: str) -> bool:
data = compute_autoscaler_data()
full_path = report_addr.rstrip("/") + "/worker_status/"
log.debug(
@@ -129,21 +129,26 @@ class Metrics:
)
for attempt in range(1, 4):
try:
requests.post(full_path, json=asdict(data), timeout=1)
break
res = requests.post(full_path, json=asdict(data), timeout=1)
res.raise_for_status()
return True
except requests.Timeout:
log.debug(f"autoscaler status update timed out")
except Exception as e:
log.debug(f"autoscaler status update failed with error: {e}")
time.sleep(2)
log.debug(f"retrying autoscaler status update, attempt: {attempt}")
log.debug(f"failed to send update through {report_addr}")
return False
###########
self.system_metrics.update_disk_usage()
for report_addr in self.report_addr:
send_data(report_addr)
success = send_data(report_addr)
if success is True:
break
self.update_pending = False
self.model_metrics.reset()
self.system_metrics.reset()
+1 -1
View File
@@ -9,7 +9,7 @@ ENV_PATH="$WORKSPACE_DIR/worker-env"
DEBUG_LOG="$WORKSPACE_DIR/debug.log"
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
REPORT_ADDR="${REPORT_ADDR:-https://run.vast.ai}"
REPORT_ADDR="${REPORT_ADDR:-https://cloud.vast.ai/api/v0,https://run.vast.ai}"
USE_SSL="${USE_SSL:-true}"
WORKER_PORT="${WORKER_PORT:-3000}"
mkdir -p "$WORKSPACE_DIR"