Update report_addr to use the new webserver endpoint, with autoscaler (AS) fallback

This commit is contained in:
Nader Arbabian
2025-08-12 13:31:19 -07:00
parent c595b42410
commit cd946b0a9f
2 changed files with 10 additions and 5 deletions
+9 -4
View File
@@ -114,7 +114,7 @@ class Metrics:
url=self.url, url=self.url,
) )
def send_data(report_addr: str) -> None: def send_data(report_addr: str) -> bool:
data = compute_autoscaler_data() data = compute_autoscaler_data()
full_path = report_addr.rstrip("/") + "/worker_status/" full_path = report_addr.rstrip("/") + "/worker_status/"
log.debug( log.debug(
@@ -129,21 +129,26 @@ class Metrics:
) )
for attempt in range(1, 4): for attempt in range(1, 4):
try: try:
requests.post(full_path, json=asdict(data), timeout=1) res = requests.post(full_path, json=asdict(data), timeout=1)
break res.raise_for_status()
return True
except requests.Timeout: except requests.Timeout:
log.debug(f"autoscaler status update timed out") log.debug(f"autoscaler status update timed out")
except Exception as e: except Exception as e:
log.debug(f"autoscaler status update failed with error: {e}") log.debug(f"autoscaler status update failed with error: {e}")
time.sleep(2) time.sleep(2)
log.debug(f"retrying autoscaler status update, attempt: {attempt}") log.debug(f"retrying autoscaler status update, attempt: {attempt}")
log.debug(f"failed to send update through {report_addr}")
return False
########### ###########
self.system_metrics.update_disk_usage() self.system_metrics.update_disk_usage()
for report_addr in self.report_addr: for report_addr in self.report_addr:
send_data(report_addr) success = send_data(report_addr)
if success is True:
break
self.update_pending = False self.update_pending = False
self.model_metrics.reset() self.model_metrics.reset()
self.system_metrics.reset() self.system_metrics.reset()
+1 -1
View File
@@ -9,7 +9,7 @@ ENV_PATH="$WORKSPACE_DIR/worker-env"
DEBUG_LOG="$WORKSPACE_DIR/debug.log" DEBUG_LOG="$WORKSPACE_DIR/debug.log"
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log" PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
REPORT_ADDR="${REPORT_ADDR:-https://run.vast.ai}" REPORT_ADDR="${REPORT_ADDR:-https://cloud.vast.ai/api/v0,https://run.vast.ai}"
USE_SSL="${USE_SSL:-true}" USE_SSL="${USE_SSL:-true}"
WORKER_PORT="${WORKER_PORT:-3000}" WORKER_PORT="${WORKER_PORT:-3000}"
mkdir -p "$WORKSPACE_DIR" mkdir -p "$WORKSPACE_DIR"