Update report_addr to use the new webserver endpoint, with autoscaler (AS) fallback
This commit is contained in:
+9
-4
@@ -114,7 +114,7 @@ class Metrics:
|
|||||||
url=self.url,
|
url=self.url,
|
||||||
)
|
)
|
||||||
|
|
||||||
def send_data(report_addr: str) -> None:
|
def send_data(report_addr: str) -> bool:
|
||||||
data = compute_autoscaler_data()
|
data = compute_autoscaler_data()
|
||||||
full_path = report_addr.rstrip("/") + "/worker_status/"
|
full_path = report_addr.rstrip("/") + "/worker_status/"
|
||||||
log.debug(
|
log.debug(
|
||||||
@@ -129,21 +129,26 @@ class Metrics:
|
|||||||
)
|
)
|
||||||
for attempt in range(1, 4):
|
for attempt in range(1, 4):
|
||||||
try:
|
try:
|
||||||
requests.post(full_path, json=asdict(data), timeout=1)
|
res = requests.post(full_path, json=asdict(data), timeout=1)
|
||||||
break
|
res.raise_for_status()
|
||||||
|
return True
|
||||||
except requests.Timeout:
|
except requests.Timeout:
|
||||||
log.debug(f"autoscaler status update timed out")
|
log.debug(f"autoscaler status update timed out")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug(f"autoscaler status update failed with error: {e}")
|
log.debug(f"autoscaler status update failed with error: {e}")
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
log.debug(f"retrying autoscaler status update, attempt: {attempt}")
|
log.debug(f"retrying autoscaler status update, attempt: {attempt}")
|
||||||
|
log.debug(f"failed to send update through {report_addr}")
|
||||||
|
return False
|
||||||
|
|
||||||
###########
|
###########
|
||||||
|
|
||||||
self.system_metrics.update_disk_usage()
|
self.system_metrics.update_disk_usage()
|
||||||
|
|
||||||
for report_addr in self.report_addr:
|
for report_addr in self.report_addr:
|
||||||
send_data(report_addr)
|
success = send_data(report_addr)
|
||||||
|
if success is True:
|
||||||
|
break
|
||||||
self.update_pending = False
|
self.update_pending = False
|
||||||
self.model_metrics.reset()
|
self.model_metrics.reset()
|
||||||
self.system_metrics.reset()
|
self.system_metrics.reset()
|
||||||
|
|||||||
+1
-1
@@ -9,7 +9,7 @@ ENV_PATH="$WORKSPACE_DIR/worker-env"
|
|||||||
DEBUG_LOG="$WORKSPACE_DIR/debug.log"
|
DEBUG_LOG="$WORKSPACE_DIR/debug.log"
|
||||||
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
|
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
|
||||||
|
|
||||||
REPORT_ADDR="${REPORT_ADDR:-https://run.vast.ai}"
|
REPORT_ADDR="${REPORT_ADDR:-https://cloud.vast.ai/api/v0,https://run.vast.ai}"
|
||||||
USE_SSL="${USE_SSL:-true}"
|
USE_SSL="${USE_SSL:-true}"
|
||||||
WORKER_PORT="${WORKER_PORT:-3000}"
|
WORKER_PORT="${WORKER_PORT:-3000}"
|
||||||
mkdir -p "$WORKSPACE_DIR"
|
mkdir -p "$WORKSPACE_DIR"
|
||||||
|
|||||||
Reference in New Issue
Block a user