spelling fix

updated startserver
stat script
2025-10-15 15:09:34 -07:00 · 2025-10-15 12:14:31 -07:00 · 2025-10-14 17:38:06 -07:00 · 2025-10-13 10:06:22 -07:00 · 2025-10-09 18:15:55 -07:00 · 2025-10-09 10:13:50 -07:00
5 changed files with 59 additions and 29 deletions
@@ -190,18 +190,30 @@ class Backend:
            log.debug(f"Exception in main handler loop {e}")
            return web.Response(status=500)

+    @cached_property  
+    def healthcheck_session(self):
+        """Dedicated session for healthchecks to avoid conflicts with API session"""
+        log.debug("creating dedicated healthcheck session")
+        connector = TCPConnector(
+            force_close=True,  # Keep this for isolation
+            enable_cleanup_closed=True,
+        )
+        timeout = ClientTimeout(total=10)  # Reasonable timeout for healthchecks
+        return ClientSession(timeout=timeout, connector=connector)
+
    async def __healthcheck(self):
        health_check_url = self.benchmark_handler.healthcheck_endpoint
        if health_check_url is None:
            log.debug("No healthcheck endpoint defined, skipping healthcheck")
            return
+
        while True:
            await sleep(10)
            if self.__start_healthcheck is False:
                continue
            try:
                log.debug(f"Performing healthcheck on {health_check_url}")
-                async with self.session.get(health_check_url) as response:
+                async with self.healthcheck_session.get(health_check_url) as response:
                    if response.status == 200:
                        log.debug("Healthcheck successful")
                    elif response.status == 503:
@@ -210,7 +222,6 @@ class Backend:
                            f"Healthcheck failed with status: {response.status}"
                        )
                    else:
-                        # endpoint not ready yet so bail
                        log.debug(f"Healthcheck Endpoint not ready: {response.status}")
            except Exception as e:
                log.debug(f"Healthcheck failed with exception: {e}")
@@ -45,6 +45,7 @@ class Metrics:
        self.model_metrics.workload_received += workload
        self.model_metrics.requests_recieved.add(reqnum)
        self.model_metrics.requests_working.add(reqnum)
+        self.update_pending = True

    def _request_end(self, workload: float, reqnum: int) -> None:
        """
@@ -78,10 +79,10 @@ class Metrics:
            elapsed = time.time() - self.last_metric_update
            if self.system_metrics.model_is_loaded is False and elapsed >= 10:
                log.debug(f"sending loading model metrics after {int(elapsed)}s wait")
-                self.__send_metrics_and_reset(elapsed)
+                self.__send_metrics_and_reset()
            elif self.update_pending or elapsed > 10:
                log.debug(f"sending loaded model metrics after {int(elapsed)}s wait")
-                self.__send_metrics_and_reset(elapsed)
+                self.__send_metrics_and_reset()

    def _model_loaded(self, max_throughput: float) -> None:
        self.system_metrics.model_loading_time = (
@@ -96,13 +97,13 @@ class Metrics:

    #######################################Private#######################################

-    def __send_metrics_and_reset(self, elapsed):
+    def __send_metrics_and_reset(self):

        def compute_autoscaler_data() -> AutoScalaerData:
            return AutoScalaerData(
                id=self.id,
                loadtime=(self.system_metrics.model_loading_time or 0.0),
-                cur_load=(self.model_metrics.workload_processing / elapsed),
+                cur_load=(self.model_metrics.workload_processing),
                max_perf=self.model_metrics.max_throughput,
                cur_perf=self.model_metrics.cur_perf,
                error_msg=self.model_metrics.error_msg or "",
@@ -3,8 +3,7 @@
 set -e -o pipefail

 WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
-
-SERVER_DIR="$WORKSPACE_DIR/vast-pyworker"
+SERVER_DIR="$WORKSPACE_DIR/worker"
 ENV_PATH="$WORKSPACE_DIR/worker-env"
 DEBUG_LOG="$WORKSPACE_DIR/debug.log"
 PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
@@ -22,24 +21,23 @@ function echo_var(){
    echo "$1: ${!1}"
 }

-[ -z "$BACKEND" ] && echo "BACKEND must be set!" && exit 1
+# Updated validation - BACKEND no longer required, but MODEL_LOG still is
 [ -z "$MODEL_LOG" ] && echo "MODEL_LOG must be set!" && exit 1
 [ -z "$HF_TOKEN" ] && echo "HF_TOKEN must be set!" && exit 1
-[ "$BACKEND" = "comfyui" ] && [ -z "$COMFY_MODEL" ] && echo "For comfyui backends, COMFY_MODEL must be set!" && exit 1

-
-echo "start_server.sh"
+echo "start_server.sh - SDK Worker Version"
 date

-echo_var BACKEND
 echo_var REPORT_ADDR
 echo_var WORKER_PORT
 echo_var WORKSPACE_DIR
-echo_var SERVER_DIR
 echo_var ENV_PATH
 echo_var DEBUG_LOG
 echo_var PYWORKER_LOG
 echo_var MODEL_LOG
+echo_var MODEL_SERVER_URL
+echo_var PYWORKER_REPO
+echo_var PYWORKER_REF

 # Populate /etc/environment with quoted values
 if ! grep -q "VAST" /etc/environment; then
@@ -58,16 +56,32 @@ then
        source ~/.local/bin/env
    fi

-    # Fork testing
-    git clone "${PYWORKER_REPO:-https://github.com/vast-ai/pyworker}" "$SERVER_DIR"
-    if [[ -n ${PYWORKER_REF:-} ]]; then
-        (cd "$SERVER_DIR" && git checkout "$PYWORKER_REF")
+    if [[ ! -d $SERVER_DIR ]]; then
+        echo "Cloning worker repository..."
+        git clone --depth=1 "${PYWORKER_REPO:-https://github.com/vast-ai/pyworker}" "$SERVER_DIR"
    fi

-    uv venv --managed-python "$ENV_PATH" -p 3.10
+    if [[ -n ${PYWORKER_REF:-} ]]; then
+        echo "Checking out ref: $PYWORKER_REF"
+        (
+            cd "$SERVER_DIR"
+            git fetch --depth=1 origin "$PYWORKER_REF"
+            git checkout "$PYWORKER_REF"
+        )
+    fi
+
+    uv venv --python-preference only-managed "$ENV_PATH" -p 3.10
    source "$ENV_PATH/bin/activate"

+    # Install vast-sdk from server-side-sdk branch
+    echo "Installing vast-sdk from GitHub (server-side-sdk branch)..."
+    uv pip install "git+https://github.com/vast-ai/vast-sdk.git@server-side-sdk"
+
+    # Install requirements from worker repo if they exist
+    if [ -f "${SERVER_DIR}/requirements.txt" ]; then
+        echo "Installing additional dependencies from requirements.txt..."
        uv pip install -r "${SERVER_DIR}/requirements.txt"
+    fi

    touch ~/.no_auto_tmux
 else
@@ -77,7 +91,12 @@ else
    echo "venv: $VIRTUAL_ENV"
 fi

-[ ! -d "$SERVER_DIR/workers/$BACKEND" ] && echo "$BACKEND not supported!" && exit 1
+# Check that worker.py exists
+if [ ! -f "$SERVER_DIR/worker.py" ]; then
+    echo "ERROR: worker.py not found in $SERVER_DIR"
+    echo "Please ensure your PYWORKER_REPO contains a worker.py file"
+    exit 1
+fi

 if [ "$USE_SSL" = true ]; then

@@ -115,9 +134,6 @@ EOF
        POST "https://console.vast.ai/api/v0/sign_cert/?instance_id=$CONTAINER_ID" > /etc/instance.crt;
 fi

-
-
-
 export REPORT_ADDR WORKER_PORT USE_SSL UNSECURED

 cd "$SERVER_DIR"
@@ -128,5 +144,6 @@ echo "launching PyWorker server"
 # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
 [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"

-(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") &
+# Launch the SDK-based worker instead of the old backend system
+(python3 worker.py |& tee -a "$PYWORKER_LOG") &
 echo "launching PyWorker server done"
@@ -13,8 +13,9 @@ with open("workers/comfyui/misc/test_prompts.txt", "r") as f:
    test_prompts = f.readlines()

 def count_workload() -> float:
-    # Always 1.0 where there is a single instance of ComfyUI handling requests
-    return 1.0
+    # Always 100.0 where there is a single instance of ComfyUI handling requests
+    # Results will indicate % of a job completed per second.  Avoids sub 0.1 sec performance indication
+    return 100.0

@dataclasses.dataclass
 class ComfyWorkflowData(ApiPayload):
@@ -70,7 +70,7 @@ class ComfyWorkflowHandler(EndpointHandler[ComfyWorkflowData]):

    @property
    def healthcheck_endpoint(self) -> Optional[str]:
-        return None
+        return f"{MODEL_SERVER_URL}/health"

    @classmethod
    def payload_cls(cls) -> Type[ComfyWorkflowData]:
Author	SHA1	Message	Date
Lucas Armand	a7617162a7	spelling fix	2025-10-15 15:09:34 -07:00
Lucas Armand	d8f51a2edc	updated startserver	2025-10-15 12:14:31 -07:00
Lucas Armand	ee57ed207b	stat script	2025-10-14 17:38:06 -07:00
LucasArmandVast	c98d661513	Merge pull request #39 from vast-ai/remove-time-divide PyWorker fixes for cur_load and acks bug	2025-10-13 10:06:22 -07:00
Lucas Armand	f6fd1c6ac1	merge	2025-10-09 18:15:55 -07:00
Lucas Armand	055e346c8c	Send metrics on request start	2025-10-09 10:13:50 -07:00
Lucas Armand	1cedb28acf	Removed division by elapsed time, since autoscaler cur_load in units of workload	2025-10-08 16:54:18 -07:00
Colter-Downing	0397af719d	Merge pull request #37 from robballantyne/bugfix/healthcheck-endpoint Fix healthcheck endpoint URL Tested and merged by Colter	2025-10-06 15:11:27 -07:00
Rob Ballantyne	4fdc314fd9	Fix healthcheck endpoint URL	2025-10-06 22:16:09 +01:00
Colter-Downing	639d82f5b4	Merge pull request #35 from vast-ai/AUTO-664--Healthcheck-error Fix healthcheck with separate session	2025-10-02 12:51:19 -07:00
Colter Downing	25db78e39d	Fix healthcheck with separate session	2025-10-01 18:04:31 -07:00
Scott-Laytart	4e2f2311d0	Merge pull request #33 from vast-ai/comfy-blind-fix-override undo the fix for comfy yesterday.	2025-09-03 11:50:07 -07:00
abiola-vastai	38782d89bc	undo the fix for comfy yesterday.	2025-09-03 17:12:35 +00:00
Scott-Laytart	0185216ccb	Merge pull request #32 from vast-ai/blindhotfix_comfy_ui_default_port Blind hotfix to see if comfy UI default is needed. if it does work we…	2025-09-02 18:26:25 -07:00
abiola-vastai	b20d9e714c	Blind hotfix to see if comfy UI default is needed. if it does work we would revert back.	2025-09-03 01:20:09 +00:00
Rob Ballantyne	b1eb65d75d	Merge pull request #31 from vast-ai/bugfix/startup-script-20250901 Update uv venv creation command	2025-09-01 18:19:17 +01:00
Rob Ballantyne	1d09d7fe96	Update uv venv creation command	2025-09-01 16:55:20 +01:00
Colter-Downing	1b37054dec	Merge pull request #28 from vast-ai/bugfix/backend-timeout-infinite Bugfix/backend timeout infinite	2025-08-28 11:22:33 -07:00
Colter-Downing	1a1e4174b8	Merge pull request #29 from vast-ai/bugfix/comfyui-json-cost-fix Set cost to 100	2025-08-28 11:22:21 -07:00
Rob Ballantyne	b8377c4081	Set cost to 100	2025-08-28 16:13:17 +01:00