comfyui-json: key readiness off api-wrapper's BACKENDS_READY token

Rather than tailing for "Uvicorn running on", which only confirms the api-wrapper's own HTTP listener is bound, watch for the api-wrapper's new structured tokens that reflect actual end-to-end reachability: MODEL_LOAD_LOG_MSG = ["BACKENDS_READY"] MODEL_ERROR_LOG_MSGS includes: - "BACKENDS_READY_TIMEOUT" (backends never came up) - "BACKEND_UNRECOVERABLE" (CUDA fault latched on a backend) - "Application startup failed" (kept; uvicorn's own ASGI failure) Closes the race observed on a live test where the pyworker fired benchmark the moment uvicorn bound, every request inside the api-wrapper hit Cannot-connect-to-host on ComfyUI, and the SDK counted the resulting fast 502s as a fast worker (perf=200). Tokens are emitted by ai-dock/comfyui-api-wrapper#11 and onward; earlier wrapper versions won't emit BACKENDS_READY so warm-up stalls indefinitely — pin to a wrapper that includes that change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
comfyui-json: address PR #85 review
2026-05-08 09:46:45 +01:00 · 2026-05-07 18:25:21 +01:00 · 2026-05-07 12:46:17 +01:00 · 2026-05-07 12:03:19 +01:00 · 2026-05-07 11:59:30 +01:00 · 2026-05-07 11:54:20 +01:00
15 changed files with 1324 additions and 184 deletions
@@ -1,11 +1,2 @@
-aiohttp[speedups]==3.10.1
+vastai-sdk>=0.3.0
-anyio~=4.4
+nltk==3.9.4 
 lib~=4.0
 nltk~=3.9
 psutil~=6.0
 pycryptodome~=3.20
 Requests~=2.32
 transformers~=4.52
 utils==1.0.*
 hf_transfer>=0.1.9
 git+https://github.com/vast-ai/vast-sdk.git@session
@@ -2,10 +2,17 @@
 set -e -o pipefail
 # Check for force update flag
 FORCE_UPDATE=false
 if [ -f "/.force_update" ]; then
    echo "Force update flag detected at /.force_update"
    FORCE_UPDATE=true
 fi
 WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
 SERVER_DIR="$WORKSPACE_DIR/vast-pyworker"
-ENV_PATH="$WORKSPACE_DIR/worker-env"
+ENV_PATH="${ENV_PATH:-$WORKSPACE_DIR/worker-env}"
 DEBUG_LOG="$WORKSPACE_DIR/debug.log"
 PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
@@ -46,6 +53,42 @@ JSON
    exit 1
 }
 function install_vastai_sdk() {
    local uv_flags=()
    if [ "${USE_SYSTEM_PYTHON:-}" = "true" ]; then
        uv_flags+=(--system --break-system-packages)
    fi
    if [ "$FORCE_UPDATE" = true ]; then
        uv_flags+=(--force-reinstall)
        echo "Force reinstalling vastai"
    fi
    # If SDK_BRANCH is set, install vastai from the vast-cli repo at that branch/tag/commit.
    if [ -n "${SDK_BRANCH:-}" ]; then
        if [ -n "${SDK_VERSION:-}" ]; then
            echo "WARNING: Both SDK_BRANCH and SDK_VERSION are set; using SDK_BRANCH=${SDK_BRANCH}"
        fi
        echo "Installing vastai from https://github.com/vast-ai/vast-cli/ @ ${SDK_BRANCH}"
        if ! uv pip install "${uv_flags[@]}" "vastai @ git+https://github.com/vast-ai/vast-cli.git@${SDK_BRANCH}"; then
            report_error_and_exit "Failed to install vastai from vast-ai/vast-cli@${SDK_BRANCH}"
        fi
        return 0
    fi
    if [ -n "${SDK_VERSION:-}" ]; then
        echo "Installing vastai version ${SDK_VERSION}"
        if ! uv pip install "${uv_flags[@]}" "vastai==${SDK_VERSION}"; then
            report_error_and_exit "Failed to install vastai==${SDK_VERSION}"
        fi
        return 0
    fi
    echo "Installing default vastai"
    if ! uv pip install "${uv_flags[@]}" vastai; then
        report_error_and_exit "Failed to install vastai"
    fi
 }
 [ -n "$BACKEND" ] && [ -z "$HF_TOKEN" ] && report_error_and_exit "HF_TOKEN must be set when BACKEND is set!"
 [ -z "$CONTAINER_ID" ] && report_error_and_exit "CONTAINER_ID must be set!"
 [ "$BACKEND" = "comfyui" ] && [ -z "$COMFY_MODEL" ] && report_error_and_exit "For comfyui backends, COMFY_MODEL must be set!"
@@ -63,7 +106,8 @@ echo_var DEBUG_LOG
 echo_var PYWORKER_LOG
 echo_var MODEL_LOG
-if [ -e "$MODEL_LOG" ]; then
+ROTATE_MODEL_LOG="${ROTATE_MODEL_LOG:-false}"
 if [ "$ROTATE_MODEL_LOG" = "true" ] && [ -e "$MODEL_LOG" ]; then
    echo "Rotating model log at $MODEL_LOG to $MODEL_LOG.old"
    if ! cat "$MODEL_LOG" >> "$MODEL_LOG.old"; then
        report_error_and_exit "Failed to rotate model log"
@@ -84,8 +128,21 @@ if ! grep -q "VAST" /etc/environment; then
    fi
 fi
-if [ ! -d "$ENV_PATH" ]
+if [ "${USE_SYSTEM_PYTHON:-}" = "true" ]; then
-then
+    echo "Using system Python: $(which python3)"
    if ! which uv > /dev/null 2>&1; then
        if ! curl -LsSf https://astral.sh/uv/install.sh | sh; then
            report_error_and_exit "Failed to install uv package manager"
        fi
        if [[ -f ~/.local/bin/env ]]; then
            if ! source ~/.local/bin/env; then
                report_error_and_exit "Failed to source uv environment"
            fi
        fi
    fi
    install_vastai_sdk
    touch ~/.no_auto_tmux
 elif [ ! -d "$ENV_PATH" ]; then
    echo "setting up venv"
    if ! which uv; then
        if ! curl -LsSf https://astral.sh/uv/install.sh | sh; then
@@ -104,12 +161,29 @@ then
        if ! git clone "${PYWORKER_REPO:-https://github.com/vast-ai/pyworker}" "$SERVER_DIR"; then
            report_error_and_exit "Failed to clone pyworker repository"
        fi
    elif [ "$FORCE_UPDATE" = true ]; then
        echo "Force updating pyworker repository"
        if ! (cd "$SERVER_DIR" && git fetch --all); then
            report_error_and_exit "Failed to fetch pyworker repository updates"
        fi
    fi
    if [[ -n ${PYWORKER_REF:-} ]]; then
        if [ "$FORCE_UPDATE" = true ]; then
            echo "Force updating to pyworker reference: $PYWORKER_REF"
            if ! (cd "$SERVER_DIR" && git checkout "$PYWORKER_REF" && git pull); then
                report_error_and_exit "Failed to force update pyworker reference: $PYWORKER_REF"
            fi
        else
            if ! (cd "$SERVER_DIR" && git checkout "$PYWORKER_REF"); then
                report_error_and_exit "Failed to checkout pyworker reference: $PYWORKER_REF"
            fi
        fi
    elif [ "$FORCE_UPDATE" = true ]; then
        echo "Force updating pyworker to latest"
        if ! (cd "$SERVER_DIR" && git pull); then
            report_error_and_exit "Failed to pull latest pyworker changes"
        fi
    fi
    if ! uv venv --python-preference only-managed "$ENV_PATH" -p 3.10; then
        report_error_and_exit "Failed to create virtual environment"
@@ -123,6 +197,8 @@ then
        report_error_and_exit "Failed to install Python requirements"
    fi
    install_vastai_sdk
    if ! touch ~/.no_auto_tmux; then
        report_error_and_exit "Failed to create ~/.no_auto_tmux"
    fi
@@ -132,11 +208,44 @@ else
            report_error_and_exit "Failed to source uv environment"
        fi
    fi
-    if ! source "$WORKSPACE_DIR/worker-env/bin/activate"; then
+    if ! source "$ENV_PATH/bin/activate"; then
        report_error_and_exit "Failed to activate existing virtual environment"
    fi
    echo "environment activated"
    echo "venv: $VIRTUAL_ENV"
    # Handle force update for existing environment
    if [ "$FORCE_UPDATE" = true ]; then
        echo "Performing force update on existing environment"
        if [[ -d $SERVER_DIR ]]; then
            echo "Force updating pyworker repository"
            if ! (cd "$SERVER_DIR" && git fetch --all); then
                report_error_and_exit "Failed to fetch pyworker repository updates"
            fi
            if [[ -n ${PYWORKER_REF:-} ]]; then
                echo "Force updating to pyworker reference: $PYWORKER_REF"
                if ! (cd "$SERVER_DIR" && git checkout "$PYWORKER_REF" && git pull); then
                    report_error_and_exit "Failed to force update pyworker reference: $PYWORKER_REF"
                fi
            else
                echo "Force updating pyworker to latest"
                if ! (cd "$SERVER_DIR" && git pull); then
                    report_error_and_exit "Failed to pull latest pyworker changes"
                fi
            fi
        fi
        install_vastai_sdk
    fi
 fi
 # Remove force update flag after successful update
 if [ "$FORCE_UPDATE" = true ]; then
    echo "Removing force update flag"
    rm -f "/.force_update"
    echo "Force update completed successfully"
 fi
 if [ "$USE_SSL" = true ]; then
@@ -174,16 +283,51 @@ EOF
        report_error_and_exit "Failed to generate SSL certificate request"
    fi
-    if ! curl --header 'Content-Type: application/octet-stream' \
+    max_retries=5
    retry_delay=2
    for attempt in $(seq 1 "$max_retries"); do
        http_code=$(curl -sS -o /etc/instance.crt -w '%{http_code}' \
            --header 'Content-Type: application/octet-stream' \
            --data-binary @/etc/instance.csr \
-        -X \
+            -X POST "https://console.vast.ai/api/v0/sign_cert/?instance_id=$CONTAINER_ID")
-        POST "https://console.vast.ai/api/v0/sign_cert/?instance_id=$CONTAINER_ID" > /etc/instance.crt; then
+        if [ "$http_code" -ge 200 ] && [ "$http_code" -lt 300 ]; then
-        report_error_and_exit "Failed to sign SSL certificate"
+            break
        fi
        echo "SSL cert signing attempt $attempt/$max_retries failed (HTTP $http_code)"
        if [ "$attempt" -eq "$max_retries" ]; then
            report_error_and_exit "Failed to sign SSL certificate after $max_retries attempts (HTTP $http_code)"
        fi
        sleep "$retry_delay"
        retry_delay=$((retry_delay * 2))
    done
 fi
 export REPORT_ADDR WORKER_PORT USE_SSL UNSECURED
 # ─── SDK Deployment Mode ───────────────────────────────────────────────
 if [ "$IS_DEPLOYMENT" = "true" ]; then
    echo "=== SDK Deployment Mode ==="
    echo "DEPLOYMENT_ID: $DEPLOYMENT_ID"
    DEPLOY_DIR="/workspace/deployment"
    mkdir -p "$DEPLOY_DIR"
    VAST_API_BASE="${VAST_API_BASE:-https://console.vast.ai}"
    # Download deployment code, retrying until the blob is available on S3.
    # The s3_key exists in the DB as soon as the deployment is created, but the
    # actual upload may still be in flight from the client side.
    # Install SDK (uses the install_vastai_sdk function which supports SDK_BRANCH/SDK_VERSION)
    install_vastai_sdk
    # Run deployment in serve mode
    export VAST_DEPLOYMENT_MODE=serve
    echo "Starting deployment: python3 $DEPLOY_DIR/deployment.py"
    serve-vast-deployment
    exit $?
 fi
 # ─── End SDK Deployment Mode ───────────────────────────────────────────
 if ! cd "$SERVER_DIR"; then
    report_error_and_exit "Failed to cd into SERVER_DIR: $SERVER_DIR"
 fi
@@ -195,19 +339,19 @@ set +e
 PY_STATUS=1
 if [ -f "$SERVER_DIR/worker.py" ]; then
-    echo "trying worker.py"
+    echo "Running worker.py"
    python3 -m "worker" |& tee -a "$PYWORKER_LOG"
    PY_STATUS=${PIPESTATUS[0]}
 fi
 if [ "${PY_STATUS}" -ne 0 ] && [ -f "$SERVER_DIR/workers/$BACKEND/worker.py" ]; then
-    echo "trying workers.${BACKEND}.worker"
+    echo "Running workers.${BACKEND}.worker"
    python3 -m "workers.${BACKEND}.worker" |& tee -a "$PYWORKER_LOG"
    PY_STATUS=${PIPESTATUS[0]}
 fi
 if [ "${PY_STATUS}" -ne 0 ] && [ -f "$SERVER_DIR/workers/$BACKEND/server.py" ]; then
-    echo "trying workers.${BACKEND}.server"
+    echo "Running workers.${BACKEND}.server"
    python3 -m "workers.${BACKEND}.server" |& tee -a "$PYWORKER_LOG"
    PY_STATUS=${PIPESTATUS[0]}
 fi
@@ -221,4 +365,4 @@ if [ "${PY_STATUS}" -ne 0 ]; then
    report_error_and_exit "PyWorker exited with status ${PY_STATUS}"
 fi
-echo "launching PyWorker server done"
+echo "PyWorker bootstrap complete"
@@ -2,7 +2,7 @@
 This is the PyWorker implementation for running **ACE Step v1 3.5B** text-to-music workflows in ComfyUI. It provides a unified interface for executing complete ComfyUI audio-generation workflows through a proxy-based architecture and returning generated audio assets.
-Each request has a static cost of `100`. ComfyUI does not support concurrent workloads, and there is no provision to run multiple ComfyUI instances per worker node.
+Each request has a static cost of `1000`. ComfyUI does not support concurrent workloads, and there is no provision to run multiple ComfyUI instances per worker node.
 ## Requirements
@@ -1,15 +1,159 @@
 # ComfyUI PyWorker
-This is the base PyWorker for ComfyUI. It provides a unified interface for running any ComfyUI workflow through a proxy-based architecture.
+This is the base PyWorker for ComfyUI. It provides a unified interface for running any ComfyUI workflow through a proxy-based architecture. See the [Serverless documentation](https://docs.vast.ai/serverless) for guides and how-to's.
 The cost for each request has a static value of `100`. ComfyUI does not handle concurrent workloads and there is no current provision to load multiple instances of ComfyUI per worker node.
 ## Instance Setup
 1. Pick a template
 - [ComfyUI (Serverless)](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=ComfyUI%20(Serverless))
 2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface.
 ## Requirements
 This worker requires both [ComfyUI](https://github.com/comfyanonymous/ComfyUI) and [ComfyUI API Wrapper](https://github.com/ai-dock/comfyui-api-wrapper).
 A docker image is provided but you may use any if the above requirements are met.
 ## Client
 The client demonstrates how to use the Vast Serverless SDK to generate images, save them locally, and optionally upload to S3-compatible storage.
 ### Setup
 1. Clone the PyWorker repository to your local machine and install the necessary requirements for running the test client.
 ```bash
 git clone https://github.com/vast-ai/pyworker
 cd pyworker
 pip install uv
 uv venv -p 3.12
 source .venv/bin/activate
 uv pip install -r requirements.txt
 ```
 2. Set your API key:
 ```bash
 export VAST_API_KEY=<your_api_key>
 ```
 ### Usage
 ```bash
 # Default prompt
 python -m workers.comfyui-json.client
 # Custom prompt
 python -m workers.comfyui-json.client --prompt "a cat sitting on a rainbow"
 # With options
 python -m workers.comfyui-json.client --prompt "sunset" --width 1024 --height 1024 --steps 30
 # Using a custom workflow file
 python -m workers.comfyui-json.client --workflow my_workflow.json
 # With S3 upload
 python -m workers.comfyui-json.client --s3
 ```
 ### CLI Flags
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--endpoint` | `my-comfyui-endpoint` | Vast endpoint name |
 | `--prompt` | (default) | Text prompt for image generation |
 | `--workflow` | (none) | Path to custom workflow JSON file |
 | `--width` | 512 | Image width in pixels |
 | `--height` | 512 | Image height in pixels |
 | `--steps` | 20 | Number of denoising steps |
 | `--seed` | (random) | Random seed for reproducibility |
 | `--s3` | (disabled) | Upload generated images to S3 |
 ### Output
 Images are saved to `./generated_images/comfy_{seed}.png`.
 ### S3 Upload (Optional)
 You can optionally upload generated images to an S3-compatible storage service (AWS S3, Cloudflare R2, Backblaze B2, etc.) by using the `--s3` flag.
 **1. Set environment variables:**
 ```bash
 export S3_ENDPOINT_URL="https://your-account.r2.cloudflarestorage.com"
 export S3_BUCKET_NAME="my-bucket"
 export S3_ACCESS_KEY_ID="your-access-key-id"
 export S3_SECRET_ACCESS_KEY="your-secret-access-key"
 ```
 **2. Run with S3 upload enabled:**
 ```bash
 python -m workers.comfyui-json.client --prompt "a beautiful landscape" --s3
 ```
 Images will be saved locally AND uploaded to `s3://{bucket}/comfyui/{filename}`.
 **Note:** Requires `boto3` (`pip install boto3`).
 ## Benchmarking
 ### Custom Benchmark Workflows
 You can provide a custom ComfyUI workflow for benchmarking. This allows you to test performance using your preferred models and workflow complexity.
 **Ways to provide the benchmark file** (in resolution order — first match wins):
 1. **Fork this repository** and commit your workflow to `workers/comfyui-json/misc/benchmark.json`.
 2. **Write the file during provisioning** to a path *outside* the pyworker tree (e.g. `/workspace/benchmark.json`) and export `BENCHMARK_JSON_PATH` so the worker can find it. The pyworker repo is cloned by `start_server.sh` *after* provisioning runs, so provisioning cannot write into `misc/` directly — the destination would be clobbered, or the clone would fail.
 3. **Run on the vast.ai ComfyUI base image.** Its `convert-workflows.sh` maintains `/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json` as a symlink to the first provisioned workflow; the worker reads this automatically when neither of the above is set. No env var required.
 If `BENCHMARK_JSON_PATH` is set but points at a missing or unreadable file, the worker logs a warning and falls through to the next tier rather than going straight to the SD1.5 fallback.
 An example workflow is provided at `workers/comfyui-json/misc/benchmark.json.example`. To ensure varied generations, use the placeholder `__RANDOM_INT__` in place of static seed values — it will be replaced with a random integer for each benchmark run.
 ### Default Benchmark (Fallback)
 If `benchmark.json` is not available, a simple image generation benchmark runs when each worker initializes. This validates GPU performance and helps identify underperforming machines.
 The default benchmark uses Stable Diffusion v1.5 with ComfyUI's standard text-to-image workflow. Configure it using these environment variables:
 | Environment Variable | Default Value | Description |
 | -------------------- | ------------- | ----------- |
 | BENCHMARK_JSON_PATH | (unset) | Path to a custom workflow file outside the pyworker tree. Used if `misc/benchmark.json` is absent. Falls through to `/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json` if set but missing. |
 | BENCHMARK_TEST_WIDTH | 512 | Fallback benchmark: image width (pixels) |
 | BENCHMARK_TEST_HEIGHT | 512 | Fallback benchmark: image height (pixels) |
 | BENCHMARK_TEST_STEPS | 20 | Fallback benchmark: number of denoising steps |
 Each benchmark run uses a random prompt from `misc/test_prompts.txt` and a random seed to ensure consistent GPU load patterns.
 #### Calibrating Fallback Benchmark Duration
 To screen for underperforming hardware, set `BENCHMARK_TEST_STEPS` to match your expected production workflow duration. This allows you to identify machines that won't meet performance requirements.
 **Example:** If your typical workflow should complete in 90 seconds on acceptable hardware:
 ```bash
 # 1. Measure it/sec on your reference machine
 # RTX 4090 typically achieves ~43 it/sec with SD1.5
 # 2. Calculate required steps
 # 90 seconds × 43 it/sec = 3870 steps
 # 3. Configure benchmark
 export BENCHMARK_TEST_STEPS=3870
 # 4. Machines completing significantly slower than 90s indicate hardware issues
 ```
 **Performance expectations:**
 - Benchmark duration should remain consistent across identical GPU models
 - Significant variation (>20%) may indicate thermal, power, or configuration issues
 ## Endpoint
 The worker provides a single endpoint:
@@ -1,34 +1,312 @@
 import os
 import sys
 import json
 import uuid
 import random
 import asyncio
-import random
+import logging
 import argparse
 import aiohttp
 from vastai import Serverless
-async def main():
+# ---------------------- Config ----------------------
-    async with Serverless() as client:
+DEFAULT_PROMPT = "a beautiful sunset over mountains, digital art, highly detailed"
-        endpoint = await client.get_endpoint(name="my-comfy-endpoint") # Change this to your endpoint name
+ENDPOINT_NAME = "my-comfyui-endpoint"
 DEFAULT_WIDTH = 512
 DEFAULT_HEIGHT = 512
 DEFAULT_STEPS = 20
 COST = 100  # Fixed cost for ComfyUI requests
 # Optional S3 Configuration (from environment variables)
 S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL")
 S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
 S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID")
 S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY")
 logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
 log = logging.getLogger(__name__)
 def get_s3_client():
    """Create and return an S3 client configured for the S3-compatible endpoint"""
    try:
        import boto3
        from botocore.config import Config
    except ImportError:
        log.error("boto3 is required for S3 uploads. Install with: pip install boto3")
        return None
    if not all([S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY]):
        log.error("S3 environment variables not fully configured. Required:")
        log.error("  S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY")
        return None
    return boto3.client(
        "s3",
        endpoint_url=S3_ENDPOINT_URL,
        aws_access_key_id=S3_ACCESS_KEY_ID,
        aws_secret_access_key=S3_SECRET_ACCESS_KEY,
        config=Config(signature_version="s3v4"),
    )
 # ---------------------- API Functions ----------------------
 async def call_generate(
    client: Serverless,
    *,
    endpoint_name: str,
    prompt: str,
    width: int,
    height: int,
    steps: int,
    seed: int,
 ) -> dict:
    """Generate image using Text2Image modifier"""
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "input": {
            "request_id": str(uuid.uuid4()),
            "modifier": "Text2Image",
            "modifications": {
-                    "prompt": "a beautiful landscape with mountains and lakes",
+                "prompt": prompt,
-                    "width": 1024,
+                "width": width,
-                    "height": 1024,
+                "height": height,
-                    "steps": 20,
+                "steps": steps,
-                    "seed": random.randint(0, 2**32 - 1)
+                "seed": seed,
            },
                "workflow_json": {}  # Empty since using modifier approach
        }
    }
    return await endpoint.request("/generate/sync", payload, cost=COST)
        response = await endpoint.request("/generate/sync", payload, cost=100)
-        # Get the file from the path on the local machine using SCP or SFTP
+async def call_generate_workflow(
-        # or configure S3 to upload to cloud storage.
+    client: Serverless,
-        print(response["response"]["output"][0]["local_path"])
+    *,
    endpoint_name: str,
    workflow_json: dict,
 ) -> dict:
    """Generate using custom workflow JSON"""
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "input": {
            "request_id": str(uuid.uuid4()),
            "workflow_json": workflow_json,
        }
    }
    return await endpoint.request("/generate/sync", payload, cost=COST)
 # ---------------------- Demo Class ----------------------
 class APIDemo:
    def __init__(self, client: Serverless, endpoint_name: str, upload_s3: bool = False):
        self.client = client
        self.endpoint_name = endpoint_name
        self.upload_s3 = upload_s3
        self.s3_client = get_s3_client() if upload_s3 else None
        if upload_s3 and not self.s3_client:
            log.warning("S3 upload requested but client creation failed. Images will only be saved locally.")
    def extract_filename(self, response: dict) -> str | None:
        """Extract the generated image filename from ComfyUI response"""
        if "comfyui_response" in response:
            for data in response["comfyui_response"].values():
                if isinstance(data, dict) and "outputs" in data:
                    for node_output in data["outputs"].values():
                        if "images" in node_output and node_output["images"]:
                            return node_output["images"][0].get("filename")
        return None
    async def save_image(self, worker_url: str, filename: str, local_name: str) -> str | None:
        """Fetch and save image locally from the worker, optionally upload to S3"""
        os.makedirs("generated_images", exist_ok=True)
        return await self._fetch_image(worker_url, filename, local_name)
    def _upload_to_s3(self, local_path: str, s3_key: str) -> str | None:
        """Upload a local file to S3 and return the S3 URL"""
        if not self.s3_client:
            return None
        try:
            self.s3_client.upload_file(
                local_path,
                S3_BUCKET_NAME,
                s3_key,
                ExtraArgs={"ContentType": "image/png"}
            )
            s3_url = f"{S3_ENDPOINT_URL}/{S3_BUCKET_NAME}/{s3_key}"
            print(f"  ☁️  Uploaded to S3: {s3_key}")
            return s3_url
        except Exception as e:
            log.error(f"Failed to upload to S3: {e}")
            return None
    async def _fetch_image(self, worker_url: str, filename: str, local_name: str) -> str | None:
        """Fetch image from worker's /view endpoint and save locally"""
        if not worker_url:
            return None
        try:
            url = f"{worker_url}/view"
            params = {"filename": filename, "type": "output"}
            async with aiohttp.ClientSession() as session:
                async with session.get(url, params=params, ssl=False) as resp:
                    if resp.status == 200:
                        path = f"generated_images/{local_name}"
                        image_data = await resp.read()
                        with open(path, "wb") as f:
                            f.write(image_data)
                        print(f"  💾 Saved: {path}")
                        # Upload to S3 if enabled
                        if self.upload_s3 and self.s3_client:
                            s3_key = f"comfyui/{local_name}"
                            self._upload_to_s3(path, s3_key)
                        return path
            return None
        except Exception:
            return None
    async def demo_prompt(
        self,
        prompt: str,
        width: int,
        height: int,
        steps: int,
        seed: int | None,
    ):
        """Demo: Generate image from text prompt"""
        print("=" * 60)
        print("COMFYUI TEXT-TO-IMAGE DEMO")
        print("=" * 60)
        if seed is None:
            seed = random.randint(0, 2**32 - 1)
        print(f"Prompt: {prompt[:100]}..." if len(prompt) > 100 else f"Prompt: {prompt}")
        print(f"Size: {width}x{height}, Steps: {steps}, Seed: {seed}")
        print("\n🎨 Generating image...")
        response = await call_generate(
            self.client,
            endpoint_name=self.endpoint_name,
            prompt=prompt,
            width=width,
            height=height,
            steps=steps,
            seed=seed,
        )
        print("\n✅ Generation complete!")
        # Get worker URL for fetching images
        worker_url = response.get("url", "")
        print(f"Worker URL: {worker_url}")
        # Fetch and save image
        if "response" in response:
            filename = self.extract_filename(response["response"])
            if filename:
                path = await self.save_image(worker_url, filename, f"comfy_{seed}.png")
                if not path:
                    print(f"❌ Failed to fetch image")
            else:
                print("❌ No image in response")
        else:
            print("❌ Unexpected response format")
    async def demo_workflow(self, workflow_file: str):
        """Demo: Generate using custom workflow file"""
        print("=" * 60)
        print("COMFYUI CUSTOM WORKFLOW DEMO")
        print("=" * 60)
        if not os.path.exists(workflow_file):
            log.error(f"Workflow file not found: {workflow_file}")
            return
        with open(workflow_file, "r") as f:
            workflow_json = json.load(f)
        print(f"Workflow: {workflow_file}")
        print("\n🎨 Generating...")
        response = await call_generate_workflow(
            self.client,
            endpoint_name=self.endpoint_name,
            workflow_json=workflow_json,
        )
        print("\n✅ Generation complete!")
        worker_url = response.get("url", "")
        if "response" in response:
            filename = self.extract_filename(response["response"])
            if filename:
                path = await self.save_image(worker_url, filename, "workflow.png")
                if not path:
                    print(f"❌ Failed to fetch image")
            else:
                print("❌ No image in response")
        else:
            print("❌ Unexpected response format")
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast ComfyUI-JSON Demo (Serverless SDK)")
    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
    p.add_argument("--prompt", type=str, default=DEFAULT_PROMPT, metavar="TEXT",
                   help=f"Prompt text (default: '{DEFAULT_PROMPT[:30]}...')")
    p.add_argument("--workflow", type=str, metavar="FILE", help="Use custom workflow JSON file instead")
    p.add_argument("--width", type=int, default=DEFAULT_WIDTH, help=f"Image width (default: {DEFAULT_WIDTH})")
    p.add_argument("--height", type=int, default=DEFAULT_HEIGHT, help=f"Image height (default: {DEFAULT_HEIGHT})")
    p.add_argument("--steps", type=int, default=DEFAULT_STEPS, help=f"Steps (default: {DEFAULT_STEPS})")
    p.add_argument("--seed", type=int, default=None, help="Seed (default: random)")
    p.add_argument("--s3", action="store_true", 
                   help="Upload generated images to S3 (requires S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY env vars)")
    return p
 async def main_async():
    args = build_arg_parser().parse_args()
    print("=" * 60)
    print(f"Using endpoint: {args.endpoint}")
    if args.s3:
        print(f"S3 upload: enabled (bucket: {S3_BUCKET_NAME})")
    try:
        async with Serverless() as client:
            demo = APIDemo(client, args.endpoint, upload_s3=args.s3)
            if args.workflow:
                await demo.demo_workflow(workflow_file=args.workflow)
            else:
                await demo.demo_prompt(
                    prompt=args.prompt,
                    width=args.width,
                    height=args.height,
                    steps=args.steps,
                    seed=args.seed,
                )
    except AttributeError as e:
        if "API key" in str(e):
            log.error("API key missing. Set VAST_API_KEY environment variable.")
        else:
            log.error(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        log.error(f"Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main_async())
@@ -0,0 +1,107 @@
 {
    "3": {
        "inputs": {
            "seed": "__RANDOM_INT__",
            "steps": 20,
            "cfg": 8,
            "sampler_name": "euler",
            "scheduler": "normal",
            "denoise": 1,
            "model": [
            "4",
            0
            ],
            "positive": [
            "6",
            0
            ],
            "negative": [
            "7",
            0
            ],
            "latent_image": [
            "5",
            0
            ]
        },
        "class_type": "KSampler",
        "_meta": {
            "title": "KSampler"
        }
    },
    "4": {
        "inputs": {
            "ckpt_name": "v1-5-pruned-emaonly-fp16.safetensors"
        },
        "class_type": "CheckpointLoaderSimple",
        "_meta": {
            "title": "Load Checkpoint"
        }
    },
    "5": {
        "inputs": {
            "width": 512,
            "height": 512,
            "batch_size": 1
        },
        "class_type": "EmptyLatentImage",
        "_meta": {
            "title": "Empty Latent Image"
        }
    },
    "6": {
        "inputs": {
            "text": "beautiful scenery nature glass bottle landscape, , purple galaxy bottle,",
            "clip": [
            "4",
            1
            ]
        },
        "class_type": "CLIPTextEncode",
        "_meta": {
            "title": "CLIP Text Encode (Prompt)"
        }
    },
    "7": {
        "inputs": {
            "text": "text, watermark",
            "clip": [
            "4",
            1
            ]
        },
        "class_type": "CLIPTextEncode",
        "_meta": {
            "title": "CLIP Text Encode (Prompt)"
        }
    },
    "8": {
        "inputs": {
            "samples": [
            "3",
            0
            ],
            "vae": [
            "4",
            2
            ]
        },
        "class_type": "VAEDecode",
        "_meta": {
            "title": "VAE Decode"
        }
    },
    "9": {
        "inputs": {
            "filename_prefix": "ComfyUI",
            "images": [
            "8",
            0
            ]
        },
        "class_type": "SaveImage",
        "_meta": {
            "title": "Save Image"
        }
    }
 }
@@ -0,0 +1,34 @@
 cartoon character of a person with a hoodie , in style of cytus and deemo, ork, gold chains, realistic anime cat, dripping black goo, lineage revolution style, thug life, cute anthropomorphic bunny, balrog, arknights, aliased, very buff, black and red and yellow paint, painting illustration collage style, character composition in vector with white background
 stardew valley, fine details
 2D Vector Illustration of a child with soccer ball Art for Sublimation, Design Art, Chrome Art, Painting and Stunning Artwork, Highly Detailed Digital Painting, Airbrush Art, Highly Detailed Digital Artwork, Dramatic Artwork, stained antique yellow copper paint, digital airbrush art, detailed by Mark Brooks, Chicano airbrush art, Swagger! snake Culture
 realistic futuristic city-downtown with short buildings, sunset
 seascape by Ray Collins and artgerm, front view of a perfect wave, sunny background, ultra detailed water
 inspired by realflow-cinema4d editor features, create image of a transparent luxury cup with ice fruits and mint, connected with white, yellow and pink cream, Slow - High Speed MO Photography, YouTube Video Screenshot, Abstract Clay, Transparent Cup , molecular gastronomy, wheel, 3D fluid,Simulation rendering, still video, 4k polymer clay futras photography, very surreal, Houdini Fluid Simulation, hyperrealistic CGI and FLUIDS & MULTIPHYSICS SIMULATION effect, with Somali Stain Lurex, Metallic Jacquard, Gold Thread, Mulberry Silk, Toub Saree, Warm background, a fantastic image worthy of an award.
 biker with backpack on his back riding a motorcycle, Style by Ade Santora, Oilpunk, Cover photo, craig mullins style, on the cover of a magazine, Outdoor Magazine, inspired by Alex Petruk APe, image of a male biker, Cover of an award-winning magazine, the man has a backpack, photo for magazine, with a backpack, magazine cover
 generate a collage-style illustration inspired by the Procreate raster graphic editor, photographic illustration with the theme, 2D vector, art for textile sublimation, containing surrealistic cartoon cat wearing a baseball cap and jeans standing in front of a poster, inspired by Sadao Watanabe, Doraemon, Japanese cartoon style, Eichiro Oda, Iconic high detail character, Director: Nakahara Nantenbō, Kastuhiro Otomo, image detailed, by Miyamoto, Hidetaka Miyazaki, Katsuhiro illustration, 8k, masterpiece, Minimize noise and grain in photo quality without lose quality and increase brightness and lighting,Symmetry and Alignment, Avoid asymmetrical shapes and out-of-focus points. Focus and Sharpness: Make sure the image is focused and sharp and encourages the viewer to see it as a work of art printed on fabric.
 fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details
 armored hero with a glowing axe, mecha science_fiction, jungle background, dynamic lighting, detailed shading, digital texture painting, masterpiece, studio quality, 6k
 elderly figure in a leather jacket DJing in a smoky nightclub, mixing live on a giant console, dramatic stage lighting, a masterpiece
 elderly figure in a leather jacket on a motorcycle, magazine cover lighting, a masterpiece
 a young pilot ordering a burger and fries from a futuristic space cantina
 I want to generate a group avatar for a Feishu group chat. The role of this group is daily software technical communication. Now the subject technology stacks that members of this group discuss daily include: algorithms, data structures, optimization, functional programming, and the programming languages often discussed are: TypeScript, Java, python, etc. I hope this avatar has a simple aesthetic, this avatar is a single person avatar
 portrait Anime black girl cute-fine-face, pretty face, realistic shaded Perfect face, fine details. Anime. realistic shaded lighting by Ilya Kuvshinov Giuseppe Dangelico Pino and Michael Garmash and Rob Rey, IAMAG premiere, WLOP matte print, cute freckles, masterpiece
 young woman in modern fashion editorial, beige miniskirt and dark brown turtleneck sweater, soft studio lighting, brown hair, grey eyes, fine details, magazine cover style, a masterpiece
 Cute small cat sitting in a movie theater eating chicken wiggs watching a movie ,unreal engine, cozy indoor lighting, artstation, detailed, digital painting,cinematic,character design by mark ryden and pixar and hayao miyazaki, unreal 5, daz, hyperrealistic, octane render
 Cute small dog sitting in a movie theater eating popcorn watching a movie ,unreal engine, cozy indoor lighting, artstation, detailed, digital painting,cinematic,character design by mark ryden and pixar and hayao miyazaki, unreal 5, daz, hyperrealistic, octane render
 fox bracelet made of buckskin with fox features, rich details, fine carvings, studio lighting
 crane buckskin bracelet with crane features, rich details, fine carvings, studio lighting
 london luxurious interior living-room, light walls
 Parisian luxurious interior penthouse bedroom, dark walls, wooden panels
 cute girl, crop-top, blond hair, black glasses, stretching, with background by greg rutkowski makoto shinkai kyoto animation key art feminine mid shot
 houses in front, houses background, straight houses, digital art, smooth, sharp focus, gravity falls style, doraemon style, shinchan style, anime style
 Simplified technical drawing, Leonardo da Vinci, Mechanical Dinosaur Skeleton, Minimalistic annotations, Hand-drawn illustrations, Basic design and engineering, Wonder and curiosity
 High quality 8K painting impressionist style of a Japanese modern city street with a girl on the foreground wearing a traditional wedding dress with a fox mask, staring at the sky, daylight
 a landscape from the Moon with the Earth setting on the horizon, realistic, detailed
 Isometric Atlantis city,great architecture with columns, great details, ornaments,seaweed, blue ambiance, 3D cartoon style, soft light, 45° view
 A hyper realistic avatar of a guy riding on a black honda cbr 650r in leather suit,high detail, high quality,8K,photo realism
 the street of amedieval fantasy town, at dawn, dark, highly detailed
 overwhelmingly beautiful eagle framed with vector flowers, long shiny wavy flowing hair, polished, ultra detailed vector floral illustration mixed with hyper realism, muted pastel colors, vector floral details in background, muted colors, hyper detailed ultra intricate overwhelming realism in detailed complex scene with magical fantasy atmosphere, no signature, no watermark
 a highly detailed matte painting of a man on a hill watching a rocket launch in the distance by studio ghibli, makoto shinkai, by artgerm, by wlop, by greg rutkowski, volumetric lighting, octane render, 4 k resolution, trending on artstation, masterpiece | hyperrealism| highly detailed| insanely detailed| intricate| cinematic lighting| depth of field
 electronik robot and ofice ,unreal engine, cozy indoor lighting, artstation, detailed, digital painting,cinematic,character design by mark ryden and pixar and hayao miyazaki, unreal 5, daz, hyperrealistic, octane render
 exquisitely intricately detailed illustration, of a small world with a lake and a rainbow, inside a closed glass jar.
@@ -1,60 +1,225 @@
 """ComfyUI worker for the vast.ai PyWorker SDK.
 Each worker runs a benchmark on warm-up. The payload is selected as follows:
  1. If ``misc/benchmark.json`` exists in the cloned worker tree, it is
     used as a custom ComfyUI workflow. Use this if you fork the repo and
     bake in your workflow.
  2. Else, if ``$BENCHMARK_JSON_PATH`` is set and points at a readable
     file, it is used. Use this from a provisioning script — provisioning
     runs before pyworker is cloned, so it cannot write into ``misc/``,
     but it can drop the workflow elsewhere (e.g. ``/workspace/``) and
     export this env var.
  3. Else, if the well-known path
     ``/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json`` exists,
     it is used. The vast.ai ComfyUI base image's ``convert-workflows.sh``
     maintains this as a symlink to the first provisioned workflow, so on
     that image no env var is needed.
  4. Otherwise an SD1.5 Text2Image fallback runs, parameterised by the
     ``BENCHMARK_TEST_{WIDTH,HEIGHT,STEPS}`` env vars and a random prompt
     from ``misc/test_prompts.txt``.
 ``__RANDOM_INT__`` placeholders in custom workflows are substituted
 server-side by ai-dock/comfyui-api-wrapper, so this worker does not handle
 them itself.
 """
 import json
 import logging
 import os
 import random
 import sys
 from pathlib import Path
 from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig
-# ComyUI model configuration
+# ComfyUI model configuration. The model server is ai-dock's
 # comfyui-api-wrapper sitting in front of ComfyUI itself, not ComfyUI's
 # own port (18188). We tail the api-wrapper's log rather than ComfyUI's
 # and key off the api-wrapper's own structured readiness/fault signals:
 #
 #   BACKENDS_READY            — api-wrapper has confirmed every ComfyUI
 #                               backend passes HTTP+WS probes. Until
 #                               this fires, posting to /generate/sync
 #                               can hit "Cannot connect to host" inside
 #                               the api-wrapper, which the SDK can't
 #                               recover from since __call_backend
 #                               doesn't retry connection-refused.
 #   BACKENDS_READY_TIMEOUT    — backends never reachable within
 #                               api-wrapper's deadline. Worker is
 #                               unrecoverable; mark errored.
 #   BACKEND_UNRECOVERABLE     — CUDA fault / illegal memory access on a
 #                               backend's GPU. Same fate.
 #   Application startup failed — uvicorn's own ASGI lifespan failed.
 #
 # These tokens are emitted by ai-dock/comfyui-api-wrapper >= the
 # "feat/backend-readiness-log-signals" change. Older wrappers won't
 # emit BACKENDS_READY, so warm-up will stall — pin the wrapper version
 # accordingly.
 MODEL_SERVER_URL           = 'http://127.0.0.1'
 MODEL_SERVER_PORT          = 18288
-MODEL_LOG_FILE             = '/var/log/portal/comfyui.log'
+MODEL_LOG_FILE             = '/var/log/portal/api-wrapper.log'
 MODEL_HEALTHCHECK_ENDPOINT = "/health"
-# ComyUI-specific log messages
+# Trigger benchmark only after the full stack (api-wrapper + ComfyUI
 # backends) is reachable. See BACKENDS_READY in the comment above.
 MODEL_LOAD_LOG_MSG = [
-    "To see the GUI go to: "
+    "BACKENDS_READY",
 ]
 # LogAction.ModelError is fatal: the SDK calls backend_errored() and
 # locks the worker into a permanent error state. Patterns must
 # therefore only match conditions where the api-wrapper genuinely
 # cannot serve any request — supervisord restarts on uvicorn exit, so
 # a real failure self-heals rather than dragging the worker down.
 #
 # Notably *not* matched here:
 #   - per-request errors (PreprocessWorker failures, ComfyUI workflow
 #     validation, "Value not in list:") — one malformed client payload
 #     would otherwise kill the worker
 #   - "CUDA out of memory" — surfaces both as a misconfigured GPU
 #     (which the benchmark-failure path already catches via
 #     backend_errored) and as a too-greedy client request, which is
 #     indistinguishable from a substring match
 #   - convert-workflows.sh warnings — that script is not load-bearing
 #     for serving
 MODEL_ERROR_LOG_MSGS = [
-    "MetadataIncompleteBuffer",
+    "BACKENDS_READY_TIMEOUT",       # backends never reachable
-    "Value not in list: ",
+    "BACKEND_UNRECOVERABLE",        # CUDA fault latched per backend
-    "[ERROR] Provisioning Script failed"
+    "Application startup failed",   # uvicorn ASGI lifespan startup failed
 ]
-MODEL_INFO_LOG_MSGS = [
+# LogAction.Info is purely informational (echoes log lines into the vast
-    '"message":"Downloading'
+# console). Nothing in api-wrapper.log is currently worth surfacing —
-]
+# model downloads are upstream in provisioning, per-request logs are
 # too noisy.
 MODEL_INFO_LOG_MSGS = []
-benchmark_prompts = [
+# Benchmark assets shipped alongside this worker. Resolved relative to this
-    "Cartoon hoodie hero; orc, anime cat, bunny; black goo; buff; vector on white.",
+# file so the worker keeps working regardless of the launch cwd.
-    "Cozy farming-game scene with fine details.",
+MISC_DIR       = Path(__file__).parent / "misc"
-    "2D vector child with soccer ball; airbrush chrome; swagger; antique copper.",
+BENCHMARK_FILE = MISC_DIR / "benchmark.json"
-    "Realistic futuristic downtown of low buildings at sunset.",
+TEST_PROMPTS   = MISC_DIR / "test_prompts.txt"
-    "Perfect wave front view; sunny seascape; ultra-detailed water; artful feel.",
+
-    "Clear cup with ice, fruit, mint; creamy swirls; fluid-sim CGI; warm glow.",
+# Well-known location maintained by the vast.ai ComfyUI base image.
-    "Male biker with backpack on motorcycle; oilpunk; award-worthy magazine cover.",
+# convert-workflows.sh symlinks this to the first provisioned workflow,
-    "Collage for textile; surreal cartoon cat in cap/jeans before poster; crisp.",
+# letting the base image work out-of-the-box without any env var.
-    "Medieval village inside glass sphere; volumetric light; macro focus.",
+WELLKNOWN_BENCHMARK = Path("/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json")
-    "Iron Man with glowing axe; mecha sci-fi; jungle scene; dynamic light.",
+
-    "Pope Francis DJ in leather jacket, mixing on giant console; dramatic.",
+log = logging.getLogger(__name__)
-]
+
 # Used when test_prompts.txt is unreadable or empty. Bare and generic
 # on purpose — this is a benchmark seed, not a creative output.
 _FALLBACK_PROMPT = "a still life on a wooden table, soft daylight"
 def _env_int(name: str, default: int) -> int:
    """Read an integer env var, warning + falling back on bad values."""
    raw = os.getenv(name)
    if raw is None or raw == "":
        return default
    try:
        return int(raw)
    except ValueError:
        log.warning("ignoring %s=%r (not an int); using default %d", name, raw, default)
        return default
-benchmark_dataset = [
+
-    {
+def _try_load_workflow(path: Path) -> dict | None:
    """Load and return a benchmark workflow from ``path``.
    Returns None on any failure (path missing, not a regular file,
    unreadable, invalid JSON) so the caller can fall through to the
    next tier rather than dropping straight to the SD1.5 default.
    """
    if not path.is_file():
        return None
    try:
        with open(path) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        log.warning("Failed to load %s: %s; trying next tier", path, e)
        return None
 def _custom_workflow_payload() -> dict | None:
    """Try each benchmark workflow tier in order; return the first one
    that loads cleanly as a payload, or None if every tier is absent /
    unreadable. Tiers (in order): in-tree ``misc/benchmark.json``,
    ``$BENCHMARK_JSON_PATH``, well-known base-image symlink.
    """
    env_path = os.getenv("BENCHMARK_JSON_PATH")
    candidates = [("misc", BENCHMARK_FILE)]
    if env_path:
        candidates.append(("env", Path(env_path)))
    candidates.append(("well-known", WELLKNOWN_BENCHMARK))
    for label, path in candidates:
        # Surface a warning specifically when the operator pointed
        # BENCHMARK_JSON_PATH at something we can't use — silent
        # fall-through there is a footgun (typo => SD1.5 fallback,
        # operator wonders why custom benchmark didn't take).
        if not path.is_file():
            if label == "env":
                log.warning(
                    "BENCHMARK_JSON_PATH=%s is not a readable file; trying fallbacks", path
                )
            continue
        workflow = _try_load_workflow(path)
        if workflow is None:
            continue
        log.info("Using custom benchmark workflow from %s (%s)", path, label)
        return {
            "input": {
                "request_id": f"test-{random.randint(1000, 99999)}",
                "workflow_json": workflow,
            }
        }
    return None
 def _load_prompts() -> list[str]:
    """Read misc/test_prompts.txt; defensive against missing/empty file."""
    try:
        with open(TEST_PROMPTS) as f:
            prompts = [line.strip() for line in f if line.strip()]
    except OSError as e:
        log.warning("could not read %s: %s; using built-in fallback prompt", TEST_PROMPTS, e)
        return [_FALLBACK_PROMPT]
    if not prompts:
        log.warning("%s is empty; using built-in fallback prompt", TEST_PROMPTS)
        return [_FALLBACK_PROMPT]
    return prompts
 def _default_payload() -> dict:
    """Build the SD1.5 Text2Image fallback payload."""
    prompts = _load_prompts()
    return {
        "input": {
            "request_id": f"test-{random.randint(1000, 99999)}",
            "modifier": "Text2Image",
            "modifications": {
-                "prompt": prompt,
+                "prompt": random.choice(prompts),
-                "width": 512,
+                "width":  _env_int("BENCHMARK_TEST_WIDTH",  512),
-                "height": 512,
+                "height": _env_int("BENCHMARK_TEST_HEIGHT", 512),
-                "steps": 20,
+                "steps":  _env_int("BENCHMARK_TEST_STEPS",  20),
-                "seed": random.randint(0, sys.maxsize)
+                "seed":   random.randint(0, sys.maxsize),
            }
        }
-    } for prompt in benchmark_prompts
+    }
-]
+
 def make_benchmark_payload() -> dict:
    """Build one benchmark request payload.
    Called once per benchmark run by the SDK; using a generator (rather
    than a static ``dataset=``) lets each run re-pick a prompt and re-roll
    the seed, and avoids holding multiple copies of a large workflow JSON
    in memory.
    """
    return _custom_workflow_payload() or _default_payload()
 worker_config = WorkerConfig(
    model_server_url=MODEL_SERVER_URL,
@@ -67,7 +232,7 @@ worker_config = WorkerConfig(
            allow_parallel_requests=False,
            max_queue_time=10.0,
            benchmark_config=BenchmarkConfig(
-                dataset=benchmark_dataset,
+                generator=make_benchmark_payload,
            )
        )
    ],
@@ -8,14 +8,13 @@ This is the base PyWorker for OpenAI compatible inference servers.  See the [Ser
 This worker is compatible with any backend API that properly implements the `/v1/completions` and `/v1/chat/completions` endpoints.  We currently have three templates you can choose from but you can also create your own without having to modify the PyWorker.
- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20%2B%20Qwen%2FQwen3-8B%20(Serverless)) (recommended)
+- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20(Serverless)) (recommended)
 - [Ollama](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=Ollama%20%2B%20Qwen3%3A32b%20(Serverless))
 - [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20%2B%20Qwen3-8B%20(Serverless))
 All of these templates can be configured via the template interface.  You may want to change the model or startup arguments, depending on the template you selected.
-2. Follow the [getting started guide](https://docs.vast.ai/serverless/getting-started) for help with configuring your serverless setup.  For testing, we recommend that you use the default options presented by the web interface.
+2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup.  For testing, we recommend that you use the default options presented by the web interface.
 ## Client Setup (Demo)
@@ -34,38 +33,20 @@ uv pip install -r requirements.txt
 Several examples have been provided in the client to help you get started with your own implementation.
-### Completions
+First, set your API key as an environment variable:
 Call to `/v1/completions` with json response
 ```bash
-python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --completion --model <MODEL_NAME>
+export VAST_API_KEY=<your_api_key>
 ```
-### Chat Completion (json)
+The `--model` and `--endpoint` flags are optional. If not provided, they default to `Qwen/Qwen3-8B` and `my-vllm-endpoint` respectively.
 Call to `/v1/chat/completions` with json response
 ```bash
 python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat --model <MODEL_NAME>
 ```
 ### Chat Completion (streaming)
 Call to `/v1/chat/completions` with streaming response
 ```bash
-python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat-stream --model <MODEL_NAME>
+python -m workers.openai.client --chat-stream --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Tool Use (json)
 Call to `/v1/chat/completions` with tool and json response.
 This test defines a simple tool which will list the contents of the local pyworker directory.  The output is then analysed by the model.
 ```bash
 python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --tools --model <MODEL_NAME>
 ```
 ### Interactive Chat (streaming)
@@ -75,6 +56,32 @@ Interactive session with calls to `/v1/chat/completions`.
 Type `clear` to clear the chat history or `quit` to exit.
 ```bash
-python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --interactive --model <MODEL_NAME>
+python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Chat Completion (json)
 Call to `/v1/chat/completions` with json response
 ```bash
 python -m workers.openai.client --chat --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Tool Use (json)
 Call to `/v1/chat/completions` with tool and json response.
 This test defines a simple tool which will list the contents of the local pyworker directory.  The output is then analysed by the model.
 ```bash
 python -m workers.openai.client --tools --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Completions
 Call to `/v1/completions` with json response
 ```bash
 python -m workers.openai.client --completion --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
@@ -18,7 +18,7 @@ logging.basicConfig(
 log = logging.getLogger(__file__)
 # ---------------------- Prompts ----------------------
-COMPLETIONS_PROMPT = "the capital of USA is"
+COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
 CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
 TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
@@ -97,9 +97,9 @@ def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[
 # ---- OpenAI-compatible calls (non-streaming) ----
-async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]:
+async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "model": model,
@@ -111,9 +111,9 @@ async def call_completions(client: Serverless, *, model: str, prompt: str, **kwa
    resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"])
    return resp["response"]
-async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
+async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "model": model,
@@ -128,9 +128,9 @@ async def call_chat_completions(client: Serverless, *, model: str, messages: Lis
    return resp["response"]
 # ---- Streaming variants ----
-async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs):
+async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs):
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "model": model,
@@ -144,9 +144,9 @@ async def stream_completions(client: Serverless, *, model: str, prompt: str, **k
    resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"], stream=True)
    return resp["response"]  # async generator
-async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs):
+async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs):
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
        "model": model,
@@ -166,9 +166,10 @@ async def stream_chat_completions(client: Serverless, *, model: str, messages: L
 class APIDemo:
    """Demo and testing functionality for the API client"""
-    def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None):
+    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
        self.client = client
        self.model = model
        self.endpoint_name = endpoint_name
        self.tool_manager = tool_manager or ToolManager()
    # ----- Streaming handler -----
@@ -177,11 +178,16 @@ class APIDemo:
        reasoning_content = ""
        printed_reasoning = False
        printed_answer = False
        finish_reason = None
        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})
            # Track finish reason
            if choice.get("finish_reason"):
                finish_reason = choice.get("finish_reason")
            # reasoning tokens
            rc = delta.get("reasoning_content")
            if rc and show_reasoning:
@@ -211,6 +217,8 @@ class APIDemo:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if printed_answer:
                print(f"Response tokens: {len(full_response.split())}")
            if finish_reason:
                print(f"Finish reason: {finish_reason}")
        return full_response
@@ -223,6 +231,7 @@ class APIDemo:
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -241,6 +250,7 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -253,6 +263,7 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -279,6 +290,7 @@ class APIDemo:
                client=self.client,
                model=self.model,
                messages=messages,
                endpoint_name=self.endpoint_name,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10
@@ -304,6 +316,7 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
@@ -381,6 +394,7 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -419,7 +433,6 @@ class APIDemo:
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
        print(f"Using model: {self.model}")
        print("Type 'quit' to exit, 'clear' to clear history")
        print()
@@ -446,6 +459,7 @@ class APIDemo:
                    client=self.client,
                    model=self.model, 
                    messages=messages,
                    endpoint_name=self.endpoint_name,
                    max_tokens=MAX_TOKENS, 
                    temperature=0.7
                )
@@ -465,8 +479,8 @@ class APIDemo:
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
-    p.add_argument("--model", required=True, help="Model to use for requests (required)")
+    p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
-    p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)")
+    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
@@ -494,12 +508,14 @@ async def main_async():
        print("Please specify exactly one test mode")
        sys.exit(1)
    print(f"Using model: {args.model}")
    print("=" * 60)
    print(f"Using model: {args.model}")
    print(f"Using endpoint: {args.endpoint}")
    try:
        async with Serverless() as client:
-            demo = APIDemo(client, args.model, ToolManager())
+            demo = APIDemo(client, args.model, args.endpoint, ToolManager())
            if args.completion:
                await demo.demo_completions()
@@ -28,6 +28,12 @@ MODEL_INFO_LOG_MSGS = [
 nltk.download("words")
 WORD_LIST = nltk.corpus.words.words()
 def request_parser(request):
    data = request
    if request.get("input") is not None:
        data = request.get("input")
    return data
 def completions_benchmark_generator() -> dict:
    prompt = " ".join(random.choices(WORD_LIST, k=int(250)))
@@ -54,18 +60,20 @@ worker_config = WorkerConfig(
            route="/v1/completions",
            workload_calculator= lambda data: data.get("max_tokens", 0),
            allow_parallel_requests=True,
-            max_queue_time=60.0,
+            request_parser=request_parser,
            max_queue_time=600.0,
            benchmark_config=BenchmarkConfig(
                generator=completions_benchmark_generator,
-                concurrency=100,
+                concurrency=10,
-                runs=2
+                runs=3
            )
        ),
        HandlerConfig(
            route="/v1/chat/completions",
            workload_calculator= lambda data: data.get("max_tokens", 0),
            allow_parallel_requests=True,
-            max_queue_time=60.0,
+            request_parser=request_parser,
            max_queue_time=600.0,
        )
    ],
    log_action_config=LogActionConfig(
@@ -1,19 +1,103 @@
-This is the base PyWorker for TGI, designed to create PyWorkers that can utilize various LLMs. It offers two primary endpoints:
+# HuggingFace TGI PyWorker
-1. `generate`: Generates the LLM's response to a given prompt in a single request.
+This is the base PyWorker for HuggingFace Text Generation Inference (TGI) servers. See the [Serverless documentation](https://docs.vast.ai/serverless) for guides and how-to's.
 2. `generate_stream`: Streams the LLM's response token by token.
-Both endpoints use the following API payload format:
+## Instance Setup
 1. Pick a template
 This worker is compatible with any TGI backend. We have a template you can use or you can create your own.
 - [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20(Serverless))
 The template can be configured via the template interface. You may want to change the model or startup arguments.
 2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface.
 ## Client Setup (Demo)
 1. Clone the PyWorker repository to your local machine and install the necessary requirements for running the test client.
 ```bash
 git clone https://github.com/vast-ai/pyworker
 cd pyworker
 pip install uv
 uv venv -p 3.12
 source .venv/bin/activate
 uv pip install -r requirements.txt
 ```
 ## Using the Test Client
 The test client demonstrates both streaming and non-streaming generation using TGI's native API.
 First, set your API key as an environment variable:
 ```bash
 export VAST_API_KEY=<your_api_key>
 ```
 The `--endpoint` flag is optional. If not provided, it defaults to `my-tgi-endpoint`.
 ### Generate (Streaming)
 Call to `/generate_stream` with streaming response:
 ```bash
 python -m workers.tgi.client --generate-stream --endpoint <ENDPOINT_NAME>
 ```
 ### Generate (Non-Streaming)
 Call to `/generate` with json response:
 ```bash
 python -m workers.tgi.client --generate --endpoint <ENDPOINT_NAME>
 ```
 ### Interactive Session (Streaming)
 Interactive session with streaming responses. Type `quit` to exit.
 ```bash
 python -m workers.tgi.client --interactive --endpoint <ENDPOINT_NAME>
 ```
 ## API Endpoints
 TGI provides two primary endpoints:
 ### Generate (Non-Streaming)
 `/generate` - Returns the complete response in a single request.
 ```json
 {
-  "inputs": "PROMPT",
+  "inputs": "Your prompt here",
  "parameters": {
-    "max_new_tokens": 250
+    "max_new_tokens": 1024,
    "temperature": 0.7,
    "return_full_text": false
  }
 }
 ```
-Note that the max_new_tokens parameter, rather than the prompt size, impacts performance. For example, if an
+### Generate Stream (Streaming)
-instance is benchmarked to process 100 tokens per second, a request with max_new_tokens = 200 will take
+
-approximately 2 seconds to complete.
+`/generate_stream` - Streams the response token by token.
 ```json
 {
  "inputs": "Your prompt here",
  "parameters": {
    "max_new_tokens": 1024,
    "temperature": 0.7,
    "do_sample": true,
    "return_full_text": false
  }
 }
 ```
 ## Performance Notes
 The `max_new_tokens` parameter (not the prompt size) primarily impacts performance. For example, if an instance is benchmarked to process 100 tokens per second, a request with `max_new_tokens = 200` will take approximately 2 seconds to complete.
@@ -1,61 +1,222 @@
 import logging
 import json
 import os
 import sys
 import argparse
 from vastai import Serverless
 import asyncio
-ENDPOINT_NAME = "my-tgi-endpoint" # Change this to match your endpoint name
+# ---------------------- Logging ----------------------
 logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s[%(levelname)-5s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
 )
 log = logging.getLogger(__file__)
 # ---------------------- Defaults ----------------------
 DEFAULT_PROMPT = "Think step by step: Tell me about the Python programming language."
 ENDPOINT_NAME = "TGI-Prod2"       # change this to your TGI endpoint name
 MAX_TOKENS = 1024
-PROMPT = "Think step by step: Tell me about the Python programming language."
+DEFAULT_TEMPERATURE = 0.7
-async def call_generate(client: Serverless) -> None:
+
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+# ---------------------- API Calls ----------------------
 async def call_generate(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs) -> dict:
    """Non-streaming generation via /generate endpoint"""
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
-        "inputs": PROMPT,
+        "inputs": prompt,
        "parameters": {
-            "max_new_tokens": MAX_TOKENS,
+            "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS),
-            "temperature": 0.7,
+            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
-            "return_full_text": False
+            "return_full_text": False,
        }
    }
-
+    log.debug("POST /generate %s", json.dumps(payload)[:500])
-    resp = await endpoint.request("/generate", payload, cost=MAX_TOKENS)
+    resp = await endpoint.request("/generate", payload, cost=payload["parameters"]["max_new_tokens"])
-
+    return resp["response"]
    print(resp["response"]["generated_text"])
-async def call_generate_stream(client: Serverless) -> None:
+async def call_generate_stream(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs):
-    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
+    """Streaming generation via /generate_stream endpoint"""
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
-        "inputs": PROMPT,
+        "inputs": prompt,
        "parameters": {
-            "max_new_tokens": MAX_TOKENS,
+            "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS),
-            "temperature": 0.7,
+            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
            "do_sample": True,
            "return_full_text": False,
        }
    }
-
+    log.debug("STREAM /generate_stream %s", json.dumps(payload)[:500])
    resp = await endpoint.request(
        "/generate_stream",
        payload,
-        cost=MAX_TOKENS,
+        cost=payload["parameters"]["max_new_tokens"],
        stream=True,
    )
-    stream = resp["response"]
+    return resp["response"]  # async generator
 # ---------------------- Demo Runner ----------------------
 class APIDemo:
    """Demo and testing functionality for the TGI API client"""
    def __init__(self, client: Serverless, endpoint_name: str):
        self.client = client
        self.endpoint_name = endpoint_name
    async def handle_streaming_response(self, stream) -> str:
        """Process streaming response and print tokens"""
        full_response = ""
        printed_answer = False
        async for event in stream:
            tok = (event.get("token") or {}).get("text")
            if tok:
                if not printed_answer:
                    printed_answer = True
-                print("Answer:\n", end="", flush=True)
+                    print("\n💬 Response: ", end="", flush=True)
                print(tok, end="", flush=True)
                full_response += tok
-async def main():
+        print()  # newline
        if printed_answer:
            print(f"\nStreaming completed. Response tokens: {len(full_response.split())}")
        return full_response
    async def demo_generate(self) -> None:
        """Demo non-streaming generation"""
        print("=" * 60)
        print("GENERATE DEMO (NON-STREAMING)")
        print("=" * 60)
        response = await call_generate(
            client=self.client,
            endpoint_name=self.endpoint_name,
            prompt=DEFAULT_PROMPT,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        print(f"\n💬 Response: {response.get('generated_text', '')}")
        print(f"\nFull Response:\n{json.dumps(response, indent=2)}")
    async def demo_generate_stream(self) -> None:
        """Demo streaming generation"""
        print("=" * 60)
        print("GENERATE DEMO (STREAMING)")
        print("=" * 60)
        stream = await call_generate_stream(
            client=self.client,
            endpoint_name=self.endpoint_name,
            prompt=DEFAULT_PROMPT,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        try:
            await self.handle_streaming_response(stream)
        except Exception as e:
            log.error("\nError during streaming: %s", e, exc_info=True)
    async def interactive_chat(self) -> None:
        """Interactive session with streaming generation"""
        print("=" * 60)
        print("INTERACTIVE STREAMING SESSION")
        print("=" * 60)
        print(f"Using endpoint: {self.endpoint_name}")
        print("Type 'quit' to exit")
        print()
        while True:
            try:
                user_input = input("You: ").strip()
                if user_input.lower() == "quit":
                    print("👋 Goodbye!")
                    break
                elif not user_input:
                    continue
                print("Assistant: ", end="", flush=True)
                stream = await call_generate_stream(
                    client=self.client,
                    endpoint_name=self.endpoint_name,
                    prompt=user_input,
                    max_tokens=MAX_TOKENS,
                    temperature=DEFAULT_TEMPERATURE,
                )
                full_response = ""
                async for event in stream:
                    tok = (event.get("token") or {}).get("text")
                    if tok:
                        print(tok, end="", flush=True)
                        full_response += tok
                print()  # newline
            except KeyboardInterrupt:
                print("\n👋 Session interrupted. Goodbye!")
                break
            except Exception as e:
                log.error("\nError: %s", e)
                continue
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast TGI Demo (Serverless SDK)")
    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--generate", action="store_true", help="Test generate endpoint (non-streaming)")
    modes.add_argument("--generate-stream", action="store_true", help="Test generate endpoint with streaming")
    modes.add_argument("--interactive", action="store_true", help="Start interactive streaming session")
    return p
 async def main_async():
    args = build_arg_parser().parse_args()
    selected = sum([args.generate, args.generate_stream, args.interactive])
    if selected == 0:
        print("Please specify exactly one test mode:")
        print("  --generate        : Test generate endpoint (non-streaming)")
        print("  --generate-stream : Test generate endpoint with streaming")
        print("  --interactive     : Start interactive streaming session")
        print(f"\nExample: python {os.path.basename(sys.argv[0])} --generate-stream --endpoint my-tgi-endpoint")
        sys.exit(1)
    elif selected > 1:
        print("Please specify exactly one test mode")
        sys.exit(1)
    print("=" * 60)
    print(f"Using endpoint: {args.endpoint}")
    try:
        async with Serverless() as client:
-        await call_generate(client)
+            demo = APIDemo(client, args.endpoint)
-        await call_generate_stream(client)
+
            if args.generate:
                await demo.demo_generate()
            elif args.generate_stream:
                await demo.demo_generate_stream()
            elif args.interactive:
                await demo.interactive_chat()
    except Exception as e:
        log.error("Error during test: %s", e, exc_info=True)
        sys.exit(1)
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main_async())
@@ -35,7 +35,7 @@ def benchmark_generator() -> dict:
    benchmark_data = {
        "inputs": prompt,
        "parameters": {
-            "max_new_tokens": 128,
+            "max_new_tokens": 500,
            "temperature": 0.7,
            "return_full_text": False
        }
@@ -52,17 +52,18 @@ worker_config = WorkerConfig(
        HandlerConfig(
            route="/generate",
            allow_parallel_requests=True,
-            max_queue_time=60.0,
+            max_queue_time=600.0,
            benchmark_config=BenchmarkConfig(
                generator=benchmark_generator,
-                concurrency=50
+                concurrency=10,
                runs=3
            ),
            workload_calculator= lambda x: x["parameters"]["max_new_tokens"]
        ),
        HandlerConfig(
            route="/generate_stream",
            allow_parallel_requests=True,
-            max_queue_time=60.0,
+            max_queue_time=600.0,
            workload_calculator= lambda x: x["parameters"]["max_new_tokens"]
        )
    ],
@@ -2,7 +2,7 @@
 This is the PyWorker implementation for running **Wan 2.2 T2V A14B** text-to-video workflows in ComfyUI. It provides a unified interface for executing complete ComfyUI video-generation workflows through a proxy-based architecture and returning generated video assets.
-Each request has a static cost of `100`. ComfyUI does not support concurrent workloads, and there is no provision to run multiple ComfyUI instances per worker node.
+Each request has a static cost of `10000`. ComfyUI does not support concurrent workloads, and there is no provision to run multiple ComfyUI instances per worker node.
 ## Requirements
Author	SHA1	Message	Date
Rob Ballantyne	b52c654f09	comfyui-json: key readiness off api-wrapper's BACKENDS_READY token Rather than tailing for "Uvicorn running on", which only confirms the api-wrapper's own HTTP listener is bound, watch for the api-wrapper's new structured tokens that reflect actual end-to-end reachability: MODEL_LOAD_LOG_MSG = ["BACKENDS_READY"] MODEL_ERROR_LOG_MSGS includes: - "BACKENDS_READY_TIMEOUT" (backends never came up) - "BACKEND_UNRECOVERABLE" (CUDA fault latched on a backend) - "Application startup failed" (kept; uvicorn's own ASGI failure) Closes the race observed on a live test where the pyworker fired benchmark the moment uvicorn bound, every request inside the api-wrapper hit Cannot-connect-to-host on ComfyUI, and the SDK counted the resulting fast 502s as a fast worker (perf=200). Tokens are emitted by ai-dock/comfyui-api-wrapper#11 and onward; earlier wrapper versions won't emit BACKENDS_READY so warm-up stalls indefinitely — pin to a wrapper that includes that change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-08 09:46:45 +01:00
Rob Ballantyne	a5bcc3de5e	comfyui-json: address PR #85 review Five issues raised by Copilot's review: 1. _resolve_benchmark_path's docstring/README claim that a set-but- broken BENCHMARK_JSON_PATH falls through to the well-known tier, but the implementation only handled "file missing". A path pointing at a directory or holding malformed JSON dropped straight to the SD1.5 fallback without consulting tier 3. Replaced with a true tiered try-and-load: walk (misc, env, well-known), attempt to load each, and fall through to the next on any failure (missing, not a regular file, unreadable, invalid JSON). The env-var case still surfaces a warning so a typo doesn't fail silently. 2. int(os.getenv("BENCHMARK_TEST_WIDTH", ...)) crashed on non-int values. Added _env_int helper that warns + returns default on ValueError. Empty string also handled. 3. random.choice([]) on an empty test_prompts.txt raised IndexError. _load_prompts now warns + uses a built-in _FALLBACK_PROMPT when the file is missing or yields no non-blank lines. 4. README already claimed "missing or unreadable" fall-through; the refactor in (1) makes the code match. No README change needed. 5. test_prompts.txt restored verbatim from the pre-rewrite tree carried real-person and IP-laden prompts (Pope Francis, Iron Man, Luke Skywalker, "Disney socialite"). Used automatically during warm-up they're a reputational/safety-filter risk for the worker. Replaced with generic equivalents that exercise the same workload characteristics (1 elderly figure on motorcycle, 1 armoured hero with axe, etc.). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 18:25:21 +01:00
Rob Ballantyne	cecf0236fa	comfyui-json: watch api-wrapper.log for readiness Switch MODEL_LOG_FILE from /var/log/portal/comfyui.log to /var/log/portal/api-wrapper.log and MODEL_LOAD_LOG_MSG to "Uvicorn running on". A live test instance showed the previous setup firing benchmark on ComfyUI's "To see the GUI go to:" line, which races api-wrapper.sh: that script runs convert-workflows.sh (which itself waits for ComfyUI ready and then converts workflows for several seconds) before launching uvicorn. The benchmark hit a closed port on :18288 and the SDK's __call_backend has no retry on connection refused, locking the worker into a permanent error state. Watching the api-wrapper log instead means the benchmark only fires after uvicorn is bound and the pyworker_benchmark.json symlink is already in place — no SDK changes required. Trim MODEL_ERROR_LOG_MSGS down to "Application startup failed". The old patterns were ComfyUI-specific (won't appear in api-wrapper.log) and dangerous: ModelError is fatal, so "Value not in list:" matching on an api-wrapper-style log would let one malformed client request kill the worker. CUDA OOM is similarly off-limits (indistinguishable from a too-greedy client request via substring match; the benchmark- failure path already catches model-load OOM at boot). Empty MODEL_INFO_LOG_MSGS — the prior ComfyUI download pattern can never match this log file. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 12:46:17 +01:00
Rob Ballantyne	09917a9c88	Revert "Wait briefly for the well-known benchmark symlink" This reverts commit `9d7371ddba`.	2026-05-07 12:03:19 +01:00
Rob Ballantyne	9d7371ddba	Wait briefly for the well-known benchmark symlink The pyworker and convert-workflows.sh both unblock when ComfyUI is ready, but conversion takes a few seconds longer — without a wait, the first benchmark loses the race and silently drops to the SD1.5 fallback. Wait up to BENCHMARK_WAIT_TIMEOUT (default 30s) for the symlink before giving up. The wait fires only when we're actually about to use the well-known tier (env var / misc/ paths short-circuit), only once per process, and is skipped entirely off the base image (parent directory absent), so non-base-image deployments don't pay the timeout. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 11:59:30 +01:00
Rob Ballantyne	381a39f201	Add well-known fallback path for benchmark.json Read /opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json when neither misc/benchmark.json nor $BENCHMARK_JSON_PATH yields a usable file. The vast.ai ComfyUI base image's convert-workflows.sh maintains that path as a symlink to the first provisioned workflow, so on that image the operator does not need to set BENCHMARK_JSON_PATH at all. A set-but-broken $BENCHMARK_JSON_PATH now warns and falls through to the well-known path instead of dropping straight to the SD1.5 fallback, so a typo in the env var doesn't mask an otherwise-working benchmark. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 11:54:20 +01:00
Rob Ballantyne	a634ba07a6	Support BENCHMARK_JSON_PATH for provisioning-supplied benchmarks start_server.sh clones pyworker into /workspace/vast-pyworker after the provisioning phase has run, so a provisioning script that wants to ship a custom benchmark workflow cannot write to misc/benchmark.json — that path doesn't exist yet at provisioning time, and pre-creating it would make the subsequent clone fail. Allow provisioning to drop the workflow anywhere (e.g. /workspace) and point the worker at it via the BENCHMARK_JSON_PATH env var. The in-tree file still takes precedence (so forks with a baked-in benchmark keep working unchanged); the env var is consulted only as a second choice, and a misconfigured path logs a warning rather than silently degrading to the SD1.5 fallback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 11:24:14 +01:00
Rob Ballantyne	2dd4f7fc38	Restore benchmark.json loading in comfyui-json worker The "Use PyWorker SDK" rewrite (`4380d98`) replaced the dynamic ComfyWorkflowData.for_test() benchmark logic with a hardcoded list of 11 SD1.5 Text2Image payloads, dropped misc/benchmark.json.example and misc/test_prompts.txt, and stopped honouring the BENCHMARK_TEST_* environment variables. The README's documented behaviour (custom workflow via benchmark.json, env-var-tuned fallback) had no implementation behind it. Restore the original two-tier behaviour against the new SDK by passing BenchmarkConfig(generator=make_benchmark_payload) instead of a static dataset, splitting the load logic into a custom-workflow path and a fallback path, and re-shipping the misc/ assets. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-07 11:06:34 +01:00
Lucas Armand	9bc9ba11c5	Increase TGI benchmark tokens to 500	2026-04-30 14:04:39 -07:00
LucasArmandVast	48fdc65e3d	Update to vastai package (#84 )	2026-04-14 10:41:31 -07:00
LucasArmandVast	2cd97315cd	Add nltk requirement for openai worker (#83 ) * Add nltk requirement for openai worker * pin version	2026-04-13 11:30:06 -07:00
Lucas Armand	83c31e25a9	Add force update detection	2026-03-31 13:46:22 -07:00
Lucas Armand	fbe1dca6fa	more env_path fixes	2026-03-30 16:28:51 -07:00
Lucas Armand	4c3120dbc5	allow override env_path	2026-03-30 16:25:01 -07:00
Lucas Armand	d7d9b915f6	allow break system packages	2026-03-30 16:09:17 -07:00
Lucas Armand	4660b337fb	Check for USE_SYSTEM_PYTHON	2026-03-30 14:46:38 -07:00
edgaratvast	7506ecb6b5	directly invoke one stop shop setup executable exported by vastai pip package for deployments (#82 )	2026-03-26 10:59:49 -07:00
LucasArmandVast	50633c5003	Update deployments script with retries. (#81 )	2026-03-23 14:58:32 -07:00
LucasArmandVast	2e8f18276f	Add beta deployments script (#80 )	2026-03-23 14:14:06 -07:00
Scott Darden	eba9c480eb	Merge pull request #79 from vast-ai/update-requirements Updated requirements to only require vastai-sdk	2026-01-14 12:07:33 -08:00
Lucas Armand	aaca1c9645	Updated requirements to only require vastai-sdk	2026-01-14 10:47:07 -08:00
LucasArmandVast	f319db6bd5	flag for model log rotate (#78 )	2026-01-12 17:03:18 -08:00
LucasArmandVast	4d786b4d17	SDK Versioning Improvements (#77 ) * Add SDK_BRANCH	2026-01-02 10:23:07 -08:00
LucasArmandVast	bd3e0032a1	Add SDK version checking (#76 )	2025-12-17 21:01:52 -08:00
Lucas Armand	e02f4bc943	Lowered concurrency of vLLM and TGI benchmarks	2025-12-17 11:55:33 -08:00
Lucas Armand	bcb04b9a32	add missing comma	2025-12-17 11:40:40 -08:00
Lucas Armand	9daf171487	Increase queue limits for vLLM and TGI	2025-12-17 11:38:55 -08:00
LucasArmandVast	29f836eb1a	Backwards compatible vLLM payload (#75 ) * Support old vLLM payloads	2025-12-15 19:58:02 -08:00
LucasArmandVast	4380d98c01	Use PyWorker SDK (#67 ) * Change PyWorker to Worker SDK * Moved /lib to vast-sdk (https://github.com/vast-ai/vast-sdk)	2025-12-15 19:33:03 -08:00
Abiola Akinnubi	2ce741a8b7	Merge pull request #74 from vast-ai/AUTO-912 Mark pyworkers as "Error" if startup script fails. to avoid silent fail that waits for autoscaler.	2025-12-11 17:05:13 -08:00
Abiola Akinnubi	4ecc07032f	Mark pyworkers as "Error" if startup script fails. to avoid silent fail that waits for autoscaler.	2025-12-11 12:51:56 -08:00
edgaratvast	df61e6e946	correct version pin for aiohttp (#73 ) Co-authored-by: Edgar Lin <edgarlin2000@gmail.com>	2025-12-10 19:34:52 -08:00
LucasArmandVast	70f8a8f534	Merge pull request #72 from vast-ai/hotfix-pin-pycares Hotfix: pin pycares	2025-12-10 20:41:44 -05:00
Lucas Armand	7be8aa6397	pin pycares	2025-12-10 17:38:03 -08:00
Colter-Downing	138fc3ac47	Merge pull request #71 from vast-ai/AUTO-comfyui-updates Auto comfyui updates	2025-12-04 10:55:12 -08:00
Colter Downing	222ac2a0dd	default endpoint name	2025-12-04 10:54:55 -08:00
Colter Downing	40aed9b5f8	adding s3 as an option	2025-12-04 10:52:57 -08:00
Colter Downing	d4d36bf86e	done with comfy updates	2025-12-03 20:45:55 -08:00
Colter Downing	e839cfc6e8	include view in API wrapper	2025-12-03 20:22:45 -08:00
Colter Downing	f04138e13b	update to be able to get images	2025-12-03 20:16:25 -08:00
Colter-Downing	de3aa87c8f	Merge pull request #70 from vast-ai/AUTO-tgi-client-edits update tgi client	2025-12-03 18:40:01 -08:00
Colter Downing	6b5b1341a7	update tgi client	2025-12-03 18:38:42 -08:00
Colter-Downing	8be92c03de	Merge pull request #69 from vast-ai/AUTO-874--fix-openai-worker-client defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first	2025-12-03 16:59:56 -08:00
Colter Downing	adedb8ba90	defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present	2025-12-03 16:57:28 -08:00
LucasArmandVast	2f543c01ad	Merge pull request #68 from vast-ai/fix-vllm-concurrency Increase model wait time for vLLM	2025-12-03 16:13:51 -05:00
Lucas Armand	0bcd2219ea	Increase model wait time for vLLM	2025-12-03 12:38:52 -08:00