Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 944f83fc03 | |||
| f56bbc0ebe | |||
| 70d51bafe1 | |||
| 63909736bb | |||
| f4f7080df1 | |||
| d51a338e8f | |||
| 92a04bd7af | |||
| ec25dda3ad | |||
| 3786cf978d | |||
| a86d4bcf9c | |||
| e9b6a14a5e | |||
| cadac033e1 |
+18
-35
@@ -3,7 +3,8 @@
|
|||||||
set -e -o pipefail
|
set -e -o pipefail
|
||||||
|
|
||||||
WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
|
WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}"
|
||||||
SERVER_DIR="$WORKSPACE_DIR/worker"
|
|
||||||
|
SERVER_DIR="$WORKSPACE_DIR/vast-pyworker"
|
||||||
ENV_PATH="$WORKSPACE_DIR/worker-env"
|
ENV_PATH="$WORKSPACE_DIR/worker-env"
|
||||||
DEBUG_LOG="$WORKSPACE_DIR/debug.log"
|
DEBUG_LOG="$WORKSPACE_DIR/debug.log"
|
||||||
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
|
PYWORKER_LOG="$WORKSPACE_DIR/pyworker.log"
|
||||||
@@ -21,23 +22,24 @@ function echo_var(){
|
|||||||
echo "$1: ${!1}"
|
echo "$1: ${!1}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Updated validation - BACKEND is no longer required, but MODEL_LOG and HF_TOKEN still are
|
[ -z "$BACKEND" ] && echo "BACKEND must be set!" && exit 1
|
||||||
[ -z "$MODEL_LOG" ] && echo "MODEL_LOG must be set!" && exit 1
|
[ -z "$MODEL_LOG" ] && echo "MODEL_LOG must be set!" && exit 1
|
||||||
[ -z "$HF_TOKEN" ] && echo "HF_TOKEN must be set!" && exit 1
|
[ -z "$HF_TOKEN" ] && echo "HF_TOKEN must be set!" && exit 1
|
||||||
|
[ "$BACKEND" = "comfyui" ] && [ -z "$COMFY_MODEL" ] && echo "For comfyui backends, COMFY_MODEL must be set!" && exit 1
|
||||||
|
|
||||||
echo "start_server.sh - SDK Worker Version"
|
|
||||||
|
echo "start_server.sh"
|
||||||
date
|
date
|
||||||
|
|
||||||
|
echo_var BACKEND
|
||||||
echo_var REPORT_ADDR
|
echo_var REPORT_ADDR
|
||||||
echo_var WORKER_PORT
|
echo_var WORKER_PORT
|
||||||
echo_var WORKSPACE_DIR
|
echo_var WORKSPACE_DIR
|
||||||
|
echo_var SERVER_DIR
|
||||||
echo_var ENV_PATH
|
echo_var ENV_PATH
|
||||||
echo_var DEBUG_LOG
|
echo_var DEBUG_LOG
|
||||||
echo_var PYWORKER_LOG
|
echo_var PYWORKER_LOG
|
||||||
echo_var MODEL_LOG
|
echo_var MODEL_LOG
|
||||||
echo_var MODEL_SERVER_URL
|
|
||||||
echo_var PYWORKER_REPO
|
|
||||||
echo_var PYWORKER_REF
|
|
||||||
|
|
||||||
# Populate /etc/environment with quoted values
|
# Populate /etc/environment with quoted values
|
||||||
if ! grep -q "VAST" /etc/environment; then
|
if ! grep -q "VAST" /etc/environment; then
|
||||||
@@ -56,32 +58,16 @@ then
|
|||||||
source ~/.local/bin/env
|
source ~/.local/bin/env
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ ! -d $SERVER_DIR ]]; then
|
# Fork testing
|
||||||
echo "Cloning worker repository..."
|
[[ ! -d $SERVER_DIR ]] && git clone "${PYWORKER_REPO:-https://github.com/vast-ai/pyworker}" "$SERVER_DIR"
|
||||||
git clone --depth=1 "${PYWORKER_REPO:-https://github.com/vast-ai/pyworker}" "$SERVER_DIR"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ -n ${PYWORKER_REF:-} ]]; then
|
if [[ -n ${PYWORKER_REF:-} ]]; then
|
||||||
echo "Checking out ref: $PYWORKER_REF"
|
(cd "$SERVER_DIR" && git checkout "$PYWORKER_REF")
|
||||||
(
|
|
||||||
cd "$SERVER_DIR"
|
|
||||||
git fetch --depth=1 origin "$PYWORKER_REF"
|
|
||||||
git checkout "$PYWORKER_REF"
|
|
||||||
)
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
uv venv --python-preference only-managed "$ENV_PATH" -p 3.10
|
uv venv --python-preference only-managed "$ENV_PATH" -p 3.10
|
||||||
source "$ENV_PATH/bin/activate"
|
source "$ENV_PATH/bin/activate"
|
||||||
|
|
||||||
# Install vast-sdk from server-side-sdk branch
|
uv pip install -r "${SERVER_DIR}/requirements.txt"
|
||||||
echo "Installing vast-sdk from GitHub (server-side-sdk branch)..."
|
|
||||||
uv pip install "git+https://github.com/vast-ai/vast-sdk.git@server-side-sdk"
|
|
||||||
|
|
||||||
# Install requirements from worker repo if they exist
|
|
||||||
if [ -f "${SERVER_DIR}/requirements.txt" ]; then
|
|
||||||
echo "Installing additional dependencies from requirements.txt..."
|
|
||||||
uv pip install -r "${SERVER_DIR}/requirements.txt"
|
|
||||||
fi
|
|
||||||
|
|
||||||
touch ~/.no_auto_tmux
|
touch ~/.no_auto_tmux
|
||||||
else
|
else
|
||||||
@@ -91,12 +77,7 @@ else
|
|||||||
echo "venv: $VIRTUAL_ENV"
|
echo "venv: $VIRTUAL_ENV"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check that worker.py exists
|
[ ! -d "$SERVER_DIR/workers/$BACKEND" ] && echo "$BACKEND not supported!" && exit 1
|
||||||
if [ ! -f "$SERVER_DIR/worker.py" ]; then
|
|
||||||
echo "ERROR: worker.py not found in $SERVER_DIR"
|
|
||||||
echo "Please ensure your PYWORKER_REPO contains a worker.py file"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$USE_SSL" = true ]; then
|
if [ "$USE_SSL" = true ]; then
|
||||||
|
|
||||||
@@ -134,6 +115,9 @@ EOF
|
|||||||
POST "https://console.vast.ai/api/v0/sign_cert/?instance_id=$CONTAINER_ID" > /etc/instance.crt;
|
POST "https://console.vast.ai/api/v0/sign_cert/?instance_id=$CONTAINER_ID" > /etc/instance.crt;
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
export REPORT_ADDR WORKER_PORT USE_SSL UNSECURED
|
export REPORT_ADDR WORKER_PORT USE_SSL UNSECURED
|
||||||
|
|
||||||
cd "$SERVER_DIR"
|
cd "$SERVER_DIR"
|
||||||
@@ -144,6 +128,5 @@ echo "launching PyWorker server"
|
|||||||
# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
|
# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
|
||||||
[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"
|
[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"
|
||||||
|
|
||||||
# Launch the SDK-based worker instead of the old backend system
|
(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") &
|
||||||
(python3 worker.py |& tee -a "$PYWORKER_LOG") &
|
echo "launching PyWorker server done"
|
||||||
echo "launching PyWorker server done"
|
|
||||||
|
|||||||
@@ -12,9 +12,21 @@ A docker image is provided but you may use any if the above requirements are met
|
|||||||
|
|
||||||
## Benchmarking
|
## Benchmarking
|
||||||
|
|
||||||
A simple image generation benchmark runs when each worker initializes to validate GPU performance and identify underperforming machines.
|
### Custom Benchmark Workflows
|
||||||
|
|
||||||
The benchmark uses Stable Diffusion v1.5 with ComfyUI's default text-to-image workflow. Configure the benchmark complexity and duration using these variables:
|
You can provide a custom ComfyUI workflow for benchmarking by creating `workers/comfyui-json/misc/benchmark.json`. This allows you to test performance using your preferred models and workflow complexity.
|
||||||
|
|
||||||
|
**Ways to provide the benchmark file:**
|
||||||
|
- Fork this repository and add your `benchmark.json` file
|
||||||
|
- Write the file during worker provisioning (onstart script or setup phase)
|
||||||
|
|
||||||
|
An example file is provided in the repository. To ensure varied generations, use the placeholder `__RANDOM_INT__` in place of static seed values - it will be replaced with a random integer for each benchmark run.
|
||||||
|
|
||||||
|
### Default Benchmark (Fallback)
|
||||||
|
|
||||||
|
If `benchmark.json` is not available, or it cannot be read or parsed as valid JSON, a simple image generation benchmark runs when each worker initializes. This validates GPU performance and helps identify underperforming machines.
|
||||||
|
|
||||||
|
The default benchmark uses Stable Diffusion v1.5 with ComfyUI's standard text-to-image workflow. Configure it using these environment variables:
|
||||||
|
|
||||||
| Environment Variable | Default Value | Description |
|
| Environment Variable | Default Value | Description |
|
||||||
| -------------------- | ------------- | ----------- |
|
| -------------------- | ------------- | ----------- |
|
||||||
@@ -24,7 +36,7 @@ The benchmark uses Stable Diffusion v1.5 with ComfyUI's default text-to-image wo
|
|||||||
|
|
||||||
Each benchmark run uses a random prompt from `misc/test_prompts.txt` and a random seed to ensure consistent GPU load patterns.
|
Each benchmark run uses a random prompt from `misc/test_prompts.txt` and a random seed to ensure consistent GPU load patterns.
|
||||||
|
|
||||||
### Calibrating Benchmark Duration
|
#### Calibrating Fallback Benchmark Duration
|
||||||
|
|
||||||
To screen for underperforming hardware, set `BENCHMARK_TEST_STEPS` to match your expected production workflow duration. This allows you to identify machines that won't meet performance requirements.
|
To screen for underperforming hardware, set `BENCHMARK_TEST_STEPS` to match your expected production workflow duration. This allows you to identify machines that won't meet performance requirements.
|
||||||
|
|
||||||
|
|||||||
@@ -98,6 +98,7 @@ def call_text2image_workflow(
|
|||||||
endpoint=route_response["endpoint"],
|
endpoint=route_response["endpoint"],
|
||||||
reqnum=route_response["reqnum"],
|
reqnum=route_response["reqnum"],
|
||||||
url=route_response["url"],
|
url=route_response["url"],
|
||||||
|
request_idx=route_response["request_idx"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build the payload for the worker request
|
# Build the payload for the worker request
|
||||||
|
|||||||
@@ -5,12 +5,13 @@ import dataclasses
|
|||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from math import ceil
|
from math import ceil
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
from lib.data_types import ApiPayload, JsonDataException
|
from lib.data_types import ApiPayload, JsonDataException
|
||||||
|
|
||||||
|
log = logging.getLogger(__file__)
|
||||||
with open("workers/comfyui/misc/test_prompts.txt", "r") as f:
|
|
||||||
test_prompts = f.readlines()
|
|
||||||
|
|
||||||
def count_workload() -> float:
|
def count_workload() -> float:
|
||||||
# Always 100.0 where there is a single instance of ComfyUI handling requests
|
# Always 100.0 where there is a single instance of ComfyUI handling requests
|
||||||
@@ -24,9 +25,32 @@ class ComfyWorkflowData(ApiPayload):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def for_test(cls):
|
def for_test(cls):
|
||||||
"""
|
"""
|
||||||
Use the variables available to simulate workflows of the required running time
|
If the user has provided a benchmark workflow we can use it here to properly gauge performance.
|
||||||
|
Otherwise, use the variables available to simulate workflows of the required running time
|
||||||
Example: SD1.5, simple image gen 10000 steps, 512px x 512px will run for approximately 9 minutes @ ~18 it/s (RTX 4090)
|
Example: SD1.5, simple image gen 10000 steps, 512px x 512px will run for approximately 9 minutes @ ~18 it/s (RTX 4090)
|
||||||
"""
|
"""
|
||||||
|
# Try to load benchmark.json
|
||||||
|
benchmark_file = Path("workers/comfyui-json/misc/benchmark.json")
|
||||||
|
|
||||||
|
if benchmark_file.exists():
|
||||||
|
try:
|
||||||
|
with open(benchmark_file, "r") as f:
|
||||||
|
benchmark_workflow = json.load(f)
|
||||||
|
return cls(
|
||||||
|
input={
|
||||||
|
"request_id": f"test-{random.randint(1000, 99999)}",
|
||||||
|
"workflow_json": benchmark_workflow
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except (json.JSONDecodeError, IOError):
|
||||||
|
# JSON is malformed or file can't be read, fall through to default
|
||||||
|
log.error(f"Failed to benchmark using {benchmark_file}")
|
||||||
|
|
||||||
|
# Fallback: read prompts and construct payload
|
||||||
|
log.info("Using fallback method for benchmarking")
|
||||||
|
with open("workers/comfyui-json/misc/test_prompts.txt", "r") as f:
|
||||||
|
test_prompts = f.readlines()
|
||||||
|
|
||||||
test_prompt = random.choice(test_prompts).rstrip()
|
test_prompt = random.choice(test_prompts).rstrip()
|
||||||
return cls(
|
return cls(
|
||||||
input={
|
input={
|
||||||
|
|||||||
@@ -0,0 +1,107 @@
|
|||||||
|
{
|
||||||
|
"3": {
|
||||||
|
"inputs": {
|
||||||
|
"seed": "__RANDOM_INT__",
|
||||||
|
"steps": 20,
|
||||||
|
"cfg": 8,
|
||||||
|
"sampler_name": "euler",
|
||||||
|
"scheduler": "normal",
|
||||||
|
"denoise": 1,
|
||||||
|
"model": [
|
||||||
|
"4",
|
||||||
|
0
|
||||||
|
],
|
||||||
|
"positive": [
|
||||||
|
"6",
|
||||||
|
0
|
||||||
|
],
|
||||||
|
"negative": [
|
||||||
|
"7",
|
||||||
|
0
|
||||||
|
],
|
||||||
|
"latent_image": [
|
||||||
|
"5",
|
||||||
|
0
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"class_type": "KSampler",
|
||||||
|
"_meta": {
|
||||||
|
"title": "KSampler"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"inputs": {
|
||||||
|
"ckpt_name": "v1-5-pruned-emaonly-fp16.safetensors"
|
||||||
|
},
|
||||||
|
"class_type": "CheckpointLoaderSimple",
|
||||||
|
"_meta": {
|
||||||
|
"title": "Load Checkpoint"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"5": {
|
||||||
|
"inputs": {
|
||||||
|
"width": 512,
|
||||||
|
"height": 512,
|
||||||
|
"batch_size": 1
|
||||||
|
},
|
||||||
|
"class_type": "EmptyLatentImage",
|
||||||
|
"_meta": {
|
||||||
|
"title": "Empty Latent Image"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"6": {
|
||||||
|
"inputs": {
|
||||||
|
"text": "beautiful scenery nature glass bottle landscape, , purple galaxy bottle,",
|
||||||
|
"clip": [
|
||||||
|
"4",
|
||||||
|
1
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"class_type": "CLIPTextEncode",
|
||||||
|
"_meta": {
|
||||||
|
"title": "CLIP Text Encode (Prompt)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"7": {
|
||||||
|
"inputs": {
|
||||||
|
"text": "text, watermark",
|
||||||
|
"clip": [
|
||||||
|
"4",
|
||||||
|
1
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"class_type": "CLIPTextEncode",
|
||||||
|
"_meta": {
|
||||||
|
"title": "CLIP Text Encode (Prompt)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"inputs": {
|
||||||
|
"samples": [
|
||||||
|
"3",
|
||||||
|
0
|
||||||
|
],
|
||||||
|
"vae": [
|
||||||
|
"4",
|
||||||
|
2
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"class_type": "VAEDecode",
|
||||||
|
"_meta": {
|
||||||
|
"title": "VAE Decode"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"9": {
|
||||||
|
"inputs": {
|
||||||
|
"filename_prefix": "ComfyUI",
|
||||||
|
"images": [
|
||||||
|
"8",
|
||||||
|
0
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"class_type": "SaveImage",
|
||||||
|
"_meta": {
|
||||||
|
"title": "Save Image"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -19,6 +19,7 @@ MODEL_SERVER_START_LOG_MSG = "To see the GUI go to: "
|
|||||||
MODEL_SERVER_ERROR_LOG_MSGS = [
|
MODEL_SERVER_ERROR_LOG_MSGS = [
|
||||||
"MetadataIncompleteBuffer", # This error is emitted when the downloaded model is corrupted
|
"MetadataIncompleteBuffer", # This error is emitted when the downloaded model is corrupted
|
||||||
"Value not in list: ", # This error is emitted when the model file is not there at all
|
"Value not in list: ", # This error is emitted when the model file is not there at all
|
||||||
|
"[ERROR] Provisioning Script failed", # Error inserted by provisioning script if models/nodes fail to download
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ def call_custom_workflow_for_sd3(
|
|||||||
endpoint=message["endpoint"],
|
endpoint=message["endpoint"],
|
||||||
reqnum=message["reqnum"],
|
reqnum=message["reqnum"],
|
||||||
url=message["url"],
|
url=message["url"],
|
||||||
|
request_idx=message["request_idx"],
|
||||||
)
|
)
|
||||||
workflow = {
|
workflow = {
|
||||||
"3": {
|
"3": {
|
||||||
|
|||||||
Reference in New Issue
Block a user