Files
pyworker/workers/comfyui-json/worker.py
T
Rob Ballantyne 9d7371ddba Wait briefly for the well-known benchmark symlink
The pyworker and convert-workflows.sh both unblock when ComfyUI is
ready, but conversion takes a few seconds longer — without a wait, the
first benchmark loses the race and silently drops to the SD1.5 fallback.

Wait up to BENCHMARK_WAIT_TIMEOUT (default 30s) for the symlink before
giving up. The wait fires only when we're actually about to use the
well-known tier (env var / misc/ paths short-circuit), only once per
process, and is skipped entirely off the base image (parent directory
absent), so non-base-image deployments don't pay the timeout.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 11:59:30 +01:00

202 lines
7.2 KiB
Python

"""ComfyUI worker for the vast.ai PyWorker SDK.
Each worker runs a benchmark on warm-up. The payload is selected as follows:

1. If ``misc/benchmark.json`` exists in the cloned worker tree, it is
   used as a custom ComfyUI workflow. Use this if you fork the repo and
   bake in your workflow.
2. Else, if ``$BENCHMARK_JSON_PATH`` is set and points at a readable
   file, it is used. Use this from a provisioning script — provisioning
   runs before pyworker is cloned, so it cannot write into ``misc/``,
   but it can drop the workflow elsewhere (e.g. ``/workspace/``) and
   export this env var.
3. Else, if the well-known path
   ``/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json`` exists,
   it is used. The vast.ai ComfyUI base image's ``convert-workflows.sh``
   maintains this as a symlink to the first provisioned workflow, so on
   that image no env var is needed.
4. Otherwise an SD1.5 Text2Image fallback runs, parameterised by the
   ``BENCHMARK_TEST_{WIDTH,HEIGHT,STEPS}`` env vars and a random prompt
   from ``misc/test_prompts.txt``.

``__RANDOM_INT__`` placeholders in custom workflows are substituted
server-side by ai-dock/comfyui-api-wrapper, so this worker does not handle
them itself.
"""
import json
import logging
import os
import random
import sys
import time
from pathlib import Path
from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig
# ComfyUI model server settings, passed to ``WorkerConfig`` further down:
# base URL, port, log file path and health-check route.
MODEL_SERVER_URL = "http://127.0.0.1"
MODEL_SERVER_PORT = 18288
MODEL_LOG_FILE = "/var/log/portal/comfyui.log"
MODEL_HEALTHCHECK_ENDPOINT = "/health"

# ComfyUI-specific log message fragments, passed to ``LogActionConfig`` as
# its ``on_load`` / ``on_error`` / ``on_info`` lists respectively.
MODEL_LOAD_LOG_MSG = ["To see the GUI go to: "]
MODEL_ERROR_LOG_MSGS = [
    "MetadataIncompleteBuffer",
    "Value not in list: ",
    "[ERROR] Provisioning Script failed",
]
MODEL_INFO_LOG_MSGS = ['"message":"Downloading']
# Benchmark assets ship in the ``misc`` directory next to this file. Anchoring
# on ``__file__`` (rather than the cwd) keeps the worker functional no matter
# where it is launched from.
MISC_DIR = Path(__file__).parent.joinpath("misc")
BENCHMARK_FILE = MISC_DIR.joinpath("benchmark.json")
TEST_PROMPTS = MISC_DIR.joinpath("test_prompts.txt")

# Well-known benchmark location on the vast.ai ComfyUI base image:
# convert-workflows.sh keeps this as a symlink to the first provisioned
# workflow, so that image works out-of-the-box with no env var set.
WELLKNOWN_BENCHMARK = Path("/opt/comfyui-api-wrapper/workflows/pyworker_benchmark.json")
# How long to wait for the well-known symlink to appear before giving up.
# convert-workflows.sh and the pyworker both unblock at "ComfyUI ready",
# but conversion takes a few seconds — without this wait the first
# benchmark loses the race and silently drops to the SD1.5 fallback.
# Tunable for slow setups (many workflows / slow disk).
_WELLKNOWN_WAIT_SECS = float(os.getenv("BENCHMARK_WAIT_TIMEOUT", "30"))
log = logging.getLogger(__name__)
_wait_done = False
def _wait_for_wellknown() -> None:
    """Poll for ``WELLKNOWN_BENCHMARK``, at most once per process.

    The first call marks the wait as done and then:

    * returns immediately if the file already exists;
    * returns immediately if the file's parent directory is missing (not the
      base image, so the symlink will never appear and waiting would only
      burn the timeout);
    * otherwise polls every 0.5s until the file exists or
      ``_WELLKNOWN_WAIT_SECS`` has elapsed.

    Every later call is a no-op regardless of how the first one ended. If the
    file shows up afterwards, ``_resolve_benchmark_path`` still finds it via
    its own existence check on the next benchmark run.
    """
    global _wait_done
    if _wait_done:
        return
    _wait_done = True

    target = WELLKNOWN_BENCHMARK
    if target.exists():
        return
    if not target.parent.is_dir():
        return

    timeout = _WELLKNOWN_WAIT_SECS
    give_up_at = time.monotonic() + timeout
    log.info("Waiting up to %.0fs for %s", timeout, target)
    while time.monotonic() < give_up_at:
        if target.exists():
            log.info("Found %s after wait", target)
            break
        time.sleep(0.5)
    else:
        # Loop condition expired without a ``break``: the file never appeared.
        log.info("%s did not appear within %.0fs; falling through", target, timeout)
def _resolve_benchmark_path() -> Path | None:
    """Return the path to the custom benchmark workflow, or None if absent.

    See module docstring for the precedence rule: ``BENCHMARK_FILE`` first,
    then ``$BENCHMARK_JSON_PATH``, then ``WELLKNOWN_BENCHMARK`` (after giving
    it a chance to appear via ``_wait_for_wellknown``).

    A candidate only counts when it is a file (symlinks are followed). A
    set-but-broken ``$BENCHMARK_JSON_PATH`` — missing, or pointing at a
    directory — logs a warning then falls through to the well-known path, so
    a typo in the env var doesn't silently mask a provisioned benchmark
    sitting at the standard location.

    Returns:
        The first usable candidate path, or None when there is none.
    """
    if BENCHMARK_FILE.is_file():
        return BENCHMARK_FILE

    env_path = os.getenv("BENCHMARK_JSON_PATH")
    if env_path:
        path = Path(env_path)
        # is_file(), not exists(): a directory would pass exists(), fail
        # later in open(), and skip the well-known fallback entirely.
        if path.is_file():
            return path
        log.warning(
            "BENCHMARK_JSON_PATH=%s does not exist or is not a file; trying fallbacks",
            path,
        )

    # Only reached when the well-known tier is actually about to be used.
    _wait_for_wellknown()
    if WELLKNOWN_BENCHMARK.is_file():
        return WELLKNOWN_BENCHMARK
    return None
def _custom_workflow_payload() -> dict | None:
    """Build a payload from a custom benchmark workflow JSON, or None if unavailable.

    Returns:
        ``{"input": {"request_id": ..., "workflow_json": ...}}`` when
        ``_resolve_benchmark_path`` yields a file that loads as JSON. None
        when no path is resolved, or when the file cannot be read, decoded
        or parsed (the failure is logged at error level).
    """
    path = _resolve_benchmark_path()
    if path is None:
        return None
    try:
        # Decode explicitly as UTF-8 rather than relying on the locale's
        # preferred encoding.
        with open(path, encoding="utf-8") as f:
            workflow = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a badly encoded file falls back instead of raising.
        log.error("Failed to load %s: %s; falling back to default benchmark", path, e)
        return None
    log.info("Using custom benchmark workflow from %s", path)
    return {
        "input": {
            "request_id": f"test-{random.randint(1000, 99999)}",
            "workflow_json": workflow,
        }
    }
def _default_payload() -> dict:
    """Build the SD1.5 Text2Image fallback payload.

    Reads the non-blank lines of ``TEST_PROMPTS`` and picks one at random.
    Width, height and steps come from the ``BENCHMARK_TEST_WIDTH``,
    ``BENCHMARK_TEST_HEIGHT`` and ``BENCHMARK_TEST_STEPS`` env vars
    (defaults 512, 512, 20); the seed is re-rolled on every call.

    Returns:
        A ``{"input": {...}}`` payload using the ``Text2Image`` modifier.

    Raises:
        OSError: If ``TEST_PROMPTS`` cannot be opened.
        IndexError: If ``TEST_PROMPTS`` contains no non-blank lines.
        ValueError: If one of the env vars is not an integer.
    """
    # Explicit encoding: prompts may contain non-ASCII text, and the default
    # for open() depends on the process locale.
    with open(TEST_PROMPTS, encoding="utf-8") as f:
        prompts = [line.strip() for line in f if line.strip()]
    return {
        "input": {
            "request_id": f"test-{random.randint(1000, 99999)}",
            "modifier": "Text2Image",
            "modifications": {
                "prompt": random.choice(prompts),
                "width": int(os.getenv("BENCHMARK_TEST_WIDTH", 512)),
                "height": int(os.getenv("BENCHMARK_TEST_HEIGHT", 512)),
                "steps": int(os.getenv("BENCHMARK_TEST_STEPS", 20)),
                "seed": random.randint(0, sys.maxsize),
            }
        }
    }
def make_benchmark_payload() -> dict:
    """Produce the request payload for a single benchmark run.

    The SDK calls this once per benchmark run. Generating the payload on
    demand (instead of supplying a static ``dataset=``) means every run picks
    a fresh prompt and seed, and avoids keeping several copies of a large
    workflow JSON in memory.

    Returns:
        The custom-workflow payload when one is available, otherwise the
        SD1.5 fallback payload.
    """
    payload = _custom_workflow_payload()
    if not payload:
        payload = _default_payload()
    return payload
# The only route this worker exposes. Its benchmark payload is generated per
# run by ``make_benchmark_payload``.
_generate_handler = HandlerConfig(
    route="/generate/sync",
    allow_parallel_requests=False,
    max_queue_time=10.0,
    benchmark_config=BenchmarkConfig(generator=make_benchmark_payload),
)

# Log-message fragments grouped by the action they are registered under.
_log_actions = LogActionConfig(
    on_load=MODEL_LOAD_LOG_MSG,
    on_error=MODEL_ERROR_LOG_MSGS,
    on_info=MODEL_INFO_LOG_MSGS,
)

worker_config = WorkerConfig(
    model_server_url=MODEL_SERVER_URL,
    model_server_port=MODEL_SERVER_PORT,
    model_log_file=MODEL_LOG_FILE,
    model_healthcheck_url=MODEL_HEALTHCHECK_ENDPOINT,
    handlers=[_generate_handler],
    log_action_config=_log_actions,
)

# NOTE: there is no ``__main__`` guard — the worker starts as soon as this
# module is executed or imported.
Worker(worker_config).run()