From cecf0236faa85b042db2711d07cf25d151462174 Mon Sep 17 00:00:00 2001 From: Rob Ballantyne Date: Thu, 7 May 2026 12:46:17 +0100 Subject: [PATCH] comfyui-json: watch api-wrapper.log for readiness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch MODEL_LOG_FILE from /var/log/portal/comfyui.log to /var/log/portal/api-wrapper.log and MODEL_LOAD_LOG_MSG to "Uvicorn running on". A live test instance showed the previous setup firing the benchmark on ComfyUI's "To see the GUI go to:" line, which races api-wrapper.sh: that script runs convert-workflows.sh (which itself waits for ComfyUI to be ready and then converts workflows for several seconds) before launching uvicorn. The benchmark hit a closed port on :18288 and the SDK's __call_backend has no retry on connection refused, locking the worker into a permanent error state. Watching the api-wrapper log instead means the benchmark only fires after uvicorn is bound and the pyworker_benchmark.json symlink is already in place — no SDK changes required. Trim MODEL_ERROR_LOG_MSGS down to "Application startup failed". The old patterns were ComfyUI-specific (won't appear in api-wrapper.log) and dangerous: ModelError is fatal, so "Value not in list:" matching on an api-wrapper-style log would let one malformed client request kill the worker. CUDA OOM is similarly off-limits (indistinguishable from a too-greedy client request via substring match; the benchmark-failure path already catches model-load OOM at boot). Empty MODEL_INFO_LOG_MSGS — the prior ComfyUI download pattern can never match this log file. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- workers/comfyui-json/worker.py | 45 ++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/workers/comfyui-json/worker.py b/workers/comfyui-json/worker.py index e6dcee3..e35b43d 100644 --- a/workers/comfyui-json/worker.py +++ b/workers/comfyui-json/worker.py @@ -33,26 +33,51 @@ from pathlib import Path from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig -# ComfyUI model configuration +# ComfyUI model configuration. The model server here is the ai-dock +# comfyui-api-wrapper sitting in front of ComfyUI itself, not ComfyUI's +# own port (18188). We watch the api-wrapper's log rather than ComfyUI's +# because the api-wrapper runs convert-workflows.sh before launching +# uvicorn — by the time uvicorn logs "Uvicorn running on ...", the +# benchmark workflows are converted, the pyworker_benchmark.json symlink +# exists, and :18288 is accepting connections. Watching ComfyUI's log +# fires the benchmark too early (before the api-wrapper is reachable), +# which the SDK can't recover from since __call_backend doesn't retry +# connection-refused. MODEL_SERVER_URL = 'http://127.0.0.1' MODEL_SERVER_PORT = 18288 -MODEL_LOG_FILE = '/var/log/portal/comfyui.log' +MODEL_LOG_FILE = '/var/log/portal/api-wrapper.log' MODEL_HEALTHCHECK_ENDPOINT = "/health" -# ComfyUI-specific log messages +# api-wrapper log messages MODEL_LOAD_LOG_MSG = [ - "To see the GUI go to: " + "Uvicorn running on" ] +# LogAction.ModelError is fatal: the SDK calls backend_errored() and the +# worker is locked into a permanent error state. Patterns must therefore +# only match conditions where the api-wrapper genuinely cannot serve any +# request — supervisord restarts on uvicorn exit, so a real failure +# self-heals rather than dragging the worker down. 
+# +# Notably *not* matched here: +# - per-request errors (PreprocessWorker failures, ComfyUI workflow +# validation, "Value not in list:") — one malformed client payload +# would otherwise kill the worker +# - "CUDA out of memory" — surfaces both as a misconfigured GPU (which +# the benchmark-failure path already catches via backend_errored) +# and as a too-greedy client request; a substring match cannot +# tell the two apart +# - convert-workflows.sh warnings — that script is not load-bearing +# for serving (uvicorn starts even if conversion partially failed) MODEL_ERROR_LOG_MSGS = [ - "MetadataIncompleteBuffer", - "Value not in list: ", - "[ERROR] Provisioning Script failed" + "Application startup failed", # uvicorn ASGI lifespan startup failed -> uvicorn exits ] -MODEL_INFO_LOG_MSGS = [ - '"message":"Downloading' -] +# LogAction.Info is purely informational (echoes log lines into the vast +# console). Nothing in api-wrapper.log is currently worth surfacing — +# model downloads are upstream in provisioning, per-request logs are +# too noisy. +MODEL_INFO_LOG_MSGS = [] # Benchmark assets shipped alongside this worker. Resolved relative to this # file so the worker keeps working regardless of the launch cwd.