From cecf0236faa85b042db2711d07cf25d151462174 Mon Sep 17 00:00:00 2001
From: Rob Ballantyne <rob@vast.ai>
Date: Thu, 7 May 2026 12:46:17 +0100
Subject: [PATCH] comfyui-json: watch api-wrapper.log for readiness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch MODEL_LOG_FILE from /var/log/portal/comfyui.log to
/var/log/portal/api-wrapper.log and MODEL_LOAD_LOG_MSG to "Uvicorn
running on". A live test instance showed the previous setup firing the
benchmark on ComfyUI's "To see the GUI go to:" line, which races
api-wrapper.sh: that script runs convert-workflows.sh (which itself
waits for ComfyUI ready and then converts workflows for several
seconds) before launching uvicorn. The benchmark hit a closed port
on :18288 and the SDK's __call_backend has no retry on connection
refused, locking the worker into a permanent error state.

Watching the api-wrapper log instead means the benchmark only fires
after uvicorn is bound and the pyworker_benchmark.json symlink is
already in place — no SDK changes required.

Trim MODEL_ERROR_LOG_MSGS down to "Application startup failed". The
old patterns were ComfyUI-specific (won't appear in api-wrapper.log)
and dangerous: ModelError is fatal, so "Value not in list:" matching
on an api-wrapper-style log would let one malformed client request
kill the worker. CUDA OOM is similarly off-limits (indistinguishable
from a too-greedy client request via substring match; the benchmark-
failure path already catches model-load OOM at boot). Empty
MODEL_INFO_LOG_MSGS — the prior ComfyUI download pattern can never
match this log file.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workers/comfyui-json/worker.py | 45 ++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/workers/comfyui-json/worker.py b/workers/comfyui-json/worker.py
index e6dcee3..e35b43d 100644
--- a/workers/comfyui-json/worker.py
+++ b/workers/comfyui-json/worker.py
@@ -33,26 +33,51 @@ from pathlib import Path
 
 from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig
 
-# ComfyUI model configuration
+# ComfyUI model configuration. The model server here is the ai-dock
+# comfyui-api-wrapper sitting in front of ComfyUI itself, not ComfyUI's
+# own port (18188). We watch the api-wrapper's log rather than ComfyUI's
+# because the api-wrapper runs convert-workflows.sh before launching
+# uvicorn — by the time uvicorn logs "Uvicorn running on ...", the
+# benchmark workflows are converted, the pyworker_benchmark.json symlink
+# exists, and :18288 is accepting connections. Watching ComfyUI's log
+# fires the benchmark too early (before the api-wrapper is reachable),
+# which the SDK can't recover from since __call_backend doesn't retry
+# connection-refused.
 MODEL_SERVER_URL           = 'http://127.0.0.1'
 MODEL_SERVER_PORT          = 18288
-MODEL_LOG_FILE             = '/var/log/portal/comfyui.log'
+MODEL_LOG_FILE             = '/var/log/portal/api-wrapper.log'
 MODEL_HEALTHCHECK_ENDPOINT = "/health"
 
-# ComfyUI-specific log messages
+# api-wrapper log messages
 MODEL_LOAD_LOG_MSG = [
-    "To see the GUI go to: "
+    "Uvicorn running on"
 ]
 
+# LogAction.ModelError is fatal: the SDK calls backend_errored() and the
+# worker is locked into a permanent error state. Patterns must therefore
+# only match conditions where the api-wrapper genuinely cannot serve any
+# request — supervisord restarts on uvicorn exit, so a real failure
+# self-heals rather than dragging the worker down.
+#
+# Notably *not* matched here:
+#   - per-request errors (PreprocessWorker failures, ComfyUI workflow
+#     validation, "Value not in list:") — one malformed client payload
+#     would otherwise kill the worker
+#   - "CUDA out of memory" — surfaces both for a misconfigured GPU (which
+#     the benchmark-failure path already catches via backend_errored)
+#     and for a too-greedy client request; a substring match cannot
+#     tell the two apart
+#   - convert-workflows.sh warnings — that script is not load-bearing
+#     for serving (uvicorn starts even if conversion partially failed)
 MODEL_ERROR_LOG_MSGS = [
-    "MetadataIncompleteBuffer",
-    "Value not in list: ",
-    "[ERROR] Provisioning Script failed"
+    "Application startup failed",  # uvicorn ASGI lifespan startup failed -> uvicorn exits
 ]
 
-MODEL_INFO_LOG_MSGS = [
-    '"message":"Downloading'
-]
+# LogAction.Info is purely informational (echoes log lines into the vast
+# console). Nothing in api-wrapper.log is currently worth surfacing —
+# model downloads are upstream in provisioning, per-request logs are
+# too noisy.
+MODEL_INFO_LOG_MSGS = []
 
 # Benchmark assets shipped alongside this worker. Resolved relative to this
 # file so the worker keeps working regardless of the launch cwd.