Wire null pyworker healthcheck to a stub (and optional user URL)

Adds an in-process aiohttp stub on 127.0.0.1:18999/health so the framework's
periodic healthcheck has something live to talk to. Operators can override
with BACKEND_HEALTH_URL to point at their queue consumer's /health
endpoint, so the autoscaler marks the worker errored if the consumer dies.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Rob Ballantyne
2026-05-11 16:53:26 +01:00
parent 18974873e5
commit 89761b378a
2 changed files with 81 additions and 4 deletions
+57 -4
View File
@@ -2,6 +2,9 @@ import asyncio
import logging
import os
from contextlib import asynccontextmanager
from urllib.parse import urlsplit
from aiohttp import web
from vastai import (
Worker,
@@ -22,14 +25,63 @@ MAX_RESERVATION_SECONDS = float(os.environ.get("MAX_RESERVATION_SECONDS", 3600))
# immediately during capacity estimation instead of sleeping.
BENCHMARK_SENTINEL = "__null_worker_benchmark__"
# Healthcheck wiring. The framework periodically GETs
# `<model_server_url>:<model_server_port><model_healthcheck_url>` and marks the
# worker errored if that ever fails after the first success. For the null
# worker we either:
#   * point at a URL the user supplies via BACKEND_HEALTH_URL — typically
#     their own queue-consumer's health endpoint, so the autoscaler sees the
#     worker as broken if the consumer dies, or
#   * run a tiny built-in stub that always returns 200, so the framework has
#     something live to talk to until the user wires up a real consumer.
BACKEND_HEALTH_URL = os.environ.get("BACKEND_HEALTH_URL", "").strip()
STUB_HEALTH_HOST = "127.0.0.1"
STUB_HEALTH_PORT = int(os.environ.get("NULL_STUB_HEALTH_PORT", 18999))
STUB_HEALTH_PATH = "/health"


def split_health_url(url):
    """Split an absolute health URL into ``(base_url, port, path)``.

    The three parts are shaped so that ``f"{base_url}:{port}{path}"`` is the
    URL to probe, matching the concatenation described in the comment above.

    Args:
        url: Absolute URL such as ``"http://consumer:8080/health?full=1"``.

    Returns:
        A ``(base_url, port, path)`` tuple. ``base_url`` is
        ``"<scheme>://<host>"`` with IPv6 literals kept in brackets; ``port``
        is the explicit port, or 443 for https and 80 otherwise; ``path`` is
        the URL path (``"/"`` when empty) followed by ``?<query>`` when the
        URL carries a query string.

    Raises:
        ValueError: If ``url`` has no scheme or no host, or if its port is
            not a valid port number (raised by ``urlsplit``'s ``port``).
    """
    parsed = urlsplit(url)
    if not parsed.scheme or not parsed.hostname:
        raise ValueError(
            f"BACKEND_HEALTH_URL must be an absolute URL, got: {url!r}"
        )

    # `hostname` strips the brackets from IPv6 literals; put them back so the
    # host is still unambiguous once ":<port>" is appended after it.
    # NOTE: any user:password@ part of the URL is not carried over.
    host = parsed.hostname
    if ":" in host:
        host = f"[{host}]"

    port = parsed.port or (443 if parsed.scheme == "https" else 80)

    # Keep the query string: health endpoints that need e.g. ?token=... would
    # otherwise be probed without it.
    path = parsed.path or "/"
    if parsed.query:
        path = f"{path}?{parsed.query}"

    return f"{parsed.scheme}://{host}", port, path


if BACKEND_HEALTH_URL:
    # User-supplied endpoint: no local stub server is started.
    HEALTH_BASE_URL, HEALTH_PORT, HEALTH_PATH = split_health_url(BACKEND_HEALTH_URL)
    USE_STUB = False
else:
    # No endpoint configured: probe the built-in stub on localhost.
    HEALTH_BASE_URL = f"http://{STUB_HEALTH_HOST}"
    HEALTH_PORT = STUB_HEALTH_PORT
    HEALTH_PATH = STUB_HEALTH_PATH
    USE_STUB = True
@asynccontextmanager
async def null_lifecycle():
    """Async context manager wrapping the null pyworker's lifetime.

    When ``USE_STUB`` is true, an aiohttp server answering
    ``GET STUB_HEALTH_PATH`` with ``200 ok`` is started on
    ``STUB_HEALTH_HOST:STUB_HEALTH_PORT`` and torn down on exit. Otherwise
    only the configured ``BACKEND_HEALTH_URL`` is logged.

    Startup happens inside the ``try`` so that a failure after the runner is
    created (for example the port already being in use) still releases the
    runner instead of leaking it.
    """
    log.info("Null pyworker active (no model server)")
    runner = None
    try:
        if USE_STUB:

            async def stub_health(_request: web.Request) -> web.Response:
                # Always healthy: there is no real backend to inspect.
                return web.Response(status=200, text="ok")

            app = web.Application()
            app.router.add_get(STUB_HEALTH_PATH, stub_health)
            runner = web.AppRunner(app)
            await runner.setup()
            site = web.TCPSite(runner, STUB_HEALTH_HOST, STUB_HEALTH_PORT)
            await site.start()
            log.info(
                f"Null pyworker stub healthcheck listening on "
                f"http://{STUB_HEALTH_HOST}:{STUB_HEALTH_PORT}{STUB_HEALTH_PATH} "
                f"(override by setting BACKEND_HEALTH_URL)"
            )
        else:
            log.info(f"Null pyworker healthcheck pointing at {BACKEND_HEALTH_URL}")
        yield
    finally:
        log.info("Null pyworker shutting down")
        if runner is not None:
            # Stops any registered site and releases the runner's resources.
            await runner.cleanup()
async def reserve_worker(**params: object) -> dict:
@@ -59,8 +111,9 @@ async def reserve_worker(**params: object) -> dict:
worker_config = WorkerConfig(
model_server_url="http://127.0.0.1",
model_server_port=1,
model_server_url=HEALTH_BASE_URL,
model_server_port=HEALTH_PORT,
model_healthcheck_url=HEALTH_PATH,
lifecycle=null_lifecycle(),
handlers=[
HandlerConfig(