Add perf heartbeat to keep null pyworker reporting peak throughput

While a /reserve is held, no requests complete so workload_served stays at 0 each metrics tick. The autoscaler sees cur_perf=0 against max_perf=150, concludes the worker can't deliver claimed throughput, downgrades it, and gets cautious about scaling up — so additional /reserve requests pile up behind the held one instead of triggering a new worker. Add a 1Hz heartbeat coroutine that, while anything is in flight, sets workload_served back to TARGET_PERF (150) and flags update_pending. The metrics tick reads 150 and resets to 0; the heartbeat re-pins it before the next tick. Net effect: the autoscaler sees a saturated worker delivering at peak rate, which is the signal it needs to scale a new worker up rather than queue. The heartbeat needs the backend instance, which is only created inside Worker(...) — stash a reference in a module-level dict between Worker() and .run() so the lifecycle coroutine can reach it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 10:35:18 +01:00
parent 2aada7b210
commit 6c2f194b28
1 changed files with 49 additions and 2 deletions
@@ -56,11 +56,49 @@ else:
    USE_STUB_HEALTH = True
 # Workload reported per /reserve and target perf for the heartbeat below.
 TARGET_PERF = 150.0
 # Singleton active reservation. `allow_parallel_requests=False` on the
 # /reserve handler guarantees the framework only runs one at a time per
 # worker, so a single slot is enough.
 _active_reservation: Optional[asyncio.Event] = None
 # Backed in after Worker(...) is constructed so the heartbeat coroutine in
 # null_lifecycle() can mutate backend.metrics. Stored in a dict so the
 # lifecycle closure picks up the assignment that happens before .run().
 _backend_ref: dict = {"backend": None}
 async def _perf_heartbeat() -> None:
    """Keep cur_perf pegged to TARGET_PERF while a reservation is held.
    Without this, workload_served stays at 0 while a /reserve is being held
    open. The autoscaler observes cur_perf=0 against max_perf=150, decides
    the worker can't deliver its claimed throughput, and downgrades it —
    which makes it cautious about scaling up and prone to queueing
    subsequent requests behind the held one instead of routing elsewhere.
    Every second, if anything is in flight, set workload_served=TARGET_PERF
    and mark update_pending so the metrics loop sends immediately. The
    metrics tick resets workload_served back to 0 after sending; we
    re-pin it next iteration.
    """
    while True:
        try:
            await asyncio.sleep(1.0)
            backend = _backend_ref.get("backend")
            if backend is None:
                continue
            mm = backend.metrics.model_metrics
            if mm.requests_working:
                mm.workload_served = TARGET_PERF
                backend.metrics.update_pending = True
        except asyncio.CancelledError:
            raise
        except Exception as e:
            log.debug(f"perf heartbeat error: {e}")
 def _build_internal_app() -> web.Application:
    app = web.Application()
@@ -107,6 +145,8 @@ async def null_lifecycle():
    site = web.TCPSite(runner, INTERNAL_HOST, INTERNAL_PORT)
    await site.start()
    heartbeat = asyncio.create_task(_perf_heartbeat(), name="null-perf-heartbeat")
    lines = [
        f"Null pyworker internal control server: http://{INTERNAL_HOST}:{INTERNAL_PORT}",
        f"  POST /release  - end the active reservation (call from your queue consumer)",
@@ -122,6 +162,11 @@ async def null_lifecycle():
    try:
        yield
    finally:
        heartbeat.cancel()
        try:
            await heartbeat
        except (asyncio.CancelledError, Exception):
            pass
        await runner.cleanup()
@@ -184,7 +229,7 @@ worker_config = WorkerConfig(
            # it to a free worker (or spins up a new one).
            max_queue_time=0.0,
            remote_function=reserve_worker,
-            workload_calculator=lambda _payload: 150.0,
+            workload_calculator=lambda _payload: TARGET_PERF,
            benchmark_config=BenchmarkConfig(
                generator=lambda: {BENCHMARK_SENTINEL: True},
                runs=1,
@@ -196,4 +241,6 @@ worker_config = WorkerConfig(
    log_action_config=LogActionConfig(),
 )
-Worker(worker_config).run()
+_worker = Worker(worker_config)
 _backend_ref["backend"] = _worker.backend
 _worker.run()