Add perf heartbeat to keep null pyworker reporting peak throughput

While a /reserve is held, no requests complete, so workload_served stays at 0 on each metrics tick. The autoscaler sees cur_perf=0 against max_perf=150, concludes the worker can't deliver its claimed throughput, downgrades it, and becomes cautious about scaling up — so additional /reserve requests pile up behind the held one instead of triggering a new worker. Add a 1 Hz heartbeat coroutine that, while anything is in flight, sets workload_served back to TARGET_PERF (150) and flags update_pending. The metrics tick reads 150 and resets it to 0; the heartbeat re-pins it before the next tick. Net effect: the autoscaler sees a saturated worker delivering at peak rate, which is the signal it needs to scale up a new worker rather than queue. The heartbeat needs the backend instance, which is only created inside Worker(...) — stash a reference in a module-level dict between Worker() and .run() so the lifecycle coroutine can reach it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 10:35:18 +01:00
parent 2aada7b210
commit 6c2f194b28
1 changed files with 49 additions and 2 deletions
@@ -56,11 +56,49 @@ else:
    USE_STUB_HEALTH = True


+# Workload reported per /reserve and target perf for the heartbeat below.
+TARGET_PERF = 150.0
+
 # Singleton active reservation. `allow_parallel_requests=False` on the
 # /reserve handler guarantees the framework only runs one at a time per
 # worker, so a single slot is enough.
 _active_reservation: Optional[asyncio.Event] = None

+# Filled in after Worker(...) is constructed so the heartbeat coroutine
+# started by null_lifecycle() can mutate backend.metrics. Stored in a dict so
+# the lifecycle closure picks up the assignment that happens before .run().
+_backend_ref: dict = {"backend": None}
+
+
+async def _perf_heartbeat() -> None:
+    """Keep cur_perf pegged to TARGET_PERF while a reservation is held.
+
+    Without this, workload_served stays at 0 while a /reserve is being held
+    open. The autoscaler observes cur_perf=0 against max_perf=150, decides
+    the worker can't deliver its claimed throughput, and downgrades it —
+    which makes it cautious about scaling up and prone to queueing
+    subsequent requests behind the held one instead of routing elsewhere.
+
+    Every second, if anything is in flight, set workload_served=TARGET_PERF
+    and mark update_pending so the metrics loop sends immediately. The
+    metrics tick resets workload_served back to 0 after sending; we
+    re-pin it next iteration.
+    """
+    while True:
+        try:
+            await asyncio.sleep(1.0)
+            backend = _backend_ref.get("backend")
+            if backend is None:
+                continue
+            mm = backend.metrics.model_metrics
+            if mm.requests_working:
+                mm.workload_served = TARGET_PERF
+                backend.metrics.update_pending = True
+        except asyncio.CancelledError:
+            raise
+        except Exception as e:
+            log.debug(f"perf heartbeat error: {e}")
+

 def _build_internal_app() -> web.Application:
    app = web.Application()
@@ -107,6 +145,8 @@ async def null_lifecycle():
    site = web.TCPSite(runner, INTERNAL_HOST, INTERNAL_PORT)
    await site.start()

+    heartbeat = asyncio.create_task(_perf_heartbeat(), name="null-perf-heartbeat")
+
    lines = [
        f"Null pyworker internal control server: http://{INTERNAL_HOST}:{INTERNAL_PORT}",
        f"  POST /release  - end the active reservation (call from your queue consumer)",
@@ -122,6 +162,11 @@ async def null_lifecycle():
    try:
        yield
    finally:
+        heartbeat.cancel()
+        try:
+            await heartbeat
+        except (asyncio.CancelledError, Exception):
+            pass
        await runner.cleanup()


@@ -184,7 +229,7 @@ worker_config = WorkerConfig(
            # it to a free worker (or spins up a new one).
            max_queue_time=0.0,
            remote_function=reserve_worker,
-            workload_calculator=lambda _payload: 150.0,
+            workload_calculator=lambda _payload: TARGET_PERF,
            benchmark_config=BenchmarkConfig(
                generator=lambda: {BENCHMARK_SENTINEL: True},
                runs=1,
@@ -196,4 +241,6 @@ worker_config = WorkerConfig(
    log_action_config=LogActionConfig(),
 )

-Worker(worker_config).run()
+_worker = Worker(worker_config)
+_backend_ref["backend"] = _worker.backend
+_worker.run()