Add perf heartbeat to keep null pyworker reporting peak throughput
While a /reserve is held, no requests complete so workload_served stays at 0 each metrics tick. The autoscaler sees cur_perf=0 against max_perf=150, concludes the worker can't deliver claimed throughput, downgrades it, and gets cautious about scaling up — so additional /reserve requests pile up behind the held one instead of triggering a new worker.

Add a 1Hz heartbeat coroutine that, while anything is in flight, sets workload_served back to TARGET_PERF (150) and flags update_pending. The metrics tick reads 150 and resets to 0; the heartbeat re-pins it before the next tick. Net effect: the autoscaler sees a saturated worker delivering at peak rate, which is the signal it needs to scale a new worker up rather than queue.

The heartbeat needs the backend instance, which is only created inside Worker(...) — stash a reference in a module-level dict between Worker() and .run() so the lifecycle coroutine can reach it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+49
-2
@@ -56,11 +56,49 @@ else:
|
|||||||
USE_STUB_HEALTH = True
|
USE_STUB_HEALTH = True
|
||||||
|
|
||||||
|
|
||||||
|
# Workload reported per /reserve and target perf for the heartbeat below.
|
||||||
|
TARGET_PERF = 150.0
|
||||||
|
|
||||||
# Singleton active reservation. `allow_parallel_requests=False` on the
|
# Singleton active reservation. `allow_parallel_requests=False` on the
|
||||||
# /reserve handler guarantees the framework only runs one at a time per
|
# /reserve handler guarantees the framework only runs one at a time per
|
||||||
# worker, so a single slot is enough.
|
# worker, so a single slot is enough.
|
||||||
_active_reservation: Optional[asyncio.Event] = None
|
_active_reservation: Optional[asyncio.Event] = None
|
||||||
|
|
||||||
|
# Filled in after Worker(...) is constructed so the heartbeat coroutine in
|
||||||
|
# null_lifecycle() can mutate backend.metrics. Stored in a dict so the
|
||||||
|
# lifecycle closure picks up the assignment that happens before .run().
|
||||||
|
_backend_ref: dict = {"backend": None}
|
||||||
|
|
||||||
|
|
||||||
|
async def _perf_heartbeat() -> None:
    """Re-pin the reported workload to TARGET_PERF once per second.

    Rationale (per the change description): while a /reserve is held open
    no request completes, so workload_served would otherwise read 0 on
    every metrics tick and the autoscaler would treat the worker as unable
    to deliver its claimed throughput.

    Each iteration sleeps for one second, then looks up the backend stored
    in ``_backend_ref``. If a backend is present and its model metrics
    report requests in progress, ``workload_served`` is set to TARGET_PERF
    and ``update_pending`` is raised on the backend metrics. The metrics
    loop is expected to zero ``workload_served`` after it reports; the next
    iteration here pins it again.

    The loop never returns on its own. Cancellation is propagated to the
    caller; any other exception is logged at debug level and the loop
    carries on with the next iteration.
    """
    while True:
        try:
            # Sleep first so that a failing iteration is still rate-limited
            # to one attempt per second.
            await asyncio.sleep(1.0)

            worker_backend = _backend_ref.get("backend")
            # The reference is only populated once the Worker exists;
            # until then there is nothing to update.
            if worker_backend is not None:
                model_metrics = worker_backend.metrics.model_metrics
                if model_metrics.requests_working:
                    model_metrics.workload_served = TARGET_PERF
                    worker_backend.metrics.update_pending = True
        except asyncio.CancelledError:
            # Let task cancellation through so shutdown can complete.
            raise
        except Exception as err:
            # Best-effort: a metrics hiccup must not kill the heartbeat.
            log.debug(f"perf heartbeat error: {err}")
|
||||||
|
|
||||||
|
|
||||||
def _build_internal_app() -> web.Application:
|
def _build_internal_app() -> web.Application:
|
||||||
app = web.Application()
|
app = web.Application()
|
||||||
@@ -107,6 +145,8 @@ async def null_lifecycle():
|
|||||||
site = web.TCPSite(runner, INTERNAL_HOST, INTERNAL_PORT)
|
site = web.TCPSite(runner, INTERNAL_HOST, INTERNAL_PORT)
|
||||||
await site.start()
|
await site.start()
|
||||||
|
|
||||||
|
heartbeat = asyncio.create_task(_perf_heartbeat(), name="null-perf-heartbeat")
|
||||||
|
|
||||||
lines = [
|
lines = [
|
||||||
f"Null pyworker internal control server: http://{INTERNAL_HOST}:{INTERNAL_PORT}",
|
f"Null pyworker internal control server: http://{INTERNAL_HOST}:{INTERNAL_PORT}",
|
||||||
f" POST /release - end the active reservation (call from your queue consumer)",
|
f" POST /release - end the active reservation (call from your queue consumer)",
|
||||||
@@ -122,6 +162,11 @@ async def null_lifecycle():
|
|||||||
try:
|
try:
|
||||||
yield
|
yield
|
||||||
finally:
|
finally:
|
||||||
|
heartbeat.cancel()
|
||||||
|
try:
|
||||||
|
await heartbeat
|
||||||
|
except (asyncio.CancelledError, Exception):
|
||||||
|
pass
|
||||||
await runner.cleanup()
|
await runner.cleanup()
|
||||||
|
|
||||||
|
|
||||||
@@ -184,7 +229,7 @@ worker_config = WorkerConfig(
|
|||||||
# it to a free worker (or spins up a new one).
|
# it to a free worker (or spins up a new one).
|
||||||
max_queue_time=0.0,
|
max_queue_time=0.0,
|
||||||
remote_function=reserve_worker,
|
remote_function=reserve_worker,
|
||||||
workload_calculator=lambda _payload: 150.0,
|
workload_calculator=lambda _payload: TARGET_PERF,
|
||||||
benchmark_config=BenchmarkConfig(
|
benchmark_config=BenchmarkConfig(
|
||||||
generator=lambda: {BENCHMARK_SENTINEL: True},
|
generator=lambda: {BENCHMARK_SENTINEL: True},
|
||||||
runs=1,
|
runs=1,
|
||||||
@@ -196,4 +241,6 @@ worker_config = WorkerConfig(
|
|||||||
log_action_config=LogActionConfig(),
|
log_action_config=LogActionConfig(),
|
||||||
)
|
)
|
||||||
|
|
||||||
Worker(worker_config).run()
|
_worker = Worker(worker_config)
|
||||||
|
_backend_ref["backend"] = _worker.backend
|
||||||
|
_worker.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user