Default null pyworker session cost to 2x max_perf

Reporting cost == max_perf puts an occupied worker at exactly 100%
utilization, which the autoscaler reads as "at target, no action."
The 3rd session_create is then rejected with HTTP 429 by both active
workers and stalls in the global queue instead of triggering a
cold-worker activation (observed: scaling from 1 to 2 active workers
works, scaling from 2 to 3 does not).

Bumping cost to 2 * max_perf makes each session look like more than
one worker's work, so the autoscaler always keeps an extra active
worker hot. Slight over-provisioning, but the 3rd reservation lands
directly on a free worker rather than queueing.

Expose --session-cost on the client so the value can be swept without
code edits. The README documents the trade-off.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Rob Ballantyne
2026-05-12 11:31:26 +01:00
parent 01eff874d8
commit 1d2caaf554
2 changed files with 39 additions and 3 deletions
+12
View File
@@ -144,6 +144,18 @@ session of `cost = 100`. Set the endpoint accordingly:
`target_util = 1.0`).
- **`max_workers`** — cap on total reservations the endpoint can ever
serve concurrently.
- **Session `cost = 2 × max_perf`** (e.g. `200` when `max_perf = 100`) —
recommended. Reporting `cost = max_perf` puts each occupied worker at
exactly 100% utilization, which the autoscaler reads as "at target,
no action needed." With two workers occupied, a third reservation is
then rejected with HTTP 429 by both and stalls in the autoscaler's
global queue indefinitely instead of activating a cold worker.
Raising `cost` above `max_perf` makes each session look like more than
one worker's worth of work (`cur_load / max_perf > 1.0`), so the
autoscaler keeps an extra active worker hot per session. This trades
slight over-provisioning for predictable scale-up. The demo client
defaults to `--session-cost 200`.
- **`max_queue_time = 0`** (or very small, e.g. `0.1`) — required.
The per-worker `wait_time` property used internally to reject
requests filters sessions out, but the **autoscaler** computes its
+27 -3
View File
@@ -15,7 +15,12 @@ logging.basicConfig(
log = logging.getLogger(__file__) log = logging.getLogger(__file__)
ENDPOINT_NAME = "null-prod" ENDPOINT_NAME = "null-prod"
SESSION_COST = 100 # Default cost passed to /session/create. Bumping this above the worker's
# max_perf (100) is how you tell the autoscaler "each session is more than
# one worker of work" — keeps an extra active worker warm and ready, so
# the next /session/create lands on a free worker instead of queueing.
# See README "Endpoint scaling parameters" for the math.
DEFAULT_SESSION_COST = 200
async def reserve( async def reserve(
@@ -23,6 +28,7 @@ async def reserve(
*, *,
endpoint_name: str, endpoint_name: str,
hold_for: float, hold_for: float,
session_cost: int,
label: str = "session", label: str = "session",
) -> None: ) -> None:
"""Open a session, hold the worker for `hold_for` seconds, close cleanly. """Open a session, hold the worker for `hold_for` seconds, close cleanly.
@@ -39,8 +45,11 @@ async def reserve(
# don't make any keepalive requests so no extension happens. # don't make any keepalive requests so no extension happens.
lifetime = hold_for + 60 lifetime = hold_for + 60
start = time.monotonic() start = time.monotonic()
log.info("[%s] creating session (lifetime=%.0fs, hold=%.0fs)", label, lifetime, hold_for) log.info(
async with await endpoint.session(cost=SESSION_COST, lifetime=lifetime) as s: "[%s] creating session (cost=%d, lifetime=%.0fs, hold=%.0fs)",
label, session_cost, lifetime, hold_for,
)
async with await endpoint.session(cost=session_cost, lifetime=lifetime) as s:
log.info("[%s] session %s open", label, s.session_id) log.info("[%s] session %s open", label, s.session_id)
try: try:
await asyncio.sleep(hold_for) await asyncio.sleep(hold_for)
@@ -59,6 +68,7 @@ async def run_demo(
endpoint_name: str, endpoint_name: str,
interval: float, interval: float,
plateau: float, plateau: float,
session_cost: int,
) -> None: ) -> None:
"""Trapezoidal load: ramp up three sessions, plateau, then scale down. """Trapezoidal load: ramp up three sessions, plateau, then scale down.
@@ -84,6 +94,7 @@ async def run_demo(
client, client,
endpoint_name=endpoint_name, endpoint_name=endpoint_name,
hold_for=hold, hold_for=hold,
session_cost=session_cost,
label=label, label=label,
), ),
name=label, name=label,
@@ -147,6 +158,17 @@ def build_arg_parser() -> argparse.ArgumentParser:
"up the third worker (default: 300)" "up the third worker (default: 300)"
), ),
) )
p.add_argument(
"--session-cost",
type=int,
default=DEFAULT_SESSION_COST,
help=(
f"Cost reported to the autoscaler for each /session/create. "
f"Setting this above the worker's max_perf (100) over-provisions "
f"slightly, keeping an extra active worker warm so the next "
f"session lands without queueing (default: {DEFAULT_SESSION_COST})"
),
)
return p return p
@@ -165,12 +187,14 @@ async def main_async():
endpoint_name=args.endpoint, endpoint_name=args.endpoint,
interval=args.interval, interval=args.interval,
plateau=args.plateau, plateau=args.plateau,
session_cost=args.session_cost,
) )
else: else:
await reserve( await reserve(
client, client,
endpoint_name=args.endpoint, endpoint_name=args.endpoint,
hold_for=args.duration, hold_for=args.duration,
session_cost=args.session_cost,
label="reservation", label="reservation",
) )
except KeyboardInterrupt: except KeyboardInterrupt: