Default null pyworker session cost to 2x max_perf

Reporting cost == max_perf puts an occupied worker at exactly 100% utilization, which the autoscaler reads as "at target, no action." The 3rd session_create then 429s on both active workers and stalls in the global queue instead of triggering a cold-worker activation (observed: 1→2 active scales fine, 2→3 does not). Bumping cost to 2 * max_perf makes each session look like more than one worker's work, so the autoscaler always keeps an extra active worker hot. Slight over-provisioning, but the 3rd reservation lands directly on a free worker rather than queueing. Expose --session-cost on the client so the value can be swept without edits. README documents the trade-off. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 11:31:26 +01:00
parent 01eff874d8
commit 1d2caaf554
2 changed files with 39 additions and 3 deletions
@@ -144,6 +144,18 @@ session of `cost = 100`. Set the endpoint accordingly:
  `target_util = 1.0`).
 - **`max_workers`** — cap on total reservations the endpoint can ever
  serve concurrently.
 - **Session `cost = 2 × max_perf`** (e.g. `200` when `max_perf = 100`) —
  recommended. Reporting `cost = max_perf` puts each occupied worker at
  exactly 100% utilization, which the autoscaler reads as "at target,
  no action needed." The third reservation then gets 429'd by both
  occupied workers and stalls in the autoscaler's global queue
  indefinitely instead of activating a cold worker.
  Bumping `cost` above `max_perf` makes each session look like more than
  one worker's worth of work (`cur_load / max_perf > 1.0`), so the autoscaler
  keeps an extra active worker hot per session. Slight over-provisioning
  in exchange for predictable scale-up. The demo client defaults to
  `--session-cost 200`.
 - **`max_queue_time = 0`** (or very small, e.g. `0.1`) — required.
  The per-worker `wait_time` property used internally to reject
  requests filters sessions out, but the **autoscaler** computes its
@@ -15,7 +15,12 @@ logging.basicConfig(
 log = logging.getLogger(__file__)
 ENDPOINT_NAME = "null-prod"
-SESSION_COST = 100
+# Default cost passed to /session/create. Bumping this above the worker's
 # max_perf (100) is how you tell the autoscaler "each session is more than
 # one worker's worth of work" — it keeps an extra active worker warm and ready, so
 # the next /session/create lands on a free worker instead of queueing.
 # See README "Endpoint scaling parameters" for the math.
 DEFAULT_SESSION_COST = 200
 async def reserve(
@@ -23,6 +28,7 @@ async def reserve(
    *,
    endpoint_name: str,
    hold_for: float,
    session_cost: int,
    label: str = "session",
 ) -> None:
    """Open a session, hold the worker for `hold_for` seconds, close cleanly.
@@ -39,8 +45,11 @@ async def reserve(
    # don't make any keepalive requests so no extension happens.
    lifetime = hold_for + 60
    start = time.monotonic()
-    log.info("[%s] creating session (lifetime=%.0fs, hold=%.0fs)", label, lifetime, hold_for)
+    log.info(
-    async with await endpoint.session(cost=SESSION_COST, lifetime=lifetime) as s:
+        "[%s] creating session (cost=%d, lifetime=%.0fs, hold=%.0fs)",
        label, session_cost, lifetime, hold_for,
    )
    async with await endpoint.session(cost=session_cost, lifetime=lifetime) as s:
        log.info("[%s] session %s open", label, s.session_id)
        try:
            await asyncio.sleep(hold_for)
@@ -59,6 +68,7 @@ async def run_demo(
    endpoint_name: str,
    interval: float,
    plateau: float,
    session_cost: int,
 ) -> None:
    """Trapezoidal load: ramp up three sessions, plateau, then scale down.
@@ -84,6 +94,7 @@ async def run_demo(
                client,
                endpoint_name=endpoint_name,
                hold_for=hold,
                session_cost=session_cost,
                label=label,
            ),
            name=label,
@@ -147,6 +158,17 @@ def build_arg_parser() -> argparse.ArgumentParser:
            "up the third worker (default: 300)"
        ),
    )
    p.add_argument(
        "--session-cost",
        type=int,
        default=DEFAULT_SESSION_COST,
        help=(
            f"Cost reported to the autoscaler for each /session/create. "
            f"Setting this above the worker's max_perf (100) over-provisions "
            f"slightly, keeping an extra active worker warm so the next "
            f"session lands without queueing (default: {DEFAULT_SESSION_COST})"
        ),
    )
    return p
@@ -165,12 +187,14 @@ async def main_async():
                    endpoint_name=args.endpoint,
                    interval=args.interval,
                    plateau=args.plateau,
                    session_cost=args.session_cost,
                )
            else:
                await reserve(
                    client,
                    endpoint_name=args.endpoint,
                    hold_for=args.duration,
                    session_cost=args.session_cost,
                    label="reservation",
                )
    except KeyboardInterrupt: