bump up version minor number

feat AUTO-695: add loaded_at attribute to AutoScalerData and Metrics classes
2025-11-14 18:07:17 -08:00 · 2025-11-14 17:07:06 -08:00
6 changed files with 67 additions and 154 deletions
@@ -146,6 +146,7 @@ class Metrics:
    def _set_mtoken(self, mtoken: str) -> None:
        self.mtoken = mtoken
    #######################################Private#######################################
    async def __send_delete_requests_and_reset(self):
@@ -280,7 +281,6 @@ class Metrics:
        if sent:
            # clear the one-shot loadtime only if we actually sent *this* value
            self.system_metrics.reset(expected=loadtime_snapshot)
            self.update_pending = False
            self.model_metrics.reset()
            self.last_metric_update = time.time()
@@ -3,17 +3,15 @@ import logging
 from typing import List
 import ssl
 from asyncio import run, gather
-import asyncio
+
 from lib.backend import Backend
 from lib.metrics import Metrics
 from aiohttp import web
 log = logging.getLogger(__file__)
 def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
    try:
    log.debug("getting certificate...")
    use_ssl = os.environ.get("USE_SSL", "false") == "true"
    if use_ssl is True:
@@ -40,21 +38,3 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
        await gather(site.start(), backend._start_tracking())
    run(main())
    except Exception as e:
        err_msg = f"PyWorker failed to launch: {e}"
        log.error(err_msg)
        async def beacon():
            metrics = Metrics()
            metrics._set_version(getattr(backend, "version", "0"))
            metrics._set_mtoken(getattr(backend, "mtoken", ""))
            try:
                while True:
                    metrics._model_errored(err_msg)
                    await metrics._Metrics__send_metrics_and_reset()
                    await asyncio.sleep(10)
            finally:
                await metrics.aclose()
        run(beacon())
@@ -41,14 +41,6 @@ echo_var DEBUG_LOG
 echo_var PYWORKER_LOG
 echo_var MODEL_LOG
 # if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines
 # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
 if [ -e "$MODEL_LOG" ]; then
    echo "Rotating model log at $MODEL_LOG to $MODEL_LOG.old"
    cat "$MODEL_LOG" >> "$MODEL_LOG.old" 
    : > "$MODEL_LOG"
 fi
 # Populate /etc/environment with quoted values
 if ! grep -q "VAST" /etc/environment; then
    env -0 | grep -zEv "^(HOME=|SHLVL=)|CONDA" | while IFS= read -r -d '' line; do
@@ -132,43 +124,9 @@ cd "$SERVER_DIR"
 echo "launching PyWorker server"
-set +e
+# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines
-python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG"
+# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
-PY_STATUS=${PIPESTATUS[0]}
+[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"
 set -e
 if [ "${PY_STATUS}" -ne 0 ]; then
  echo "PyWorker exited with status ${PY_STATUS}; notifying autoscaler..."
  ERROR_MSG="PyWorker exited: code ${PY_STATUS}"
  MTOKEN="${MASTER_TOKEN:-}"
  VERSION="${PYWORKER_VERSION:-0}"
  IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}"
  for addr in "${REPORT_ADDRS[@]}"; do
    curl -sS -X POST -H 'Content-Type: application/json' \
      -d "$(cat <<JSON
 {
  "id": ${CONTAINER_ID:-0},
  "mtoken": "${MTOKEN}",
  "version": "${VERSION}",
  "loadtime": 0,
  "new_load": 0,
  "cur_load": 0,
  "rej_load": 0,
  "max_perf": 0,
  "cur_perf": 0,
  "error_msg": "${ERROR_MSG}",
  "num_requests_working": 0,
  "num_requests_recieved": 0,
  "additional_disk_usage": 0,
  "working_request_idxs": [],
  "cur_capacity": 0,
  "max_capacity": 0,
  "url": "${URL}"
 }
 JSON
 )" "${addr%/}/worker_status/" || true
  done
 fi
 (python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") &
 echo "launching PyWorker server done"
@@ -34,30 +34,12 @@ uv pip install -r requirements.txt
 Several examples have been provided in the client to help you get started with your own implementation.
-First, set your API key as an environment variable:
+### Completions
 Call to `/v1/completions` with json response
 ```bash
-export VAST_API_KEY=<your_api_key>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --completion --model <MODEL_NAME>
 ```
 The `--model` and `--endpoint` flags are optional. If not provided, they default to `Qwen/Qwen3-8B` and `my-vllm-endpoint` respectively.
 ### Chat Completion (streaming)
 Call to `/v1/chat/completions` with streaming response
 ```bash
 python -m workers.openai.client --chat-stream --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Interactive Chat (streaming)
 Interactive session with calls to `/v1/chat/completions`.
 Type `clear` to clear the chat history or `quit` to exit.
 ```bash
 python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Chat Completion (json)
@@ -65,7 +47,15 @@ python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model
 Call to `/v1/chat/completions` with json response
 ```bash
-python -m workers.openai.client --chat --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat --model <MODEL_NAME>
 ```
 ### Chat Completion (streaming)
 Call to `/v1/chat/completions` with streaming response
 ```bash
 python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat-stream --model <MODEL_NAME>
 ```
 ### Tool Use (json)
@@ -75,14 +65,16 @@ Call to `/v1/chat/completions` with tool and json response.
 This test defines a simple tool which will list the contents of the local pyworker directory.  The output is then analysed by the model.
 ```bash
-python -m workers.openai.client --tools --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --tools --model <MODEL_NAME>
 ```
-### Completions
+### Interactive Chat (streaming)
-Call to `/v1/completions` with json response
+Interactive session with calls to `/v1/chat/completions`.
 Type `clear` to clear the chat history or `quit` to exit.
 ```bash
-python -m workers.openai.client --completion --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --interactive --model <MODEL_NAME>
 ```
@@ -18,7 +18,7 @@ logging.basicConfig(
 log = logging.getLogger(__file__)
 # ---------------------- Prompts ----------------------
-COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
+COMPLETIONS_PROMPT = "the capital of USA is"
 CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
 TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
@@ -97,9 +97,9 @@ def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[
 # ---- OpenAI-compatible calls (non-streaming) ----
-async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -113,9 +113,9 @@ async def call_completions(client: Serverless, *, model: str, prompt: str, endpo
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"])
    return resp["response"]
-async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -132,9 +132,9 @@ async def call_chat_completions(client: Serverless, *, model: str, messages: Lis
    return resp["response"]
 # ---- Streaming variants ----
-async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs):
+async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs):
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -150,9 +150,9 @@ async def stream_completions(client: Serverless, *, model: str, prompt: str, end
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"], stream=True)
    return resp["response"]  # async generator
-async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs):
+async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs):
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -174,10 +174,9 @@ async def stream_chat_completions(client: Serverless, *, model: str, messages: L
 class APIDemo:
    """Demo and testing functionality for the API client"""
-    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
+    def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None):
        self.client = client
        self.model = model
        self.endpoint_name = endpoint_name
        self.tool_manager = tool_manager or ToolManager()
    # ----- Streaming handler -----
@@ -186,16 +185,11 @@ class APIDemo:
        reasoning_content = ""
        printed_reasoning = False
        printed_answer = False
        finish_reason = None
        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})
            # Track finish reason
            if choice.get("finish_reason"):
                finish_reason = choice.get("finish_reason")
            # reasoning tokens
            rc = delta.get("reasoning_content")
            if rc and show_reasoning:
@@ -225,8 +219,6 @@ class APIDemo:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if printed_answer:
                print(f"Response tokens: {len(full_response.split())}")
            if finish_reason:
                print(f"Finish reason: {finish_reason}")
        return full_response
@@ -239,7 +231,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -258,7 +249,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -271,7 +261,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -298,7 +287,6 @@ class APIDemo:
                client=self.client,
                model=self.model,
                messages=messages,
                endpoint_name=self.endpoint_name,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10
@@ -324,7 +312,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
@@ -402,7 +389,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -441,6 +427,7 @@ class APIDemo:
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
        print(f"Using model: {self.model}")
        print("Type 'quit' to exit, 'clear' to clear history")
        print()
@@ -467,7 +454,6 @@ class APIDemo:
                    client=self.client,
                    model=self.model, 
                    messages=messages, 
                    endpoint_name=self.endpoint_name,
                    max_tokens=MAX_TOKENS, 
                    temperature=0.7
                )
@@ -487,8 +473,8 @@ class APIDemo:
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
-    p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
+    p.add_argument("--model", required=True, help="Model to use for requests (required)")
-    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
+    p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)")
    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
@@ -516,14 +502,12 @@ async def main_async():
        print("Please specify exactly one test mode")
        sys.exit(1)
    print("=" * 60)
    print(f"Using model: {args.model}")
-    print(f"Using endpoint: {args.endpoint}")
+    print("=" * 60)
    try:
        async with Serverless() as client:
-            demo = APIDemo(client, args.model, args.endpoint, ToolManager())
+            demo = APIDemo(client, args.model, ToolManager())
            if args.completion:
                await demo.demo_completions()
@@ -11,7 +11,6 @@ MODEL_SERVER_START_LOG_MSG = [
    "llama runner started",  # Ollama
    '"message":"Connected","target":"text_generation_router"',  # TGI
    '"message":"Connected","target":"text_generation_router::server"',  # TGI
    "main: model loaded" # llama.cpp
 ]
 MODEL_SERVER_ERROR_LOG_MSGS = [
Author	SHA1	Message	Date
Abiola Akinnubi	74efc2cb42	bump up version minor number	2025-11-14 18:07:17 -08:00
Abiola Akinnubi	db3096bbaf	feat AUTO-695: add loaded_at attribute to AutoScalerData and Metrics classes	2025-11-14 17:07:06 -08:00