bump up version minor number

feat AUTO-695: add loaded_at attribute to AutoScalerData and Metrics classes
2025-11-14 18:07:17 -08:00 · 2025-11-14 17:07:06 -08:00
8 changed files with 112 additions and 444 deletions
@@ -146,6 +146,7 @@ class Metrics:
    def _set_mtoken(self, mtoken: str) -> None:
        """Store ``mtoken`` on this instance as ``self.mtoken``."""
        self.mtoken = mtoken
    #######################################Private#######################################
    async def __send_delete_requests_and_reset(self):
@@ -216,7 +217,7 @@ class Metrics:
                id=self.id,
                mtoken=self.mtoken,
                version=self.version,
-                loadtime=(loadtime_snapshot or 0.0), 
+                loadtime=(loadtime_snapshot or 0.0),
                new_load=self.model_metrics.workload_processing,
                cur_load=self.model_metrics.cur_load,
                rej_load=self.model_metrics.workload_rejected,
@@ -280,7 +281,6 @@ class Metrics:
        if sent:
            # clear the one-shot loadtime only if we actually sent *this* value
            self.system_metrics.reset(expected=loadtime_snapshot)
            self.update_pending = False
            self.model_metrics.reset()
            self.last_metric_update = time.time()
@@ -3,58 +3,38 @@ import logging
 from typing import List
 import ssl
 from asyncio import run, gather
-import asyncio
+
 from lib.backend import Backend
 from lib.metrics import Metrics
 from aiohttp import web
 log = logging.getLogger(__file__)
 def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
-    try:
+    log.debug("getting certificate...")
-        log.debug("getting certificate...")
+    use_ssl = os.environ.get("USE_SSL", "false") == "true"
-        use_ssl = os.environ.get("USE_SSL", "false") == "true"
+    if use_ssl is True:
-        if use_ssl is True:
+        ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
-            ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        ssl_context.load_cert_chain(
-            ssl_context.load_cert_chain(
+            certfile="/etc/instance.crt",
-                certfile="/etc/instance.crt",
+            keyfile="/etc/instance.key",
-                keyfile="/etc/instance.key",
+        )
-            )
+    else:
-        else:
+        ssl_context = None
            ssl_context = None
-        async def main():
+    async def main():
-            log.debug("starting server...")
+        log.debug("starting server...")
-            app = web.Application()
+        app = web.Application()
-            app.add_routes(routes)
+        app.add_routes(routes)
-            runner = web.AppRunner(app)
+        runner = web.AppRunner(app)
-            await runner.setup()
+        await runner.setup()
-            site = web.TCPSite(
+        site = web.TCPSite(
-                runner,
+            runner,
-                ssl_context=ssl_context,
+            ssl_context=ssl_context,
-                port=int(os.environ["WORKER_PORT"]),
+            port=int(os.environ["WORKER_PORT"]),
-                **kwargs
+            **kwargs
-            )
+        )
-            await gather(site.start(), backend._start_tracking())
+        await gather(site.start(), backend._start_tracking())
-        run(main())
+    run(main())
    except Exception as e:
        err_msg = f"PyWorker failed to launch: {e}"
        log.error(err_msg)
        async def beacon():
            metrics = Metrics()
            metrics._set_version(getattr(backend, "version", "0"))
            metrics._set_mtoken(getattr(backend, "mtoken", ""))
            try:
                while True:
                    metrics._model_errored(err_msg)
                    await metrics._Metrics__send_metrics_and_reset()
                    await asyncio.sleep(10)
            finally:
                await metrics.aclose()
        run(beacon())
@@ -41,14 +41,6 @@ echo_var DEBUG_LOG
 echo_var PYWORKER_LOG
 echo_var MODEL_LOG
 # if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines
 # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
 if [ -e "$MODEL_LOG" ]; then
    echo "Rotating model log at $MODEL_LOG to $MODEL_LOG.old"
    cat "$MODEL_LOG" >> "$MODEL_LOG.old" 
    : > "$MODEL_LOG"
 fi
 # Populate /etc/environment with quoted values
 if ! grep -q "VAST" /etc/environment; then
    env -0 | grep -zEv "^(HOME=|SHLVL=)|CONDA" | while IFS= read -r -d '' line; do
@@ -132,43 +124,9 @@ cd "$SERVER_DIR"
 echo "launching PyWorker server"
-set +e
+# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines
-python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG"
+# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
-PY_STATUS=${PIPESTATUS[0]}
+[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"
 set -e
-if [ "${PY_STATUS}" -ne 0 ]; then
+(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") &
-  echo "PyWorker exited with status ${PY_STATUS}; notifying autoscaler..."
+echo "launching PyWorker server done"
  ERROR_MSG="PyWorker exited: code ${PY_STATUS}"
  MTOKEN="${MASTER_TOKEN:-}"
  VERSION="${PYWORKER_VERSION:-0}"
  IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}"
  for addr in "${REPORT_ADDRS[@]}"; do
    curl -sS -X POST -H 'Content-Type: application/json' \
      -d "$(cat <<JSON
 {
  "id": ${CONTAINER_ID:-0},
  "mtoken": "${MTOKEN}",
  "version": "${VERSION}",
  "loadtime": 0,
  "new_load": 0,
  "cur_load": 0,
  "rej_load": 0,
  "max_perf": 0,
  "cur_perf": 0,
  "error_msg": "${ERROR_MSG}",
  "num_requests_working": 0,
  "num_requests_recieved": 0,
  "additional_disk_usage": 0,
  "working_request_idxs": [],
  "cur_capacity": 0,
  "max_capacity": 0,
  "url": "${URL}"
 }
 JSON
 )" "${addr%/}/worker_status/" || true
  done
 fi
 echo "launching PyWorker server done"
@@ -8,13 +8,14 @@ This is the base PyWorker for OpenAI compatible inference servers.  See the [Ser
 This worker is compatible with any backend API that properly implements the `/v1/completions` and `/v1/chat/completions` endpoints.  We currently have three templates you can choose from, but you can also create your own without having to modify the PyWorker.
- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20(Serverless)) (recommended)
+- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20%2B%20Qwen%2FQwen3-8B%20(Serverless)) (recommended)
 - [Ollama](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=Ollama%20%2B%20Qwen3%3A32b%20(Serverless))
 - [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20%2B%20Qwen3-8B%20(Serverless))
 All of these templates can be configured via the template interface.  You may want to change the model or startup arguments, depending on the template you selected.
-2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup.  For testing, we recommend that you use the default options presented by the web interface.
+2. Follow the [getting started guide](https://docs.vast.ai/serverless/getting-started) for help with configuring your serverless setup.  For testing, we recommend that you use the default options presented by the web interface.
 ## Client Setup (Demo)
@@ -33,30 +34,12 @@ uv pip install -r requirements.txt
 Several examples have been provided in the client to help you get started with your own implementation.
-First, set your API key as an environment variable:
+### Completions
 Call to `/v1/completions` with json response
 ```bash
-export VAST_API_KEY=<your_api_key>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --completion --model <MODEL_NAME>
 ```
 The `--model` and `--endpoint` flags are optional. If not provided, they default to `Qwen/Qwen3-8B` and `my-vllm-endpoint` respectively.
 ### Chat Completion (streaming)
 Call to `/v1/chat/completions` with streaming response
 ```bash
 python -m workers.openai.client --chat-stream --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Interactive Chat (streaming)
 Interactive session with calls to `/v1/chat/completions`.
 Type `clear` to clear the chat history or `quit` to exit.
 ```bash
 python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
 ```
 ### Chat Completion (json)
@@ -64,7 +47,15 @@ python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model
 Call to `/v1/chat/completions` with json response
 ```bash
-python -m workers.openai.client --chat --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat --model <MODEL_NAME>
 ```
 ### Chat Completion (streaming)
 Call to `/v1/chat/completions` with streaming response
 ```bash
 python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat-stream --model <MODEL_NAME>
 ```
 ### Tool Use (json)
@@ -74,14 +65,16 @@ Call to `/v1/chat/completions` with tool and json response.
 This test defines a simple tool which will list the contents of the local pyworker directory.  The output is then analysed by the model.
 ```bash
-python -m workers.openai.client --tools --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --tools --model <MODEL_NAME>
 ```
-### Completions
+### Interactive Chat (streaming)
-Call to `/v1/completions` with json response
+Interactive session with calls to `/v1/chat/completions`.
 Type `clear` to clear the chat history or `quit` to exit.
 ```bash
-python -m workers.openai.client --completion --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --interactive --model <MODEL_NAME>
 ```
@@ -18,7 +18,7 @@ logging.basicConfig(
 log = logging.getLogger(__file__)
 # ---------------------- Prompts ----------------------
-COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
+COMPLETIONS_PROMPT = "the capital of USA is"
 CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
 TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
@@ -97,9 +97,9 @@ def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[
 # ---- OpenAI-compatible calls (non-streaming) ----
-async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -113,9 +113,9 @@ async def call_completions(client: Serverless, *, model: str, prompt: str, endpo
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"])
    return resp["response"]
-async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -132,9 +132,9 @@ async def call_chat_completions(client: Serverless, *, model: str, messages: Lis
    return resp["response"]
 # ---- Streaming variants ----
-async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs):
+async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs):
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -150,9 +150,9 @@ async def stream_completions(client: Serverless, *, model: str, prompt: str, end
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"], stream=True)
    return resp["response"]  # async generator
-async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs):
+async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs):
-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    payload = {
        "input": {
@@ -174,10 +174,9 @@ async def stream_chat_completions(client: Serverless, *, model: str, messages: L
 class APIDemo:
    """Demo and testing functionality for the API client"""
-    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
+    def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None):
        self.client = client
        self.model = model
        self.endpoint_name = endpoint_name
        self.tool_manager = tool_manager or ToolManager()
    # ----- Streaming handler -----
@@ -186,15 +185,10 @@ class APIDemo:
        reasoning_content = ""
        printed_reasoning = False
        printed_answer = False
        finish_reason = None
        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})
            # Track finish reason
            if choice.get("finish_reason"):
                finish_reason = choice.get("finish_reason")
            # reasoning tokens
            rc = delta.get("reasoning_content")
@@ -225,8 +219,6 @@ class APIDemo:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if printed_answer:
                print(f"Response tokens: {len(full_response.split())}")
            if finish_reason:
                print(f"Finish reason: {finish_reason}")
        return full_response
@@ -239,7 +231,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -258,7 +249,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -271,7 +261,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -298,7 +287,6 @@ class APIDemo:
                client=self.client,
                model=self.model,
                messages=messages,
                endpoint_name=self.endpoint_name,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10
@@ -324,7 +312,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
@@ -402,7 +389,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -441,6 +427,7 @@ class APIDemo:
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
        print(f"Using model: {self.model}")
        print("Type 'quit' to exit, 'clear' to clear history")
        print()
@@ -466,8 +453,7 @@ class APIDemo:
                stream = await stream_chat_completions(
                    client=self.client,
                    model=self.model, 
-                    messages=messages,
+                    messages=messages, 
                    endpoint_name=self.endpoint_name,
                    max_tokens=MAX_TOKENS, 
                    temperature=0.7
                )
@@ -487,8 +473,8 @@ class APIDemo:
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
-    p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
+    p.add_argument("--model", required=True, help="Model to use for requests (required)")
-    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
+    p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)")
    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
@@ -516,14 +502,12 @@ async def main_async():
        print("Please specify exactly one test mode")
        sys.exit(1)
    print("=" * 60)
    print(f"Using model: {args.model}")
-    print(f"Using endpoint: {args.endpoint}")
+    print("=" * 60)
    try:
        async with Serverless() as client:
-            demo = APIDemo(client, args.model, args.endpoint, ToolManager())
+            demo = APIDemo(client, args.model, ToolManager())
            if args.completion:
                await demo.demo_completions()
@@ -11,7 +11,6 @@ MODEL_SERVER_START_LOG_MSG = [
    "llama runner started",  # Ollama
    '"message":"Connected","target":"text_generation_router"',  # TGI
    '"message":"Connected","target":"text_generation_router::server"',  # TGI
    "main: model loaded" # llama.cpp
 ]
 MODEL_SERVER_ERROR_LOG_MSGS = [
@@ -35,7 +34,6 @@ backend = Backend(
    model_server_url=os.environ["MODEL_SERVER_URL"],
    model_log_file=os.environ["MODEL_LOG"],
    allow_parallel_requests=True,
    max_wait_time=600.0,
    benchmark_handler=CompletionsHandler(benchmark_runs=3, benchmark_words=256),
    log_actions=[
        *[(LogAction.ModelLoaded, info_msg) for info_msg in MODEL_SERVER_START_LOG_MSG],
@@ -1,103 +1,19 @@
-# HuggingFace TGI PyWorker
+This is the base PyWorker for TGI, designed to create PyWorkers that can utilize various LLMs. It offers two primary endpoints:
-This is the base PyWorker for HuggingFace Text Generation Inference (TGI) servers. See the [Serverless documentation](https://docs.vast.ai/serverless) for guides and how-to's.
+1. `generate`: Generates the LLM's response to a given prompt in a single request.
 2. `generate_stream`: Streams the LLM's response token by token.
-## Instance Setup
+Both endpoints use the following API payload format:
 1. Pick a template
 This worker is compatible with any TGI backend. We have a template you can use or you can create your own.
 - [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20(Serverless))
 The template can be configured via the template interface. You may want to change the model or startup arguments.
 2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface.
 ## Client Setup (Demo)
 1. Clone the PyWorker repository to your local machine and install the necessary requirements for running the test client.
 ```bash
 git clone https://github.com/vast-ai/pyworker
 cd pyworker
 pip install uv
 uv venv -p 3.12
 source .venv/bin/activate
 uv pip install -r requirements.txt
 ```
 ## Using the Test Client
 The test client demonstrates both streaming and non-streaming generation using TGI's native API.
 First, set your API key as an environment variable:
 ```bash
 export VAST_API_KEY=<your_api_key>
 ```
 The `--endpoint` flag is optional. If not provided, it defaults to `my-tgi-endpoint`.
 ### Generate (Streaming)
 Call to `/generate_stream` with streaming response:
 ```bash
 python -m workers.tgi.client --generate-stream --endpoint <ENDPOINT_NAME>
 ```
 ### Generate (Non-Streaming)
 Call to `/generate` with json response:
 ```bash
 python -m workers.tgi.client --generate --endpoint <ENDPOINT_NAME>
 ```
 ### Interactive Session (Streaming)
 Interactive session with streaming responses. Type `quit` to exit.
 ```bash
 python -m workers.tgi.client --interactive --endpoint <ENDPOINT_NAME>
 ```
 ## API Endpoints
 TGI provides two primary endpoints:
 ### Generate (Non-Streaming)
 `/generate` - Returns the complete response in a single request.
 ```json
 {
-  "inputs": "Your prompt here",
+  "inputs": "PROMPT",
  "parameters": {
-    "max_new_tokens": 1024,
+    "max_new_tokens": 250
    "temperature": 0.7,
    "return_full_text": false
  }
 }
 ```
-### Generate Stream (Streaming)
+Note that the max_new_tokens parameter, rather than the prompt size, impacts performance. For example, if an
-
+instance is benchmarked to process 100 tokens per second, a request with max_new_tokens = 200 will take
-`/generate_stream` - Streams the response token by token.
+approximately 2 seconds to complete.
 ```json
 {
  "inputs": "Your prompt here",
  "parameters": {
    "max_new_tokens": 1024,
    "temperature": 0.7,
    "do_sample": true,
    "return_full_text": false
  }
 }
 ```
 ## Performance Notes
 The `max_new_tokens` parameter (not the prompt size) primarily impacts performance. For example, if an instance is benchmarked to process 100 tokens per second, a request with `max_new_tokens = 200` will take approximately 2 seconds to complete.
@@ -1,222 +1,61 @@
 import logging
 import json
 import os
 import sys
 import argparse
 from vastai import Serverless
 import asyncio
-# ---------------------- Logging ----------------------
+ENDPOINT_NAME = "my-tgi-endpoint" # Change this to match your endpoint name
 logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s[%(levelname)-5s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
 )
 log = logging.getLogger(__file__)
 # ---------------------- Defaults ----------------------
 DEFAULT_PROMPT = "Think step by step: Tell me about the Python programming language."
 ENDPOINT_NAME = "TGI-Prod2"       # change this to your TGI endpoint name
 MAX_TOKENS = 1024
-DEFAULT_TEMPERATURE = 0.7
+PROMPT = "Think step by step: Tell me about the Python programming language."
-
+async def call_generate(client: Serverless) -> None:
-# ---------------------- API Calls ----------------------
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
 async def call_generate(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs) -> dict:
    """Non-streaming generation via /generate endpoint"""
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
-        "inputs": prompt,
+        "inputs": PROMPT,
        "parameters": {
-            "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS),
+            "max_new_tokens": MAX_TOKENS,
-            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
+            "temperature": 0.7,
-            "return_full_text": False,
+            "return_full_text": False
        }
    }
-    log.debug("POST /generate %s", json.dumps(payload)[:500])
+
-    resp = await endpoint.request("/generate", payload, cost=payload["parameters"]["max_new_tokens"])
+    resp = await endpoint.request("/generate", payload, cost=MAX_TOKENS)
-    return resp["response"]
+
    print(resp["response"]["generated_text"])
-async def call_generate_stream(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs):
+async def call_generate_stream(client: Serverless) -> None:
-    """Streaming generation via /generate_stream endpoint"""
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)
    endpoint = await client.get_endpoint(name=endpoint_name)
    payload = {
-        "inputs": prompt,
+        "inputs": PROMPT,
        "parameters": {
-            "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS),
+            "max_new_tokens": MAX_TOKENS,
-            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
+            "temperature": 0.7,
            "do_sample": True,
            "return_full_text": False,
        }
    }
-    log.debug("STREAM /generate_stream %s", json.dumps(payload)[:500])
+
    resp = await endpoint.request(
        "/generate_stream",
        payload,
-        cost=payload["parameters"]["max_new_tokens"],
+        cost=MAX_TOKENS,
        stream=True,
    )
-    return resp["response"]  # async generator
+    stream = resp["response"]
    printed_answer = False
    async for event in stream:
        tok = (event.get("token") or {}).get("text")
        if tok:
            if not printed_answer:
                printed_answer = True
                print("Answer:\n", end="", flush=True)
            print(tok, end="", flush=True)
-# ---------------------- Demo Runner ----------------------
+async def main():
-class APIDemo:
+    async with Serverless() as client:
-    """Demo and testing functionality for the TGI API client"""
+        await call_generate(client)
-
+        await call_generate_stream(client)
    def __init__(self, client: Serverless, endpoint_name: str) -> None:
        """Keep the client and endpoint name on the instance for later use."""
        self.client = client
        self.endpoint_name = endpoint_name
    async def handle_streaming_response(self, stream) -> str:
        """Print streamed token text as it arrives and return it joined.

        Args:
            stream: Async iterable of event mappings. The text is read from
                ``event["token"]["text"]``; events with a missing or falsy
                ``token`` or ``text`` are ignored.

        Returns:
            The concatenation of every token text that was printed, or an
            empty string when the stream produced none.
        """
        # Collect pieces and join once instead of growing a string with +=.
        pieces = []
        async for event in stream:
            text = (event.get("token") or {}).get("text")
            if not text:
                continue
            if not pieces:
                # Header is printed once, right before the first token.
                print("\n💬 Response: ", end="", flush=True)
            print(text, end="", flush=True)
            pieces.append(text)
        print()  # newline
        full_response = "".join(pieces)
        if pieces:
            print(f"\nStreaming completed. Response tokens: {len(full_response.split())}")
        return full_response
    async def demo_generate(self) -> None:
        """Run one non-streaming generation and print the outcome.

        Sends ``DEFAULT_PROMPT`` through ``call_generate`` with the stored
        client and endpoint name, then prints the ``generated_text`` field
        (empty string when absent) followed by the whole response as
        indented JSON.
        """
        rule = "=" * 60
        print(rule)
        print("GENERATE DEMO (NON-STREAMING)")
        print(rule)

        request = {
            "client": self.client,
            "endpoint_name": self.endpoint_name,
            "prompt": DEFAULT_PROMPT,
            "max_tokens": MAX_TOKENS,
            "temperature": DEFAULT_TEMPERATURE,
        }
        response = await call_generate(**request)

        generated_text = response.get('generated_text', '')
        print(f"\n💬 Response: {generated_text}")
        pretty = json.dumps(response, indent=2)
        print(f"\nFull Response:\n{pretty}")
    async def demo_generate_stream(self) -> None:
        """Run one streaming generation and print tokens as they arrive.

        Opens a stream for ``DEFAULT_PROMPT`` via ``call_generate_stream`` and
        hands it to ``handle_streaming_response``. Errors raised while the
        stream is being consumed are logged with a traceback, not propagated.
        """
        rule = "=" * 60
        print(rule)
        print("GENERATE DEMO (STREAMING)")
        print(rule)

        request = {
            "client": self.client,
            "endpoint_name": self.endpoint_name,
            "prompt": DEFAULT_PROMPT,
            "max_tokens": MAX_TOKENS,
            "temperature": DEFAULT_TEMPERATURE,
        }
        # Opening the stream happens outside the try on purpose: only
        # failures during consumption are swallowed and logged.
        stream = await call_generate_stream(**request)

        try:
            await self.handle_streaming_response(stream)
        except Exception as exc:
            log.error("\nError during streaming: %s", exc, exc_info=True)
    async def interactive_chat(self) -> None:
        """Run a prompt/streamed-answer loop on stdin until the user leaves.

        Each non-empty line is sent through ``call_generate_stream`` and the
        token texts are printed as they arrive. Typing ``quit`` (any case)
        ends the session, as does Ctrl-C or end-of-input on stdin. Any other
        error is logged and the loop moves on to the next prompt.
        """
        print("=" * 60)
        print("INTERACTIVE STREAMING SESSION")
        print("=" * 60)
        print(f"Using endpoint: {self.endpoint_name}")
        print("Type 'quit' to exit")
        print()

        while True:
            try:
                user_input = input("You: ").strip()
                if user_input.lower() == "quit":
                    print("👋 Goodbye!")
                    break
                if not user_input:
                    continue

                print("Assistant: ", end="", flush=True)
                stream = await call_generate_stream(
                    client=self.client,
                    endpoint_name=self.endpoint_name,
                    prompt=user_input,
                    max_tokens=MAX_TOKENS,
                    temperature=DEFAULT_TEMPERATURE,
                )
                async for event in stream:
                    tok = (event.get("token") or {}).get("text")
                    if tok:
                        print(tok, end="", flush=True)
                print()  # newline
            except (KeyboardInterrupt, EOFError):
                # input() raises EOFError once stdin is exhausted (piped input,
                # Ctrl-D). Letting the generic handler below catch it would
                # log and retry forever, so treat it like an interrupt.
                print("\n👋 Session interrupted. Goodbye!")
                break
            except Exception as e:
                log.error("\nError: %s", e)
                continue
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the TGI demo client.

    The parser accepts ``--endpoint`` (defaulting to ``ENDPOINT_NAME``) and
    three mutually exclusive mode switches, none of which is required by
    the parser itself.
    """
    parser = argparse.ArgumentParser(description="Vast TGI Demo (Serverless SDK)")
    parser.add_argument(
        "--endpoint",
        default=ENDPOINT_NAME,
        help=f"Vast endpoint name (default: {ENDPOINT_NAME})",
    )

    mode_flags = (
        ("--generate", "Test generate endpoint (non-streaming)"),
        ("--generate-stream", "Test generate endpoint with streaming"),
        ("--interactive", "Start interactive streaming session"),
    )
    mode_group = parser.add_mutually_exclusive_group(required=False)
    for flag, help_text in mode_flags:
        mode_group.add_argument(flag, action="store_true", help=help_text)

    return parser
 async def main_async():
    """Parse the command line and run the single selected demo mode.

    Exits with status 1 when no mode or more than one mode is selected, and
    also when the chosen demo raises (the error is logged with a traceback).
    """
    args = build_arg_parser().parse_args()

    mode_count = sum((args.generate, args.generate_stream, args.interactive))
    if mode_count > 1:
        print("Please specify exactly one test mode")
        sys.exit(1)
    if mode_count == 0:
        usage_lines = (
            "Please specify exactly one test mode:",
            "  --generate        : Test generate endpoint (non-streaming)",
            "  --generate-stream : Test generate endpoint with streaming",
            "  --interactive     : Start interactive streaming session",
            f"\nExample: python {os.path.basename(sys.argv[0])} --generate-stream --endpoint my-tgi-endpoint",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    print("=" * 60)
    print(f"Using endpoint: {args.endpoint}")

    try:
        async with Serverless() as client:
            demo = APIDemo(client, args.endpoint)
            # Exactly one flag is set at this point, so the final branch
            # can only be the interactive mode.
            if args.generate:
                run_mode = demo.demo_generate
            elif args.generate_stream:
                run_mode = demo.demo_generate_stream
            else:
                run_mode = demo.interactive_chat
            await run_mode()
    except Exception as exc:
        log.error("Error during test: %s", exc, exc_info=True)
        sys.exit(1)
 if __name__ == "__main__":
-    asyncio.run(main_async())
+    asyncio.run(main())
Author	SHA1	Message	Date
Abiola Akinnubi	74efc2cb42	bump up version minor number	2025-11-14 18:07:17 -08:00
Abiola Akinnubi	db3096bbaf	feat AUTO-695: add loaded_at attribute to AutoScalerData and Metrics classes	2025-11-14 17:07:06 -08:00