Increase model wait time for vLLM

2025-12-03 12:38:52 -08:00
3 changed files with 36 additions and 59 deletions
@@ -34,30 +34,12 @@ uv pip install -r requirements.txt

 Several examples have been provided in the client to help you get started with your own implementation.

-First, set your API key as an environment variable:
+### Completions
+
+Call to `/v1/completions` with a JSON response

 ```bash
-export VAST_API_KEY=<your_api_key>
-```
-
-The `--model` and `--endpoint` flags are optional. If not provided, they default to `Qwen/Qwen3-8B` and `my-vllm-endpoint` respectively.
-
-### Chat Completion (streaming)
-
-Call to `/v1/chat/completions` with streaming response
-
-```bash
-python -m workers.openai.client --chat-stream --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
-```
-
-### Interactive Chat (streaming)
-
-Interactive session with calls to `/v1/chat/completions`.
-
-Type `clear` to clear the chat history or `quit` to exit.
-
-```bash
-python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --completion --model <MODEL_NAME>
 ```

 ### Chat Completion (json)
@@ -65,7 +47,15 @@ python -m workers.openai.client --interactive --endpoint <ENDPOINT_NAME> --model
 Call to `/v1/chat/completions` with json response

 ```bash
-python -m workers.openai.client --chat --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat --model <MODEL_NAME>
+```
+
+### Chat Completion (streaming)
+
+Call to `/v1/chat/completions` with a streaming response
+
+```bash
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --chat-stream --model <MODEL_NAME>
 ```

 ### Tool Use (json)
@@ -75,14 +65,16 @@ Call to `/v1/chat/completions` with tool and json response.
 This test defines a simple tool which will list the contents of the local pyworker directory.  The output is then analysed by the model.

 ```bash
-python -m workers.openai.client --tools --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --tools --model <MODEL_NAME>
 ```

-### Completions
+### Interactive Chat (streaming)

-Call to `/v1/completions` with json response
+Interactive session with calls to `/v1/chat/completions`.
+
+Type `clear` to clear the chat history or `quit` to exit.

 ```bash
-python -m workers.openai.client --completion --endpoint <ENDPOINT_NAME> --model <MODEL_NAME>
+python -m workers.openai.client -k <API_KEY> -e <ENDPOINT_NAME> --interactive --model <MODEL_NAME>
 ```

@@ -18,7 +18,7 @@ logging.basicConfig(
 log = logging.getLogger(__file__)

 # ---------------------- Prompts ----------------------
-COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
+COMPLETIONS_PROMPT = "the capital of USA is"
 CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
 TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
@@ -97,9 +97,9 @@ def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[


 # ---- OpenAI-compatible calls (non-streaming) ----
-async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]:

-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
@@ -113,9 +113,9 @@ async def call_completions(client: Serverless, *, model: str, prompt: str, endpo
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"])
    return resp["response"]

-async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]:
+async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:

-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
@@ -132,9 +132,9 @@ async def call_chat_completions(client: Serverless, *, model: str, messages: Lis
    return resp["response"]

 # ---- Streaming variants ----
-async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs):
+async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs):

-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
@@ -150,9 +150,9 @@ async def stream_completions(client: Serverless, *, model: str, prompt: str, end
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"], stream=True)
    return resp["response"]  # async generator

-async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs):
+async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs):

-    endpoint = await client.get_endpoint(name=endpoint_name)
+    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
@@ -174,10 +174,9 @@ async def stream_chat_completions(client: Serverless, *, model: str, messages: L
 class APIDemo:
    """Demo and testing functionality for the API client"""

-    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
+    def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None):
        self.client = client
        self.model = model
-        self.endpoint_name = endpoint_name
        self.tool_manager = tool_manager or ToolManager()

    # ----- Streaming handler -----
@@ -186,15 +185,10 @@ class APIDemo:
        reasoning_content = ""
        printed_reasoning = False
        printed_answer = False
-        finish_reason = None

        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})
-            
-            # Track finish reason
-            if choice.get("finish_reason"):
-                finish_reason = choice.get("finish_reason")

            # reasoning tokens
            rc = delta.get("reasoning_content")
@@ -225,8 +219,6 @@ class APIDemo:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if printed_answer:
                print(f"Response tokens: {len(full_response.split())}")
-            if finish_reason:
-                print(f"Finish reason: {finish_reason}")

        return full_response
    
@@ -239,7 +231,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
-            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -258,7 +249,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
-                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -271,7 +261,6 @@ class APIDemo:
                client=self.client,
                model=self.model, 
                messages=messages,
-                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
@@ -298,7 +287,6 @@ class APIDemo:
                client=self.client,
                model=self.model,
                messages=messages,
-                endpoint_name=self.endpoint_name,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10
@@ -324,7 +312,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
-            endpoint_name=self.endpoint_name,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
@@ -402,7 +389,6 @@ class APIDemo:
            client=self.client,
            model=self.model,
            messages=messages,
-            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
@@ -441,6 +427,7 @@ class APIDemo:
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
+        print(f"Using model: {self.model}")
        print("Type 'quit' to exit, 'clear' to clear history")
        print()

@@ -466,8 +453,7 @@ class APIDemo:
                stream = await stream_chat_completions(
                    client=self.client,
                    model=self.model, 
-                    messages=messages,
-                    endpoint_name=self.endpoint_name,
+                    messages=messages, 
                    max_tokens=MAX_TOKENS, 
                    temperature=0.7
                )
@@ -487,8 +473,8 @@ class APIDemo:
 # ---------------------- CLI ----------------------
 def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
-    p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
-    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
+    p.add_argument("--model", required=True, help="Model to use for requests (required)")
+    p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)")

    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
@@ -516,14 +502,12 @@ async def main_async():
        print("Please specify exactly one test mode")
        sys.exit(1)

-    print("=" * 60)
    print(f"Using model: {args.model}")
-    print(f"Using endpoint: {args.endpoint}")
-
+    print("=" * 60)

    try:
        async with Serverless() as client:
-            demo = APIDemo(client, args.model, args.endpoint, ToolManager())
+            demo = APIDemo(client, args.model, ToolManager())

            if args.completion:
                await demo.demo_completions()
@@ -35,6 +35,7 @@ backend = Backend(
    model_server_url=os.environ["MODEL_SERVER_URL"],
    model_log_file=os.environ["MODEL_LOG"],
    allow_parallel_requests=True,
+    max_wait_time=600.0,
    benchmark_handler=CompletionsHandler(benchmark_runs=3, benchmark_words=256),
    log_actions=[
        *[(LogAction.ModelLoaded, info_msg) for info_msg in MODEL_SERVER_START_LOG_MSG],