# workers/openai/client.py

import logging
import json
import os
import sys
import subprocess
import argparse
from typing import Any, Dict, List, Optional

from vastai import Serverless
import asyncio

# ---------------------- Logging ----------------------
# Configure the root logger at import time: DEBUG level, timestamped records
# formatted as "<date time>[LEVEL] message".
# NOTE(review): DEBUG on the root logger presumably also surfaces debug output
# from third-party libraries that propagate to it — confirm this is intended.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s[%(levelname)-5s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module logger; it is named after this file's path (__file__) rather than
# the more common __name__. The format above does not print the logger name.
log = logging.getLogger(__file__)

# ---------------------- Prompts ----------------------
# Text sent as the "prompt" of the /v1/completions demo.
COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
# Single user message used by the chat demos (streaming and non-streaming).
CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
# User message for the tool-use demo; it asks for a directory listing so the
# model has a reason to call the list_files tool.
TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
    "What do you think this directory might be for?"
)

# Defaults for the --endpoint and --model CLI flags.
ENDPOINT_NAME = "my-vllm-endpoint"       # change this to your vLLM endpoint name
DEFAULT_MODEL = "Qwen/Qwen3-8B"          # must support tool calling
# Default "max_tokens" for requests; the effective max_tokens value is also
# passed as the request's `cost` argument by the call_*/stream_* helpers.
MAX_TOKENS = 1024
# Default sampling temperature for requests.
DEFAULT_TEMPERATURE = 0.7

# ---------------------- Tooling ----------------------
class ToolManager:
    """Tool schemas and local tool execution for the function-calling demo."""

    @staticmethod
    def list_files() -> str:
        """Run ``ls -la .`` and return what it printed.

        Returns:
            The command's stdout when it exits with code 0, otherwise a
            string starting with ``"Error: "`` that carries its stderr.
            Any exception raised while running the command (including the
            10 second timeout) is reported as an ``"Error running ls: ..."``
            string instead of being raised.
        """
        command = ["ls", "-la", "."]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, timeout=10)
        except Exception as e:
            return f"Error running ls: {e}"

        if completed.returncode != 0:
            return f"Error: {completed.stderr}"
        return completed.stdout

    @staticmethod
    def get_ls_tool_definition() -> List[Dict[str, Any]]:
        """Return the OpenAI-compatible ``tools`` list describing ``list_files``."""
        # The tool takes no arguments, so the parameter schema is an empty object.
        empty_object_schema = {"type": "object", "properties": {}, "required": []}
        function_spec = {
            "name": "list_files",
            "description": "List files and directories in the cwd",
            "parameters": empty_object_schema,
        }
        return [{"type": "function", "function": function_spec}]

    def execute_tool_call(self, tool_call: Dict[str, Any]) -> str:
        """Run the tool named by ``tool_call["function"]["name"]``.

        Args:
            tool_call: A tool-call dict; a missing or ``None`` ``"function"``
                entry is treated as having no name.

        Returns:
            The tool's string result.

        Raises:
            ValueError: If the requested name is not ``"list_files"``.
        """
        function_spec = tool_call.get("function") or {}
        requested = function_spec.get("name")
        if requested != "list_files":
            raise ValueError(f"Unknown tool function: {requested}")
        return self.list_files()


# ----- Helpers to handle streamed tool_calls assembly -----
def _merge_tool_call_delta(state: Dict[int, Dict[str, Any]], tc_delta: Dict[str, Any]) -> None:
    """
    OpenAI-style streaming sends partial tool_calls with an index and partial fields.
    We merge into a per-index state dict until the assistant message finishes.
    """
    idx = tc_delta.get("index")
    if idx is None:
        return

    entry = state.setdefault(idx, {"id": None, "function": {"name": None, "arguments": ""}, "type": "function"})

    if tc_delta.get("id"):
        entry["id"] = tc_delta["id"]

    fn_delta = tc_delta.get("function") or {}
    if "name" in fn_delta and fn_delta["name"]:
        entry["function"]["name"] = fn_delta["name"]
    if "arguments" in fn_delta and fn_delta["arguments"]:
        entry["function"]["arguments"] += fn_delta["arguments"]


def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
    return [state[i] for i in sorted(state.keys())]


# ---- OpenAI-compatible calls (non-streaming) ----
async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
    """Send a non-streaming request to ``/v1/completions``.

    Args:
        client: SDK client used to look up the endpoint by name.
        model: Model name placed in the payload.
        prompt: Prompt text placed in the payload.
        endpoint_name: Name passed to ``client.get_endpoint``.
        **kwargs: Optional ``max_tokens`` (default ``MAX_TOKENS``),
            ``temperature`` (default ``DEFAULT_TEMPERATURE``) and ``stop``.
            Other keys are ignored.

    Returns:
        The ``"response"`` entry of the SDK result.
    """
    endpoint = await client.get_endpoint(name=endpoint_name)

    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        # Forward stop sequences when the caller supplies them, as
        # stream_completions does; previously they were silently dropped.
        **({"stop": kwargs["stop"]} if "stop" in kwargs else {}),
    }
    # Only the first 500 characters of the payload are logged.
    log.debug("POST /v1/completions %s", json.dumps(payload)[:500])
    # The request cost is set to the effective max_tokens value.
    resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"])
    return resp["response"]

async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]:
    """Send a non-streaming request to ``/v1/chat/completions``.

    Recognised ``kwargs``: ``max_tokens`` (default ``MAX_TOKENS``),
    ``temperature`` (default ``DEFAULT_TEMPERATURE``), and the optional
    ``tools`` / ``tool_choice`` entries, which are only added to the payload
    when supplied. Returns the ``"response"`` entry of the SDK result.
    """
    target = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
    }
    # Optional fields are appended in a fixed order, after the required ones.
    for optional_key in ("tools", "tool_choice"):
        if optional_key in kwargs:
            body[optional_key] = kwargs[optional_key]

    serialized = json.dumps(body)
    log.debug("POST /v1/chat/completions %s", serialized[:500])
    # The request cost is set to the effective max_tokens value.
    result = await target.request("/v1/chat/completions", body, cost=body["max_tokens"])
    return result["response"]

# ---- Streaming variants ----
async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs):
    """Start a streaming request to ``/v1/completions``.

    Recognised ``kwargs``: ``max_tokens`` (default ``MAX_TOKENS``),
    ``temperature`` (default ``DEFAULT_TEMPERATURE``) and an optional
    ``stop`` entry. Returns the ``"response"`` entry of the SDK result,
    which callers iterate with ``async for``.
    """
    target = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "prompt": prompt,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        "stream": True,
    }
    if "stop" in kwargs:
        body["stop"] = kwargs["stop"]

    serialized = json.dumps(body)
    log.debug("STREAM /v1/completions %s", serialized[:500])
    # The request cost is set to the effective max_tokens value.
    result = await target.request("/v1/completions", body, cost=body["max_tokens"], stream=True)
    chunks = result["response"]  # async generator
    return chunks

async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs):
    """Start a streaming request to ``/v1/chat/completions``.

    Recognised ``kwargs``: ``max_tokens`` (default ``MAX_TOKENS``),
    ``temperature`` (default ``DEFAULT_TEMPERATURE``), and the optional
    ``tools`` / ``tool_choice`` entries. Returns the ``"response"`` entry of
    the SDK result, which callers iterate with ``async for``.
    """
    target = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        "stream": True,
    }
    # Optional fields are appended in a fixed order, after the required ones.
    for optional_key in ("tools", "tool_choice"):
        if optional_key in kwargs:
            body[optional_key] = kwargs[optional_key]

    serialized = json.dumps(body)
    log.debug("STREAM /v1/chat/completions %s", serialized[:500])
    # The request cost is set to the effective max_tokens value.
    result = await target.request("/v1/chat/completions", body, cost=body["max_tokens"], stream=True)
    chunks = result["response"]  # async generator
    return chunks


# ---------------------- Demo Runner ----------------------
class APIDemo:
    """Demo and testing functionality for the API client.

    Each ``demo_*`` coroutine sends one of the module-level prompts to the
    configured endpoint and prints the outcome to stdout.
    """

    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
        """Store the SDK client, model name, endpoint name and tool manager.

        A new ``ToolManager`` is created when ``tool_manager`` is not given.
        """
        self.client = client
        self.model = model
        self.endpoint_name = endpoint_name
        self.tool_manager = tool_manager or ToolManager()

    # ----- Chunk helpers -----
    @staticmethod
    def _first_choice(chunk: Dict[str, Any]) -> Dict[str, Any]:
        """Return the first entry of ``chunk["choices"]``, or ``{}`` if there is none."""
        return (chunk.get("choices") or [{}])[0]

    @staticmethod
    def _reasoning_text(delta: Dict[str, Any]) -> Optional[str]:
        """Return the reasoning text of a delta, if any.

        Accepts both ``reasoning_content`` and ``reasoning``, the same two
        keys the non-streaming path in ``demo_chat`` reads from a message.
        """
        return delta.get("reasoning_content") or delta.get("reasoning")

    async def _print_stream(
        self,
        stream,
        *,
        reasoning_header: str,
        answer_header: str,
        tool_calls_state: Optional[Dict[int, Dict[str, Any]]] = None,
    ) -> str:
        """Echo a chat stream to stdout as it arrives and return its content.

        Args:
            stream: Async iterable of chunk dicts.
            reasoning_header: Printed once, before the first reasoning text.
            answer_header: Printed once, before the first content text.
            tool_calls_state: When given, streamed ``tool_calls`` fragments
                are merged into this dict; when ``None`` they are ignored.

        Returns:
            The concatenated content text.
        """
        content_parts: List[str] = []
        reasoning_started = False

        async for chunk in stream:
            # A delta may be missing or None; treat both as empty.
            delta = self._first_choice(chunk).get("delta") or {}

            rc = self._reasoning_text(delta)
            if rc:
                if not reasoning_started:
                    reasoning_started = True
                    print(reasoning_header, end="", flush=True)
                print(rc, end="", flush=True)

            content_part = delta.get("content")
            if content_part:
                if not content_parts:
                    print(answer_header, end="", flush=True)
                content_parts.append(content_part)
                print(content_part, end="", flush=True)

            if tool_calls_state is not None:
                for tc_delta in delta.get("tool_calls") or []:
                    _merge_tool_call_delta(tool_calls_state, tc_delta)

        return "".join(content_parts)

    # ----- Streaming handler -----
    async def handle_streaming_response(self, stream, show_reasoning: bool = True) -> str:
        """Print a chat stream as it arrives and return the answer text.

        Args:
            stream: Async iterable of chunk dicts.
            show_reasoning: When true, reasoning text is printed under its own
                header and a short summary is printed after the stream ends.
                When false, reasoning text is skipped entirely.

        Returns:
            The concatenated ``content`` text of the stream.
        """
        answer_parts: List[str] = []
        reasoning_parts: List[str] = []
        finish_reason = None

        async for chunk in stream:
            choice = self._first_choice(chunk)
            # A delta may be missing or None; treat both as empty.
            delta = choice.get("delta") or {}

            # Track finish reason
            if choice.get("finish_reason"):
                finish_reason = choice.get("finish_reason")

            # reasoning tokens
            rc = self._reasoning_text(delta)
            if rc and show_reasoning:
                if not reasoning_parts:
                    print("\n🧠 Reasoning: ", end="", flush=True)
                print(rc, end="", flush=True)
                reasoning_parts.append(rc)

            # content tokens
            content_part = delta.get("content")
            if content_part:
                if not answer_parts:
                    # The answer header depends on whether reasoning was shown first.
                    if show_reasoning and reasoning_parts:
                        print("\n💬 Response: ", end="", flush=True)
                    else:
                        print("Assistant: ", end="", flush=True)
                print(content_part, end="", flush=True)
                answer_parts.append(content_part)

        full_response = "".join(answer_parts)
        reasoning_content = "".join(reasoning_parts)

        print()  # newline
        if show_reasoning:
            if reasoning_parts or answer_parts:
                print("\nStreaming completed.")
            # The "token" figures below are whitespace-separated word counts.
            if reasoning_parts:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if answer_parts:
                print(f"Response tokens: {len(full_response.split())}")
            if finish_reason:
                print(f"Finish reason: {finish_reason}")

        return full_response

    async def demo_completions(self) -> None:
        """Send ``COMPLETIONS_PROMPT`` to /v1/completions and print the JSON response."""
        print("=" * 60)
        print("COMPLETIONS DEMO")
        print("=" * 60)

        response = await call_completions(
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        print("\nResponse:")
        print(json.dumps(response, indent=2))

    async def demo_chat(self, use_streaming: bool = True) -> None:
        """Send ``CHAT_PROMPT`` as a single user message and print the reply.

        With ``use_streaming`` the reply is echoed as it arrives and streaming
        errors are logged rather than raised; otherwise the reasoning (if
        any), the answer and the full JSON response are printed.
        """
        print("=" * 60)
        print(f"CHAT COMPLETIONS DEMO {'(STREAMING)' if use_streaming else '(NON-STREAMING)'}")
        print("=" * 60)

        messages = [{"role": "user", "content": CHAT_PROMPT}]

        if use_streaming:
            stream = await stream_chat_completions(
                client=self.client,
                model=self.model,
                messages=messages,
                endpoint_name=self.endpoint_name,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
            )
            try:
                await self.handle_streaming_response(stream, show_reasoning=True)
            except Exception as e:
                log.error("\nError during streaming: %s", e, exc_info=True)
            return

        response = await call_chat_completions(
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        choice = (response.get("choices") or [{}])[0]
        message = choice.get("message") or {}
        content = message.get("content", "")
        reasoning = message.get("reasoning_content", "") or message.get("reasoning", "")
        if reasoning:
            # ANSI escape codes wrap the reasoning text.
            print(f"\n🧠 Reasoning: \033[90m{reasoning}\033[0m")
        print(f"\n💬 Assistant: {content}")
        print(f"\nFull Response:\n{json.dumps(response, indent=2)}")

    async def test_tool_support(self) -> bool:
        """Probe that tool schema is accepted (no actual call).

        Sends a tiny request carrying a minimal tool with ``tool_choice="none"``.
        Returns ``True`` when the request succeeds; any exception is logged
        and reported as ``False``.
        """
        messages = [{"role": "user", "content": "Hello"}]
        minimal_tool = [
            {
                "type": "function",
                "function": {"name": "test_function", "description": "Test function"},
            }
        ]
        try:
            await call_chat_completions(
                client=self.client,
                model=self.model,
                messages=messages,
                endpoint_name=self.endpoint_name,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10,
            )
        except Exception as e:
            log.error("Endpoint does not support tool calling: %s", e)
            return False
        return True

    async def demo_ls_tool(self) -> None:
        """Ask to list files using function calling, then provide final analysis.

        Runs two streamed passes: the first lets the model request tools, the
        requested tools are executed locally, and the second pass streams the
        model's answer given the tool results. Returns early when the endpoint
        rejects tools or the model makes no tool call.
        """
        print("=" * 60)
        print("TOOL USE DEMO: List Directory Contents")
        print("=" * 60)

        if not await self.test_tool_support():
            return

        messages: List[Dict[str, Any]] = [{"role": "user", "content": TOOLS_PROMPT}]

        # First pass: let the model decide tools, stream tool_calls and partial content
        stream = await stream_chat_completions(
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )

        tool_calls_state: Dict[int, Dict[str, Any]] = {}
        assistant_content = await self._print_stream(
            stream,
            reasoning_header="🧠 Reasoning: ",
            answer_header="\n💬 Response: ",
            tool_calls_state=tool_calls_state,
        )

        # If no tool calls, we're done.
        if not tool_calls_state:
            print("\n(No tool calls were made.)")
            return

        # Build assistant message with tool_calls
        assistant_message = {
            "role": "assistant",
            "content": assistant_content or None,
            "tool_calls": _tool_state_to_message_tool_calls(tool_calls_state),
        }
        messages.append(assistant_message)

        # Execute tools and feed results back
        for tc in assistant_message["tool_calls"]:
            function_spec = tc.get("function") or {}
            tool_name = function_spec.get("name")
            call_id = tc.get("id")
            raw_args = function_spec.get("arguments") or "{}"

            # The parsed arguments are not used (list_files takes none); the
            # parse only validates that the model produced well-formed JSON.
            try:
                if raw_args.strip():
                    json.loads(raw_args)
            except Exception as e:
                tool_result = json.dumps({"error": f"Argument parse failed: {str(e)}", "raw_arguments": raw_args})
                messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})
                continue

            try:
                if tool_name == "list_files":
                    tool_result = self.tool_manager.list_files()
                else:
                    tool_result = json.dumps({"error": f"Unknown tool '{tool_name}'"})
            except Exception as e:
                tool_result = json.dumps({"error": f"Tool '{tool_name}' failed: {str(e)}"})

            print("\n[Tool executed]", tool_name)
            # Show at most 500 characters of the tool output on the console.
            print(tool_result[:500] + ("..." if len(tool_result) > 500 else ""))
            messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})

        # Second pass: get final streamed answer after tool results
        stream2 = await stream_chat_completions(
            client=self.client,
            model=self.model,
            messages=messages,
            endpoint_name=self.endpoint_name,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        final_answer = await self._print_stream(
            stream2,
            reasoning_header="\n🧠 Reasoning (post-tools): ",
            answer_header="\n💬 Response (final): ",
        )

        print("\n" + "=" * 60)
        print("FINAL LLM ANALYSIS:")
        print("=" * 60)
        print(final_answer)
        print("=" * 60)

    async def interactive_chat(self) -> None:
        """Interactive chat session with streaming.

        Reads lines from stdin until ``quit``, Ctrl-C or end-of-input.
        ``clear`` resets the history and empty lines are ignored. A turn that
        fails is logged and removed from the history so the next request does
        not contain an unanswered user message.
        """
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
        print("Type 'quit' to exit, 'clear' to clear history")
        print()

        messages: List[Dict[str, Any]] = []

        while True:
            try:
                user_input = input("You: ").strip()
            except (KeyboardInterrupt, EOFError):
                # EOFError (closed stdin / Ctrl-D) must end the session;
                # retrying input() would fail again on every iteration.
                print("\n👋 Chat interrupted. Goodbye!")
                break

            command = user_input.lower()
            if command == "quit":
                print("👋 Goodbye!")
                break
            if command == "clear":
                messages = []
                print("Chat history cleared")
                continue
            if not user_input:
                continue

            messages.append({"role": "user", "content": user_input})

            try:
                stream = await stream_chat_completions(
                    client=self.client,
                    model=self.model,
                    messages=messages,
                    endpoint_name=self.endpoint_name,
                    max_tokens=MAX_TOKENS,
                    temperature=DEFAULT_TEMPERATURE,
                )
                # handle_streaming_response prints its own "Assistant: " or
                # reasoning header, so no prefix is printed here.
                assistant_content = await self.handle_streaming_response(stream, show_reasoning=True)
            except KeyboardInterrupt:
                print("\n👋 Chat interrupted. Goodbye!")
                break
            except Exception as e:
                log.error("\nError: %s", e)
                # Drop the unanswered user message from the history.
                messages.pop()
                continue

            # Add assistant response to conversation history
            messages.append({"role": "assistant", "content": assistant_content})


# ---------------------- CLI ----------------------
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: ``--model``, ``--endpoint`` and the mode flags.

    The five mode flags are boolean switches in one mutually exclusive,
    optional group, so argparse itself rejects more than one of them.
    """
    parser = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
    parser.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")

    # (flag, help text) for each test mode, in the order they appear in --help.
    mode_flags = (
        ("--completion", "Test completions endpoint"),
        ("--chat", "Test chat completions endpoint (non-streaming)"),
        ("--chat-stream", "Test chat completions endpoint with streaming"),
        ("--tools", "Test function calling with ls tool (non-streaming+streamed phases)"),
        ("--interactive", "Start interactive streaming chat session"),
    )
    mode_group = parser.add_mutually_exclusive_group(required=False)
    for flag, help_text in mode_flags:
        mode_group.add_argument(flag, action="store_true", help=help_text)
    return parser


async def main_async():
    """Parse the command line and run the single selected demo mode.

    Exits with status 1 when no mode (or more than one) is selected, or when
    the selected demo raises an exception, which is logged with a traceback.
    """
    args = build_arg_parser().parse_args()

    mode_switches = (args.completion, args.chat, args.chat_stream, args.tools, args.interactive)
    chosen = sum(mode_switches)

    if chosen > 1:
        # Defensive only: the parser's mutually exclusive group already
        # rejects multiple mode flags before this point.
        print("Please specify exactly one test mode")
        sys.exit(1)

    if chosen == 0:
        usage_lines = (
            "Please specify exactly one test mode:",
            "  --completion    : Test completions endpoint",
            "  --chat          : Test chat completions endpoint (non-streaming)",
            "  --chat-stream   : Test chat completions endpoint with streaming",
            "  --tools         : Test function calling with ls tool",
            "  --interactive   : Start interactive streaming chat session",
            f"\nExample: python {os.path.basename(sys.argv[0])} --model Qwen/Qwen3-8B --chat-stream --endpoint my-vllm-endpoint",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    print("=" * 60)
    print(f"Using model: {args.model}")
    print(f"Using endpoint: {args.endpoint}")

    try:
        async with Serverless() as client:
            demo = APIDemo(client, args.model, args.endpoint, ToolManager())

            # (switch, coroutine factory) pairs; exactly one switch is set here.
            runners = (
                (args.completion, demo.demo_completions),
                (args.chat, lambda: demo.demo_chat(use_streaming=False)),
                (args.chat_stream, lambda: demo.demo_chat(use_streaming=True)),
                (args.tools, demo.demo_ls_tool),
                (args.interactive, demo.interactive_chat),
            )
            for enabled, start in runners:
                if enabled:
                    await start()
                    break

    except Exception as e:
        log.error("Error during test: %s", e, exc_info=True)
        sys.exit(1)


# Script entry point: run the async CLI with asyncio.run when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main_async())
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								import logging
 								import json
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								import os
 								import sys
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								import subprocess
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								import argparse
 								from typing import Any, Dict, List, Optional
 								from vastai import Serverless
 								import asyncio
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								# ---------------------- Logging ----------------------
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								logging.basicConfig(
 								    level=logging.DEBUG,
 								    format="%(asctime)s[%(levelname)-5s] %(message)s",
 								    datefmt="%Y-%m-%d %H:%M:%S",
 								)
 								log = logging.getLogger(__file__)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								# ---------------------- Prompts ----------------------
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by"
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								TOOLS_PROMPT = (
 								    "Can you list the files in the current working directory and tell me what you see? "
 								    "What do you think this directory might be for?"
 								)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								ENDPOINT_NAME = "my-vllm-endpoint"       # change this to your vLLM endpoint name
 								DEFAULT_MODEL = "Qwen/Qwen3-8B"          # must support tool calling
 								MAX_TOKENS = 1024
 								DEFAULT_TEMPERATURE = 0.7
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								# ---------------------- Tooling ----------------------
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								class ToolManager:
 								    """Handles tool definitions and execution"""
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								    @staticmethod
 								    def list_files() -> str:
 								        """Execute ls on current directory"""
 								        try:
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								            result = subprocess.run(
 								                ["ls", "-la", "."], capture_output=True, text=True, timeout=10
 								            )
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            if result.returncode == 0:
 								                return result.stdout
 								            else:
 								                return f"Error: {result.stderr}"
 								        except Exception as e:
 								            return f"Error running ls: {e}"
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								    @staticmethod
 								    def get_ls_tool_definition() -> List[Dict[str, Any]]:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        """OpenAI-compatible tool schema"""
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								        return [
 								            {
 								                "type": "function",
 								                "function": {
 								                    "name": "list_files",
 								                    "description": "List files and directories in the cwd",
 								                    "parameters": {"type": "object", "properties": {}, "required": []},
 								                },
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            }
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								        ]
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								    def execute_tool_call(self, tool_call: Dict[str, Any]) -> str:
 								        """Execute a tool call and return the result"""
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        function_name = (tool_call.get("function") or {}).get("name")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        if function_name == "list_files":
 								            return self.list_files()
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        raise ValueError(f"Unknown tool function: {function_name}")
 								# ----- Helpers to handle streamed tool_calls assembly -----
 								def _merge_tool_call_delta(state: Dict[int, Dict[str, Any]], tc_delta: Dict[str, Any]) -> None:
 								    """
 								    OpenAI-style streaming sends partial tool_calls with an index and partial fields.
 								    We merge into a per-index state dict until the assistant message finishes.
 								    """
 								    idx = tc_delta.get("index")
 								    if idx is None:
 								        return
 								    entry = state.setdefault(idx, {"id": None, "function": {"name": None, "arguments": ""}, "type": "function"})
 								    if tc_delta.get("id"):
 								        entry["id"] = tc_delta["id"]
 								    fn_delta = tc_delta.get("function") or {}
 								    if "name" in fn_delta and fn_delta["name"]:
 								        entry["function"]["name"] = fn_delta["name"]
 								    if "arguments" in fn_delta and fn_delta["arguments"]:
 								        entry["function"]["arguments"] += fn_delta["arguments"]
 								def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
 								    return [state[i] for i in sorted(state.keys())]
 								# ---- OpenAI-compatible calls (non-streaming) ----
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								    endpoint = await client.get_endpoint(name=endpoint_name)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    payload = {
-											Use PyWorker SDK (#67)
										
										
											2025-12-15 22:33:03 -05:00
+								        "model": model,
 								        "prompt": prompt,
 								        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
 								        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    }
 								    log.debug("POST /v1/completions %s", json.dumps(payload)[:500])
-											Use PyWorker SDK (#67)
										
										
											2025-12-15 22:33:03 -05:00
+								    resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"])
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    return resp["response"]
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
async def call_chat_completions(
    client: Serverless,
    *,
    model: str,
    messages: List[Dict[str, Any]],
    endpoint_name: str,
    **kwargs,
) -> Dict[str, Any]:
    """POST a non-streaming chat request to ``/v1/chat/completions``.

    ``max_tokens`` and ``temperature`` are read from ``kwargs`` and fall back
    to ``MAX_TOKENS`` / ``DEFAULT_TEMPERATURE``. ``tools`` and ``tool_choice``
    are forwarded only when the caller supplied them; any other keyword
    argument is ignored. The request ``cost`` is the payload's ``max_tokens``.

    Returns:
        The ``"response"`` entry of the endpoint's reply.
    """
    endpoint = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
    }
    # Optional fields are sent only when explicitly provided by the caller.
    for optional_key in ("tools", "tool_choice"):
        if optional_key in kwargs:
            body[optional_key] = kwargs[optional_key]

    # Log at most the first 500 characters of the serialized payload.
    log.debug("POST /v1/chat/completions %s", json.dumps(body)[:500])
    reply = await endpoint.request(
        "/v1/chat/completions", body, cost=body["max_tokens"]
    )
    return reply["response"]
 								# ---- Streaming variants ----
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
async def stream_completions(
    client: Serverless,
    *,
    model: str,
    prompt: str,
    endpoint_name: str,
    **kwargs,
):
    """Start a streaming text completion against ``/v1/completions``.

    ``max_tokens`` and ``temperature`` are read from ``kwargs`` and fall back
    to ``MAX_TOKENS`` / ``DEFAULT_TEMPERATURE``. ``stop`` is forwarded only
    when supplied; any other keyword argument is ignored. The payload always
    carries ``"stream": True`` and the request ``cost`` is ``max_tokens``.

    Returns:
        The ``"response"`` entry of the endpoint's reply, which callers
        consume with ``async for``.
    """
    endpoint = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "prompt": prompt,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        "stream": True,
    }
    if "stop" in kwargs:
        body["stop"] = kwargs["stop"]

    # Log at most the first 500 characters of the serialized payload.
    log.debug("STREAM /v1/completions %s", json.dumps(body)[:500])
    reply = await endpoint.request(
        "/v1/completions", body, cost=body["max_tokens"], stream=True
    )
    return reply["response"]  # async generator
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
async def stream_chat_completions(
    client: Serverless,
    *,
    model: str,
    messages: List[Dict[str, Any]],
    endpoint_name: str,
    **kwargs,
):
    """Start a streaming chat completion against ``/v1/chat/completions``.

    ``max_tokens`` and ``temperature`` are read from ``kwargs`` and fall back
    to ``MAX_TOKENS`` / ``DEFAULT_TEMPERATURE``. ``tools`` and ``tool_choice``
    are forwarded only when supplied; any other keyword argument is ignored.
    The payload always carries ``"stream": True`` and the request ``cost`` is
    ``max_tokens``.

    Returns:
        The ``"response"`` entry of the endpoint's reply, which callers
        consume with ``async for``.
    """
    endpoint = await client.get_endpoint(name=endpoint_name)

    body: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
        "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        "stream": True,
    }
    # Optional fields are sent only when explicitly provided by the caller.
    for optional_key in ("tools", "tool_choice"):
        if optional_key in kwargs:
            body[optional_key] = kwargs[optional_key]

    # Log at most the first 500 characters of the serialized payload.
    log.debug("STREAM /v1/chat/completions %s", json.dumps(body)[:500])
    reply = await endpoint.request(
        "/v1/chat/completions", body, cost=body["max_tokens"], stream=True
    )
    return reply["response"]  # async generator
 								# ---------------------- Demo Runner ----------------------
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								class APIDemo:
 								    """Demo and testing functionality for the API client"""
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								    def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None):
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        self.client = client
 								        self.model = model
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								        self.endpoint_name = endpoint_name
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        self.tool_manager = tool_manager or ToolManager()
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    # ----- Streaming handler -----
 								    async def handle_streaming_response(self, stream, show_reasoning: bool = True) -> str:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        full_response = ""
 								        reasoning_content = ""
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        printed_reasoning = False
 								        printed_answer = False
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								        finish_reason = None
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        async for chunk in stream:
 								            choice = (chunk.get("choices") or [{}])[0]
 								            delta = choice.get("delta", {})
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
 								            # Track finish reason
 								            if choice.get("finish_reason"):
 								                finish_reason = choice.get("finish_reason")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            # reasoning tokens
 								            rc = delta.get("reasoning_content")
 								            if rc and show_reasoning:
 								                if not printed_reasoning:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                    print("\n🧠 Reasoning: ", end="", flush=True)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                    printed_reasoning = True
 								                print(rc, end="", flush=True)
 								                reasoning_content += rc
 								            # content tokens
 								            content_part = delta.get("content")
 								            if content_part:
 								                if not printed_answer:
 								                    if show_reasoning and printed_reasoning:
 								                        print("\n💬 Response: ", end="", flush=True)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                    else:
 								                        print("Assistant: ", end="", flush=True)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                    printed_answer = True
 								                print(content_part, end="", flush=True)
 								                full_response += content_part
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        print()  # newline
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        if show_reasoning:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            if printed_reasoning or printed_answer:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                print("\nStreaming completed.")
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            if printed_reasoning:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                print(f"Reasoning tokens: {len(reasoning_content.split())}")
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            if printed_answer:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                print(f"Response tokens: {len(full_response.split())}")
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								            if finish_reason:
 								                print(f"Finish reason: {finish_reason}")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
 								        return full_response
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
 								    async def demo_completions(self) -> None:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("=" * 60)
 								        print("COMPLETIONS DEMO")
 								        print("=" * 60)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        response = await call_completions(
 								            client=self.client,
 								            model=self.model,
 								            prompt=COMPLETIONS_PROMPT,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								            endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            max_tokens=MAX_TOKENS,
 								            temperature=DEFAULT_TEMPERATURE,
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        )
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        print("\nResponse:")
 								        print(json.dumps(response, indent=2))
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    async def demo_chat(self, use_streaming: bool = True) -> None:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("=" * 60)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        print(f"CHAT COMPLETIONS DEMO {'(STREAMING)' if use_streaming else '(NON-STREAMING)'}")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("=" * 60)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        messages = [{"role": "user", "content": CHAT_PROMPT}]
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        if use_streaming:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            stream = await stream_chat_completions(
 								                client=self.client,
 								                model=self.model,
 								                messages=messages,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								                endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                max_tokens=MAX_TOKENS,
 								                temperature=DEFAULT_TEMPERATURE
 								            )
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            try:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                await self.handle_streaming_response(stream, show_reasoning=True)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            except Exception as e:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                log.error("\nError during streaming: %s", e, exc_info=True)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        else:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            response = await call_chat_completions(
 								                client=self.client,
 								                model=self.model,
 								                messages=messages,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								                endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                max_tokens=MAX_TOKENS,
 								                temperature=DEFAULT_TEMPERATURE
 								            )
 								            choice = (response.get("choices") or [{}])[0]
 								            message = choice.get("message", {})
 								            content = message.get("content", "")
 								            reasoning = message.get("reasoning_content", "") or message.get("reasoning", "")
 								            if reasoning:
 								                print(f"\n🧠 Reasoning: \033[90m{reasoning}\033[0m")
 								            print(f"\n💬 Assistant: {content}")
 								            print(f"\nFull Response:\n{json.dumps(response, indent=2)}")
 								    async def test_tool_support(self) -> bool:
 								        """Probe that tool schema is accepted (no actual call)"""
 								        messages = [{"role": "user", "content": "Hello"}]
 								        minimal_tool = [
 								            {
 								                "type": "function",
 								                "function": {"name": "test_function", "description": "Test function"},
 								            }
 								        ]
 								        try:
 								            _ = await call_chat_completions(
 								                client=self.client,
 								                model=self.model,
 								                messages=messages,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								                endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                tools=minimal_tool,
 								                tool_choice="none",
 								                max_tokens=10
 								            )
 								            return True
 								        except Exception as e:
 								            log.error("Endpoint does not support tool calling: %s", e)
 								            return False
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    async def demo_ls_tool(self) -> None:
 								        """Ask to list files using function calling, then provide final analysis"""
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("=" * 60)
 								        print("TOOL USE DEMO: List Directory Contents")
 								        print("=" * 60)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        if not await self.test_tool_support():
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            return
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        messages: List[Dict[str, Any]] = [{"role": "user", "content": TOOLS_PROMPT}]
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        # First pass: let the model decide tools, stream tool_calls and partial content
 								        stream = await stream_chat_completions(
 								            client=self.client,
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            model=self.model,
 								            messages=messages,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								            endpoint_name=self.endpoint_name,
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            tools=self.tool_manager.get_ls_tool_definition(),
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								            tool_choice="auto",
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            max_tokens=MAX_TOKENS,
 								            temperature=DEFAULT_TEMPERATURE,
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        )
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        assistant_content_buf: List[str] = []
 								        tool_calls_state: Dict[int, Dict[str, Any]] = {}
 								        printed_reasoning = False
 								        printed_answer = False
 								        async for chunk in stream:
 								            choice = (chunk.get("choices") or [{}])[0]
 								            delta = choice.get("delta", {})
 								            rc = delta.get("reasoning_content")
 								            if rc:
 								                if not printed_reasoning:
 								                    printed_reasoning = True
 								                    print("🧠 Reasoning: ", end="", flush=True)
 								                print(rc, end="", flush=True)
 								            content_part = delta.get("content")
 								            if content_part:
 								                assistant_content_buf.append(content_part)
 								                if not printed_answer:
 								                    printed_answer = True
 								                    print("\n💬 Response: ", end="", flush=True)
 								                print(content_part, end="", flush=True)
 								            if "tool_calls" in delta and delta["tool_calls"]:
 								                for tc_delta in delta["tool_calls"]:
 								                    _merge_tool_call_delta(tool_calls_state, tc_delta)
 								        # If no tool calls, we’re done.
 								        if not tool_calls_state:
 								            print("\n(No tool calls were made.)")
 								            return
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        # Build assistant message with tool_calls
 								        assistant_message = {
 								            "role": "assistant",
 								            "content": "".join(assistant_content_buf) if assistant_content_buf else None,
 								            "tool_calls": _tool_state_to_message_tool_calls(tool_calls_state),
 								        }
 								        messages.append(assistant_message)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        # Execute tools and feed results back
 								        for tc in assistant_message["tool_calls"]:
 								            tool_name = (tc.get("function") or {}).get("name")
 								            call_id = tc.get("id")
 								            raw_args = (tc.get("function") or {}).get("arguments") or "{}"
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            try:
 								                args = json.loads(raw_args) if raw_args.strip() else {}
 								            except Exception as e:
 								                tool_result = json.dumps({"error": f"Argument parse failed: {str(e)}", "raw_arguments": raw_args})
 								                messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})
 								                continue
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            try:
 								                if tool_name == "list_files":
 								                    tool_result = self.tool_manager.list_files()
 								                else:
 								                    tool_result = json.dumps({"error": f"Unknown tool '{tool_name}'"})
 								            except Exception as e:
 								                tool_result = json.dumps({"error": f"Tool '{tool_name}' failed: {str(e)}"})
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            print("\n[Tool executed]", tool_name)
 								            print(tool_result[:500] + ("..." if len(tool_result) > 500 else ""))
 								            messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        # Second pass: get final streamed answer after tool results
 								        stream2 = await stream_chat_completions(
 								            client=self.client,
 								            model=self.model,
 								            messages=messages,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								            endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								            max_tokens=MAX_TOKENS,
 								            temperature=DEFAULT_TEMPERATURE,
 								        )
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        final_buf = []
 								        printed_reasoning2 = False
 								        printed_answer2 = False
 								        async for chunk in stream2:
 								            choice = (chunk.get("choices") or [{}])[0]
 								            delta = choice.get("delta", {})
 								            rc2 = delta.get("reasoning_content")
 								            if rc2:
 								                if not printed_reasoning2:
 								                    printed_reasoning2 = True
 								                    print("\n🧠 Reasoning (post-tools): ", end="", flush=True)
 								                print(rc2, end="", flush=True)
 								            c2 = delta.get("content")
 								            if c2:
 								                final_buf.append(c2)
 								                if not printed_answer2:
 								                    printed_answer2 = True
 								                    print("\n💬 Response (final): ", end="", flush=True)
 								                print(c2, end="", flush=True)
 								        print("\n" + "=" * 60)
 								        print("FINAL LLM ANALYSIS:")
 								        print("=" * 60)
 								        print("".join(final_buf))
 								        print("=" * 60)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    async def interactive_chat(self) -> None:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        """Interactive chat session with streaming"""
 								        print("=" * 60)
 								        print("INTERACTIVE STREAMING CHAT")
 								        print("=" * 60)
 								        print("Type 'quit' to exit, 'clear' to clear history")
 								        print()
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        messages: List[Dict[str, Any]] = []
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        while True:
 								            try:
 								                user_input = input("You: ").strip()
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
 								                if user_input.lower() == "quit":
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                    print("👋 Goodbye!")
 								                    break
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								                elif user_input.lower() == "clear":
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                    messages = []
 								                    print("Chat history cleared")
 								                    continue
 								                elif not user_input:
 								                    continue
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                messages.append({"role": "user", "content": user_input})
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                print("Assistant: ", end="", flush=True)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                stream = await stream_chat_completions(
 								                    client=self.client,
 								                    model=self.model,
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								                    messages=messages,
 								                    endpoint_name=self.endpoint_name,
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                    max_tokens=MAX_TOKENS,
 								                    temperature=0.7
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
+								                )
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                assistant_content = await self.handle_streaming_response(stream, show_reasoning=True)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                # Add assistant response to conversation history
 								                messages.append({"role": "assistant", "content": assistant_content})
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								            except KeyboardInterrupt:
 								                print("\n👋 Chat interrupted. Goodbye!")
 								                break
 								            except Exception as e:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								                log.error("\nError: %s", e)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								                continue
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								# ---------------------- CLI ----------------------
 								def build_arg_parser() -> argparse.ArgumentParser:
 								    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								    p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})")
 								    p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})")
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
 								    modes = p.add_mutually_exclusive_group(required=False)
 								    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
 								    modes.add_argument("--chat", action="store_true", help="Test chat completions endpoint (non-streaming)")
 								    modes.add_argument("--chat-stream", action="store_true", help="Test chat completions endpoint with streaming")
 								    modes.add_argument("--tools", action="store_true", help="Test function calling with ls tool (non-streaming+streamed phases)")
 								    modes.add_argument("--interactive", action="store_true", help="Start interactive streaming chat session")
 								    return p
 								async def main_async():
 								    args = build_arg_parser().parse_args()
 								    selected = sum([args.completion, args.chat, args.chat_stream, args.tools, args.interactive])
 								    if selected == 0:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("Please specify exactly one test mode:")
 								        print("  --completion    : Test completions endpoint")
 								        print("  --chat          : Test chat completions endpoint (non-streaming)")
 								        print("  --chat-stream   : Test chat completions endpoint with streaming")
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        print("  --tools         : Test function calling with ls tool")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("  --interactive   : Start interactive streaming chat session")
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        print(f"\nExample: python {os.path.basename(sys.argv[0])} --model Qwen/Qwen3-8B --chat-stream --endpoint my-vllm-endpoint")
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        sys.exit(1)
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    elif selected > 1:
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        print("Please specify exactly one test mode")
 								        sys.exit(1)
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    print("=" * 60)
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								    print(f"Using model: {args.model}")
 								    print(f"Using endpoint: {args.endpoint}")
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    try:
 								        async with Serverless() as client:
-											defaults to ENDPOINT_NAME and DEFAULT_MODEL but uses the flag first if present
										
										
											2025-12-03 16:57:28 -08:00
+								            demo = APIDemo(client, args.model, args.endpoint, ToolManager())
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
 								            if args.completion:
 								                await demo.demo_completions()
 								            elif args.chat:
 								                await demo.demo_chat(use_streaming=False)
 								            elif args.chat_stream:
 								                await demo.demo_chat(use_streaming=True)
 								            elif args.tools:
 								                await demo.demo_ls_tool()
 								            elif args.interactive:
 								                await demo.interactive_chat()
-											fix pyright errors + revert to old way of handling cancelled api requests (#23)
										
										
											2025-07-17 16:59:06 -07:00
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								    except Exception as e:
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								        log.error("Error during test: %s", e, exc_info=True)
-											OpenAI compatible worker (#19)
										
										
											2025-07-16 09:46:26 +01:00
+								        sys.exit(1)
 								if __name__ == "__main__":
-											Updated clients, include vastai-sdk, handle non-UTF-8
										
										
											2025-11-11 17:09:28 -08:00
+								    asyncio.run(main_async())