pyworker/workers/openai/client.py

#!/usr/bin/env python3
import logging
import json
import os
import sys
import subprocess
import argparse
from typing import Any, Dict, List, Optional

from vastai import Serverless
import asyncio

# ---------------------- Logging ----------------------
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s[%(levelname)-5s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger(__file__)

# ---------------------- Prompts ----------------------
COMPLETIONS_PROMPT = "the capital of USA is"
CHAT_PROMPT = "Think step by step: Tell me about the Python programming language."
TOOLS_PROMPT = (
    "Can you list the files in the current working directory and tell me what you see? "
    "What do you think this directory might be for?"
)

ENDPOINT_NAME = "my-vllm-endpoint"       # change this to your vLLM endpoint name
DEFAULT_MODEL = "Qwen/Qwen3-8B"          # must support tool calling
MAX_TOKENS = 1024
DEFAULT_TEMPERATURE = 0.7

# ---------------------- Tooling ----------------------
class ToolManager:
    """Handles tool definitions and execution"""

    @staticmethod
    def list_files() -> str:
        """Execute ls on current directory"""
        try:
            result = subprocess.run(
                ["ls", "-la", "."], capture_output=True, text=True, timeout=10
            )
            if result.returncode == 0:
                return result.stdout
            else:
                return f"Error: {result.stderr}"
        except Exception as e:
            return f"Error running ls: {e}"

    @staticmethod
    def get_ls_tool_definition() -> List[Dict[str, Any]]:
        """OpenAI-compatible tool schema"""
        return [
            {
                "type": "function",
                "function": {
                    "name": "list_files",
                    "description": "List files and directories in the cwd",
                    "parameters": {"type": "object", "properties": {}, "required": []},
                },
            }
        ]

    def execute_tool_call(self, tool_call: Dict[str, Any]) -> str:
        """Execute a tool call and return the result"""
        function_name = (tool_call.get("function") or {}).get("name")
        if function_name == "list_files":
            return self.list_files()
        raise ValueError(f"Unknown tool function: {function_name}")


# ----- Helpers to handle streamed tool_calls assembly -----
def _merge_tool_call_delta(state: Dict[int, Dict[str, Any]], tc_delta: Dict[str, Any]) -> None:
    """
    OpenAI-style streaming sends partial tool_calls with an index and partial fields.
    We merge into a per-index state dict until the assistant message finishes.
    """
    idx = tc_delta.get("index")
    if idx is None:
        return

    entry = state.setdefault(idx, {"id": None, "function": {"name": None, "arguments": ""}, "type": "function"})

    if tc_delta.get("id"):
        entry["id"] = tc_delta["id"]

    fn_delta = tc_delta.get("function") or {}
    if "name" in fn_delta and fn_delta["name"]:
        entry["function"]["name"] = fn_delta["name"]
    if "arguments" in fn_delta and fn_delta["arguments"]:
        entry["function"]["arguments"] += fn_delta["arguments"]


def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
    return [state[i] for i in sorted(state.keys())]


# ---- OpenAI-compatible calls (non-streaming) ----
async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]:

    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
            "model": model,
            "prompt": prompt,
            "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
        }
    }
    log.debug("POST /v1/completions %s", json.dumps(payload)[:500])
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"])
    return resp["response"]

async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]:

    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
            "model": model,
            "messages": messages,
            "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
            **({"tools": kwargs["tools"]} if "tools" in kwargs else {}),
            **({"tool_choice": kwargs["tool_choice"]} if "tool_choice" in kwargs else {}),
        }
    }
    log.debug("POST /v1/chat/completions %s", json.dumps(payload)[:500])
    resp = await endpoint.request("/v1/chat/completions", payload, cost=payload["input"]["max_tokens"])
    return resp["response"]

# ---- Streaming variants ----
async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs):

    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
            "model": model,
            "prompt": prompt,
            "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
            "stream": True,
            **({"stop": kwargs["stop"]} if "stop" in kwargs else {}),
        }
    }
    log.debug("STREAM /v1/completions %s", json.dumps(payload)[:500])
    resp = await endpoint.request("/v1/completions", payload, cost=payload["input"]["max_tokens"], stream=True)
    return resp["response"]  # async generator

async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs):

    endpoint = await client.get_endpoint(name=ENDPOINT_NAME)

    payload = {
        "input": {
            "model": model,
            "messages": messages,
            "max_tokens": kwargs.get("max_tokens", MAX_TOKENS),
            "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE),
            "stream": True,
            **({"tools": kwargs["tools"]} if "tools" in kwargs else {}),
            **({"tool_choice": kwargs["tool_choice"]} if "tool_choice" in kwargs else {}),
        }
    }
    log.debug("STREAM /v1/chat/completions %s", json.dumps(payload)[:500])
    resp = await endpoint.request("/v1/chat/completions", payload, cost=payload["input"]["max_tokens"], stream=True)
    return resp["response"]  # async generator


# ---------------------- Demo Runner ----------------------
class APIDemo:
    """Demo and testing functionality for the API client"""

    def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None):
        self.client = client
        self.model = model
        self.tool_manager = tool_manager or ToolManager()

    # ----- Streaming handler -----
    async def handle_streaming_response(self, stream, show_reasoning: bool = True) -> str:
        full_response = ""
        reasoning_content = ""
        printed_reasoning = False
        printed_answer = False

        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})

            # reasoning tokens
            rc = delta.get("reasoning_content")
            if rc and show_reasoning:
                if not printed_reasoning:
                    print("\n🧠 Reasoning: ", end="", flush=True)
                    printed_reasoning = True
                print(rc, end="", flush=True)
                reasoning_content += rc

            # content tokens
            content_part = delta.get("content")
            if content_part:
                if not printed_answer:
                    if show_reasoning and printed_reasoning:
                        print("\n💬 Response: ", end="", flush=True)
                    else:
                        print("Assistant: ", end="", flush=True)
                    printed_answer = True
                print(content_part, end="", flush=True)
                full_response += content_part

        print()  # newline
        if show_reasoning:
            if printed_reasoning or printed_answer:
                print("\nStreaming completed.")
            if printed_reasoning:
                print(f"Reasoning tokens: {len(reasoning_content.split())}")
            if printed_answer:
                print(f"Response tokens: {len(full_response.split())}")

        return full_response

    async def demo_completions(self) -> None:
        print("=" * 60)
        print("COMPLETIONS DEMO")
        print("=" * 60)

        response = await call_completions(
            client=self.client,
            model=self.model,
            prompt=COMPLETIONS_PROMPT,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )
        print("\nResponse:")
        print(json.dumps(response, indent=2))

    async def demo_chat(self, use_streaming: bool = True) -> None:
        print("=" * 60)
        print(f"CHAT COMPLETIONS DEMO {'(STREAMING)' if use_streaming else '(NON-STREAMING)'}")
        print("=" * 60)

        messages = [{"role": "user", "content": CHAT_PROMPT}]

        if use_streaming:
            stream = await stream_chat_completions(
                client=self.client,
                model=self.model,
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            try:
                await self.handle_streaming_response(stream, show_reasoning=True)
            except Exception as e:
                log.error("\nError during streaming: %s", e, exc_info=True)
        else:
            response = await call_chat_completions(
                client=self.client,
                model=self.model,
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE
            )
            choice = (response.get("choices") or [{}])[0]
            message = choice.get("message", {})
            content = message.get("content", "")
            reasoning = message.get("reasoning_content", "") or message.get("reasoning", "")
            if reasoning:
                print(f"\n🧠 Reasoning: \033[90m{reasoning}\033[0m")
            print(f"\n💬 Assistant: {content}")
            print(f"\nFull Response:\n{json.dumps(response, indent=2)}")

    async def test_tool_support(self) -> bool:
        """Probe that tool schema is accepted (no actual call)"""
        messages = [{"role": "user", "content": "Hello"}]
        minimal_tool = [
            {
                "type": "function",
                "function": {"name": "test_function", "description": "Test function"},
            }
        ]
        try:
            _ = await call_chat_completions(
                client=self.client,
                model=self.model,
                messages=messages,
                tools=minimal_tool,
                tool_choice="none",
                max_tokens=10
            )
            return True
        except Exception as e:
            log.error("Endpoint does not support tool calling: %s", e)
            return False

    async def demo_ls_tool(self) -> None:
        """Ask to list files using function calling, then provide final analysis"""
        print("=" * 60)
        print("TOOL USE DEMO: List Directory Contents")
        print("=" * 60)

        if not await self.test_tool_support():
            return

        messages: List[Dict[str, Any]] = [{"role": "user", "content": TOOLS_PROMPT}]

        # First pass: let the model decide tools, stream tool_calls and partial content
        stream = await stream_chat_completions(
            client=self.client,
            model=self.model,
            messages=messages,
            tools=self.tool_manager.get_ls_tool_definition(),
            tool_choice="auto",
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )

        assistant_content_buf: List[str] = []
        tool_calls_state: Dict[int, Dict[str, Any]] = {}
        printed_reasoning = False
        printed_answer = False

        async for chunk in stream:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})

            rc = delta.get("reasoning_content")
            if rc:
                if not printed_reasoning:
                    printed_reasoning = True
                    print("🧠 Reasoning: ", end="", flush=True)
                print(rc, end="", flush=True)

            content_part = delta.get("content")
            if content_part:
                assistant_content_buf.append(content_part)
                if not printed_answer:
                    printed_answer = True
                    print("\n💬 Response: ", end="", flush=True)
                print(content_part, end="", flush=True)

            if "tool_calls" in delta and delta["tool_calls"]:
                for tc_delta in delta["tool_calls"]:
                    _merge_tool_call_delta(tool_calls_state, tc_delta)

        # If no tool calls, we’re done.
        if not tool_calls_state:
            print("\n(No tool calls were made.)")
            return

        # Build assistant message with tool_calls
        assistant_message = {
            "role": "assistant",
            "content": "".join(assistant_content_buf) if assistant_content_buf else None,
            "tool_calls": _tool_state_to_message_tool_calls(tool_calls_state),
        }
        messages.append(assistant_message)

        # Execute tools and feed results back
        for tc in assistant_message["tool_calls"]:
            tool_name = (tc.get("function") or {}).get("name")
            call_id = tc.get("id")
            raw_args = (tc.get("function") or {}).get("arguments") or "{}"

            try:
                args = json.loads(raw_args) if raw_args.strip() else {}
            except Exception as e:
                tool_result = json.dumps({"error": f"Argument parse failed: {str(e)}", "raw_arguments": raw_args})
                messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})
                continue

            try:
                if tool_name == "list_files":
                    tool_result = self.tool_manager.list_files()
                else:
                    tool_result = json.dumps({"error": f"Unknown tool '{tool_name}'"})
            except Exception as e:
                tool_result = json.dumps({"error": f"Tool '{tool_name}' failed: {str(e)}"})

            print("\n[Tool executed]", tool_name)
            print(tool_result[:500] + ("..." if len(tool_result) > 500 else ""))
            messages.append({"role": "tool", "tool_call_id": call_id, "content": tool_result})

        # Second pass: get final streamed answer after tool results
        stream2 = await stream_chat_completions(
            client=self.client,
            model=self.model,
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=DEFAULT_TEMPERATURE,
        )

        final_buf = []
        printed_reasoning2 = False
        printed_answer2 = False

        async for chunk in stream2:
            choice = (chunk.get("choices") or [{}])[0]
            delta = choice.get("delta", {})

            rc2 = delta.get("reasoning_content")
            if rc2:
                if not printed_reasoning2:
                    printed_reasoning2 = True
                    print("\n🧠 Reasoning (post-tools): ", end="", flush=True)
                print(rc2, end="", flush=True)

            c2 = delta.get("content")
            if c2:
                final_buf.append(c2)
                if not printed_answer2:
                    printed_answer2 = True
                    print("\n💬 Response (final): ", end="", flush=True)
                print(c2, end="", flush=True)

        print("\n" + "=" * 60)
        print("FINAL LLM ANALYSIS:")
        print("=" * 60)
        print("".join(final_buf))
        print("=" * 60)

    async def interactive_chat(self) -> None:
        """Interactive chat session with streaming"""
        print("=" * 60)
        print("INTERACTIVE STREAMING CHAT")
        print("=" * 60)
        print(f"Using model: {self.model}")
        print("Type 'quit' to exit, 'clear' to clear history")
        print()

        messages: List[Dict[str, Any]] = []

        while True:
            try:
                user_input = input("You: ").strip()

                if user_input.lower() == "quit":
                    print("👋 Goodbye!")
                    break
                elif user_input.lower() == "clear":
                    messages = []
                    print("Chat history cleared")
                    continue
                elif not user_input:
                    continue

                messages.append({"role": "user", "content": user_input})

                print("Assistant: ", end="", flush=True)
                stream = await stream_chat_completions(
                    client=self.client,
                    model=self.model,
                    messages=messages,
                    max_tokens=MAX_TOKENS,
                    temperature=0.7
                )
                assistant_content = await self.handle_streaming_response(stream, show_reasoning=True)

                # Add assistant response to conversation history
                messages.append({"role": "assistant", "content": assistant_content})

            except KeyboardInterrupt:
                print("\n👋 Chat interrupted. Goodbye!")
                break
            except Exception as e:
                log.error("\nError: %s", e)
                continue


# ---------------------- CLI ----------------------
def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)")
    p.add_argument("--model", required=True, help="Model to use for requests (required)")
    p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)")

    modes = p.add_mutually_exclusive_group(required=False)
    modes.add_argument("--completion", action="store_true", help="Test completions endpoint")
    modes.add_argument("--chat", action="store_true", help="Test chat completions endpoint (non-streaming)")
    modes.add_argument("--chat-stream", action="store_true", help="Test chat completions endpoint with streaming")
    modes.add_argument("--tools", action="store_true", help="Test function calling with ls tool (non-streaming+streamed phases)")
    modes.add_argument("--interactive", action="store_true", help="Start interactive streaming chat session")
    return p


async def main_async():
    args = build_arg_parser().parse_args()

    selected = sum([args.completion, args.chat, args.chat_stream, args.tools, args.interactive])
    if selected == 0:
        print("Please specify exactly one test mode:")
        print("  --completion    : Test completions endpoint")
        print("  --chat          : Test chat completions endpoint (non-streaming)")
        print("  --chat-stream   : Test chat completions endpoint with streaming")
        print("  --tools         : Test function calling with ls tool")
        print("  --interactive   : Start interactive streaming chat session")
        print(f"\nExample: python {os.path.basename(sys.argv[0])} --model Qwen/Qwen3-8B --chat-stream --endpoint my-vllm-endpoint")
        sys.exit(1)
    elif selected > 1:
        print("Please specify exactly one test mode")
        sys.exit(1)

    print(f"Using model: {args.model}")
    print("=" * 60)

    try:
        async with Serverless() as client:
            demo = APIDemo(client, args.model, ToolManager())

            if args.completion:
                await demo.demo_completions()
            elif args.chat:
                await demo.demo_chat(use_streaming=False)
            elif args.chat_stream:
                await demo.demo_chat(use_streaming=True)
            elif args.tools:
                await demo.demo_ls_tool()
            elif args.interactive:
                await demo.interactive_chat()

    except Exception as e:
        log.error("Error during test: %s", e, exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main_async())