diff --git a/requirements.txt b/requirements.txt index f0a66ae..807a1c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ -aiohttp[speedups]==3.10.1 +aiohttp==3.10.1 +aiodns~=3.6.0 +pycares~=4.11.0 anyio~=4.4 lib~=4.0 nltk~=3.9 @@ -8,4 +10,4 @@ Requests~=2.32 transformers~=4.52 utils==1.0.* hf_transfer>=0.1.9 -git+https://github.com/vast-ai/vast-sdk.git@worker-sdk \ No newline at end of file +vastai-sdk>=0.3.0 diff --git a/start_server.sh b/start_server.sh index 9522102..84656c2 100755 --- a/start_server.sh +++ b/start_server.sh @@ -114,7 +114,7 @@ then if ! uv venv --python-preference only-managed "$ENV_PATH" -p 3.10; then report_error_and_exit "Failed to create virtual environment" fi - + if ! source "$ENV_PATH/bin/activate"; then report_error_and_exit "Failed to activate virtual environment" fi diff --git a/workers/comfyui-json/README.md b/workers/comfyui-json/README.md index baa2711..c91b09d 100644 --- a/workers/comfyui-json/README.md +++ b/workers/comfyui-json/README.md @@ -1,8 +1,16 @@ # ComfyUI PyWorker -This is the base PyWorker for ComfyUI. It provides a unified interface for running any ComfyUI workflow through a proxy-based architecture. +This is the base PyWorker for ComfyUI. It provides a unified interface for running any ComfyUI workflow through a proxy-based architecture. See the [Serverless documentation](https://docs.vast.ai/serverless) for guides and how-to's. -The cost for each request has a static value of `100`. ComfyUI does not handle concurrent workloads and there is no current provision to load multiple instances of ComfyUI per worker node. +The cost for each request has a static value of `100`. ComfyUI does not handle concurrent workloads and there is no current provision to load multiple instances of ComfyUI per worker node. + +## Instance Setup + +1. Pick a template + +- [ComfyUI (Serverless)](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=ComfyUI%20(Serverless)) + +2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface. ## Requirements @@ -10,6 +18,137 @@ This worker requires both [ComfyUI](https://github.com/comfyanonymous/ComfyUI) a A docker image is provided but you may use any if the above requirements are met. +## Client + +The client demonstrates how to use the Vast Serverless SDK to generate images, save them locally, and optionally upload to S3-compatible storage. + +### Setup + +1. Clone the PyWorker repository to your local machine and install the necessary requirements for running the test client. + +```bash +git clone https://github.com/vast-ai/pyworker +cd pyworker +pip install uv +uv venv -p 3.12 +source .venv/bin/activate +uv pip install -r requirements.txt +``` + +2. Set your API key: + +```bash +export VAST_API_KEY= +``` + +### Usage + +```bash +# Default prompt +python -m workers.comfyui-json.client + +# Custom prompt +python -m workers.comfyui-json.client --prompt "a cat sitting on a rainbow" + +# With options +python -m workers.comfyui-json.client --prompt "sunset" --width 1024 --height 1024 --steps 30 + +# Using a custom workflow file +python -m workers.comfyui-json.client --workflow my_workflow.json + +# With S3 upload +python -m workers.comfyui-json.client --s3 +``` + +### CLI Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--endpoint` | `my-comfyui-endpoint` | Vast endpoint name | +| `--prompt` | (default) | Text prompt for image generation | +| `--workflow` | (none) | Path to custom workflow JSON file | +| `--width` | 512 | Image width in pixels | +| `--height` | 512 | Image height in pixels | +| `--steps` | 20 | Number of denoising steps | +| `--seed` | (random) | Random seed for reproducibility | +| `--s3` | (disabled) | Upload generated images to S3 | + +### Output + +Images are saved to `./generated_images/comfy_{seed}.png`. + +### S3 Upload (Optional) + +You can optionally upload generated images to an S3-compatible storage service (AWS S3, Cloudflare R2, Backblaze B2, etc.) by using the `--s3` flag. + +**1. Set environment variables:** + +```bash +export S3_ENDPOINT_URL="https://your-account.r2.cloudflarestorage.com" +export S3_BUCKET_NAME="my-bucket" +export S3_ACCESS_KEY_ID="your-access-key-id" +export S3_SECRET_ACCESS_KEY="your-secret-access-key" +``` + +**2. Run with S3 upload enabled:** + +```bash +python -m workers.comfyui-json.client --prompt "a beautiful landscape" --s3 +``` + +Images will be saved locally AND uploaded to `s3://{bucket}/comfyui/{filename}`. + +**Note:** Requires `boto3` (`pip install boto3`). + +## Benchmarking + +### Custom Benchmark Workflows + +You can provide a custom ComfyUI workflow for benchmarking by creating `workers/comfyui-json/misc/benchmark.json`. This allows you to test performance using your preferred models and workflow complexity. + +**Ways to provide the benchmark file:** +- Fork this repository and add your `benchmark.json` file +- Write the file during worker provisioning (onstart script or setup phase) + +An example file is provided in the repository. To ensure varied generations, use the placeholder `__RANDOM_INT__` in place of static seed values - it will be replaced with a random integer for each benchmark run. + +### Default Benchmark (Fallback) + +If `benchmark.json` is not available, a simple image generation benchmark runs when each worker initializes. This validates GPU performance and helps identify underperforming machines. + +The default benchmark uses Stable Diffusion v1.5 with ComfyUI's standard text-to-image workflow. Configure it using these environment variables: + +| Environment Variable | Default Value | Description | +| -------------------- | ------------- | ----------- | +| BENCHMARK_TEST_WIDTH | 512 | Image width (pixels) | +| BENCHMARK_TEST_HEIGHT | 512 | Image height (pixels) | +| BENCHMARK_TEST_STEPS | 20 | Number of denoising steps | + +Each benchmark run uses a random prompt from `misc/test_prompts.txt` and a random seed to ensure consistent GPU load patterns. + +#### Calibrating Fallback Benchmark Duration + +To screen for underperforming hardware, set `BENCHMARK_TEST_STEPS` to match your expected production workflow duration. This allows you to identify machines that won't meet performance requirements. + +**Example:** If your typical workflow should complete in 90 seconds on acceptable hardware: + +```bash +# 1. Measure it/sec on your reference machine +# RTX 4090 typically achieves ~43 it/sec with SD1.5 + +# 2. Calculate required steps +# 90 seconds Ɨ 43 it/sec = 3870 steps + +# 3. Configure benchmark +export BENCHMARK_TEST_STEPS=3870 + +# 4. Machines completing significantly slower than 90s indicate hardware issues +``` + +**Performance expectations:** +- Benchmark duration should remain consistent across identical GPU models +- Significant variation (>20%) may indicate thermal, power, or configuration issues + ## Endpoint The worker provides a single endpoint: @@ -170,4 +309,4 @@ See the client example for implementation details on how to integrate with the C --- -See Vast's serverless documentation for more details on how to use ComfyUI with autoscaler. \ No newline at end of file +See Vast's serverless documentation for more details on how to use ComfyUI with autoscaler. diff --git a/workers/comfyui-json/client.py b/workers/comfyui-json/client.py index 07a92b8..d79b30d 100644 --- a/workers/comfyui-json/client.py +++ b/workers/comfyui-json/client.py @@ -1,34 +1,312 @@ +import os +import sys +import json import uuid import random import asyncio -import random +import logging +import argparse +import aiohttp from vastai import Serverless -async def main(): - async with Serverless() as client: - endpoint = await client.get_endpoint(name="my-comfy-endpoint") # Change this to your endpoint name +# ---------------------- Config ---------------------- +DEFAULT_PROMPT = "a beautiful sunset over mountains, digital art, highly detailed" +ENDPOINT_NAME = "my-comfyui-endpoint" +DEFAULT_WIDTH = 512 +DEFAULT_HEIGHT = 512 +DEFAULT_STEPS = 20 +COST = 100 # Fixed cost for ComfyUI requests - payload = { - "input": { - "request_id": str(uuid.uuid4()), - "modifier": "Text2Image", - "modifications": { - "prompt": "a beautiful landscape with mountains and lakes", - "width": 1024, - "height": 1024, - "steps": 20, - "seed": random.randint(0, 2**32 - 1) - }, - "workflow_json": {} # Empty since using modifier approach - } +# Optional S3 Configuration (from environment variables) +S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL") +S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME") +S3_ACCESS_KEY_ID = os.getenv("S3_ACCESS_KEY_ID") +S3_SECRET_ACCESS_KEY = os.getenv("S3_SECRET_ACCESS_KEY") + +logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") +log = logging.getLogger(__name__) + + +def get_s3_client(): + """Create and return an S3 client configured for the S3-compatible endpoint""" + try: + import boto3 + from botocore.config import Config + except ImportError: + log.error("boto3 is required for S3 uploads. Install with: pip install boto3") + return None + + if not all([S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY]): + log.error("S3 environment variables not fully configured. Required:") + log.error(" S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY") + return None + + return boto3.client( + "s3", + endpoint_url=S3_ENDPOINT_URL, + aws_access_key_id=S3_ACCESS_KEY_ID, + aws_secret_access_key=S3_SECRET_ACCESS_KEY, + config=Config(signature_version="s3v4"), + ) + + +# ---------------------- API Functions ---------------------- +async def call_generate( + client: Serverless, + *, + endpoint_name: str, + prompt: str, + width: int, + height: int, + steps: int, + seed: int, +) -> dict: + """Generate image using Text2Image modifier""" + endpoint = await client.get_endpoint(name=endpoint_name) + payload = { + "input": { + "request_id": str(uuid.uuid4()), + "modifier": "Text2Image", + "modifications": { + "prompt": prompt, + "width": width, + "height": height, + "steps": steps, + "seed": seed, + }, } - - response = await endpoint.request("/generate/sync", payload, cost=100) + } + return await endpoint.request("/generate/sync", payload, cost=COST) + + +async def call_generate_workflow( + client: Serverless, + *, + endpoint_name: str, + workflow_json: dict, +) -> dict: + """Generate using custom workflow JSON""" + endpoint = await client.get_endpoint(name=endpoint_name) + payload = { + "input": { + "request_id": str(uuid.uuid4()), + "workflow_json": workflow_json, + } + } + return await endpoint.request("/generate/sync", payload, cost=COST) + + +# ---------------------- Demo Class ---------------------- +class APIDemo: + def __init__(self, client: Serverless, endpoint_name: str, upload_s3: bool = False): + self.client = client + self.endpoint_name = endpoint_name + self.upload_s3 = upload_s3 + self.s3_client = get_s3_client() if upload_s3 else None + + if upload_s3 and not self.s3_client: + log.warning("S3 upload requested but client creation failed. Images will only be saved locally.") + + def extract_filename(self, response: dict) -> str | None: + """Extract the generated image filename from ComfyUI response""" + if "comfyui_response" in response: + for data in response["comfyui_response"].values(): + if isinstance(data, dict) and "outputs" in data: + for node_output in data["outputs"].values(): + if "images" in node_output and node_output["images"]: + return node_output["images"][0].get("filename") + return None + + async def save_image(self, worker_url: str, filename: str, local_name: str) -> str | None: + """Fetch and save image locally from the worker, optionally upload to S3""" + os.makedirs("generated_images", exist_ok=True) + return await self._fetch_image(worker_url, filename, local_name) + + def _upload_to_s3(self, local_path: str, s3_key: str) -> str | None: + """Upload a local file to S3 and return the S3 URL""" + if not self.s3_client: + return None + + try: + self.s3_client.upload_file( + local_path, + S3_BUCKET_NAME, + s3_key, + ExtraArgs={"ContentType": "image/png"} + ) + s3_url = f"{S3_ENDPOINT_URL}/{S3_BUCKET_NAME}/{s3_key}" + print(f" ā˜ļø Uploaded to S3: {s3_key}") + return s3_url + except Exception as e: + log.error(f"Failed to upload to S3: {e}") + return None + + async def _fetch_image(self, worker_url: str, filename: str, local_name: str) -> str | None: + """Fetch image from worker's /view endpoint and save locally""" + if not worker_url: + return None + + try: + url = f"{worker_url}/view" + params = {"filename": filename, "type": "output"} + + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params, ssl=False) as resp: + if resp.status == 200: + path = f"generated_images/{local_name}" + image_data = await resp.read() + with open(path, "wb") as f: + f.write(image_data) + print(f" šŸ’¾ Saved: {path}") + + # Upload to S3 if enabled + if self.upload_s3 and self.s3_client: + s3_key = f"comfyui/{local_name}" + self._upload_to_s3(path, s3_key) + + return path + return None + except Exception: + return None + + async def demo_prompt( + self, + prompt: str, + width: int, + height: int, + steps: int, + seed: int | None, + ): + """Demo: Generate image from text prompt""" + print("=" * 60) + print("COMFYUI TEXT-TO-IMAGE DEMO") + print("=" * 60) + + if seed is None: + seed = random.randint(0, 2**32 - 1) + + print(f"Prompt: {prompt[:100]}..." if len(prompt) > 100 else f"Prompt: {prompt}") + print(f"Size: {width}x{height}, Steps: {steps}, Seed: {seed}") + print("\nšŸŽØ Generating image...") + + response = await call_generate( + self.client, + endpoint_name=self.endpoint_name, + prompt=prompt, + width=width, + height=height, + steps=steps, + seed=seed, + ) + + print("\nāœ… Generation complete!") + + # Get worker URL for fetching images + worker_url = response.get("url", "") + print(f"Worker URL: {worker_url}") + + # Fetch and save image + if "response" in response: + filename = self.extract_filename(response["response"]) + if filename: + path = await self.save_image(worker_url, filename, f"comfy_{seed}.png") + if not path: + print(f"āŒ Failed to fetch image") + else: + print("āŒ No image in response") + else: + print("āŒ Unexpected response format") + + async def demo_workflow(self, workflow_file: str): + """Demo: Generate using custom workflow file""" + print("=" * 60) + print("COMFYUI CUSTOM WORKFLOW DEMO") + print("=" * 60) + + if not os.path.exists(workflow_file): + log.error(f"Workflow file not found: {workflow_file}") + return + + with open(workflow_file, "r") as f: + workflow_json = json.load(f) + + print(f"Workflow: {workflow_file}") + print("\nšŸŽØ Generating...") + + response = await call_generate_workflow( + self.client, + endpoint_name=self.endpoint_name, + workflow_json=workflow_json, + ) + + print("\nāœ… Generation complete!") + + worker_url = response.get("url", "") + + if "response" in response: + filename = self.extract_filename(response["response"]) + if filename: + path = await self.save_image(worker_url, filename, "workflow.png") + if not path: + print(f"āŒ Failed to fetch image") + else: + print("āŒ No image in response") + else: + print("āŒ Unexpected response format") + + +# ---------------------- CLI ---------------------- +def build_arg_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description="Vast ComfyUI-JSON Demo (Serverless SDK)") + p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})") + p.add_argument("--prompt", type=str, default=DEFAULT_PROMPT, metavar="TEXT", + help=f"Prompt text (default: '{DEFAULT_PROMPT[:30]}...')") + p.add_argument("--workflow", type=str, metavar="FILE", help="Use custom workflow JSON file instead") + p.add_argument("--width", type=int, default=DEFAULT_WIDTH, help=f"Image width (default: {DEFAULT_WIDTH})") + p.add_argument("--height", type=int, default=DEFAULT_HEIGHT, help=f"Image height (default: {DEFAULT_HEIGHT})") + p.add_argument("--steps", type=int, default=DEFAULT_STEPS, help=f"Steps (default: {DEFAULT_STEPS})") + p.add_argument("--seed", type=int, default=None, help="Seed (default: random)") + p.add_argument("--s3", action="store_true", + help="Upload generated images to S3 (requires S3_ENDPOINT_URL, S3_BUCKET_NAME, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY env vars)") + return p + + +async def main_async(): + args = build_arg_parser().parse_args() + + print("=" * 60) + print(f"Using endpoint: {args.endpoint}") + if args.s3: + print(f"S3 upload: enabled (bucket: {S3_BUCKET_NAME})") + + try: + async with Serverless() as client: + demo = APIDemo(client, args.endpoint, upload_s3=args.s3) + + if args.workflow: + await demo.demo_workflow(workflow_file=args.workflow) + else: + await demo.demo_prompt( + prompt=args.prompt, + width=args.width, + height=args.height, + steps=args.steps, + seed=args.seed, + ) + + except AttributeError as e: + if "API key" in str(e): + log.error("API key missing. Set VAST_API_KEY environment variable.") + else: + log.error(f"Error: {e}") + sys.exit(1) + except Exception as e: + log.error(f"Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) - # Get the file from the path on the local machine using SCP or SFTP - # or configure S3 to upload to cloud storage. - print(response["response"]["output"][0]["local_path"]) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main_async()) diff --git a/workers/openai/README.md b/workers/openai/README.md index 2436784..f7596f3 100644 --- a/workers/openai/README.md +++ b/workers/openai/README.md @@ -8,14 +8,13 @@ This is the base PyWorker for OpenAI compatible inference servers. See the [Ser This worker is compatible with any backend API that properly implements the `/v1/completions` and `/v1/chat/completions` endpoints. We currently have three templates you can choose from but you can also create your own without having to modify the PyWorker. -- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20%2B%20Qwen%2FQwen3-8B%20(Serverless)) (recommended) +- [vLLM](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=vLLM%20(Serverless)) (recommended) - [Ollama](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=Ollama%20%2B%20Qwen3%3A32b%20(Serverless)) -- [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20%2B%20Qwen3-8B%20(Serverless)) All of these templates can be configured via the template interface. You may want to change the model or startup arguments, depending on the template you selected. -2. Follow the [getting started guide](https://docs.vast.ai/serverless/getting-started) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface. +2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface. ## Client Setup (Demo) @@ -34,38 +33,20 @@ uv pip install -r requirements.txt Several examples have been provided in the client to help you get started with your own implementation. -### Completions - -Call to `/v1/completions` with json response +First, set your API key as an environment variable: ```bash -python -m workers.openai.client -k -e --completion --model +export VAST_API_KEY= ``` -### Chat Completion (json) - -Call to `/v1/chat/completions` with json response - -```bash -python -m workers.openai.client -k -e --chat --model -``` +The `--model` and `--endpoint` flags are optional. If not provided, they default to `Qwen/Qwen3-8B` and `my-vllm-endpoint` respectively. ### Chat Completion (streaming) Call to `/v1/chat/completions` with streaming response ```bash -python -m workers.openai.client -k -e --chat-stream --model -``` - -### Tool Use (json) - -Call to `/v1/chat/completions` with tool and json response. - -This test defines a simple tool which will list the contents of the local pyworker directory. The output is then analysed by the model. - -```bash -python -m workers.openai.client -k -e --tools --model +python -m workers.openai.client --chat-stream --endpoint --model ``` ### Interactive Chat (streaming) @@ -75,6 +56,32 @@ Interactive session with calls to `/v1/chat/completions`. Type `clear` to clear the chat history or `quit` to exit. ```bash -python -m workers.openai.client -k -e --interactive --model +python -m workers.openai.client --interactive --endpoint --model +``` + +### Chat Completion (json) + +Call to `/v1/chat/completions` with json response + +```bash +python -m workers.openai.client --chat --endpoint --model +``` + +### Tool Use (json) + +Call to `/v1/chat/completions` with tool and json response. + +This test defines a simple tool which will list the contents of the local pyworker directory. The output is then analysed by the model. + +```bash +python -m workers.openai.client --tools --endpoint --model +``` + +### Completions + +Call to `/v1/completions` with json response + +```bash +python -m workers.openai.client --completion --endpoint --model ``` diff --git a/workers/openai/client.py b/workers/openai/client.py index eb1d989..2385aef 100644 --- a/workers/openai/client.py +++ b/workers/openai/client.py @@ -18,7 +18,7 @@ logging.basicConfig( log = logging.getLogger(__file__) # ---------------------- Prompts ---------------------- -COMPLETIONS_PROMPT = "the capital of USA is" +COMPLETIONS_PROMPT = "Zebras are primarily grazers and can subsist on lower-quality vegetation. They are preyed on mainly by" CHAT_PROMPT = "Think step by step: Tell me about the Python programming language." TOOLS_PROMPT = ( "Can you list the files in the current working directory and tell me what you see? " @@ -97,9 +97,9 @@ def _tool_state_to_message_tool_calls(state: Dict[int, Dict[str, Any]]) -> List[ # ---- OpenAI-compatible calls (non-streaming) ---- -async def call_completions(client: Serverless, *, model: str, prompt: str, **kwargs) -> Dict[str, Any]: +async def call_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs) -> Dict[str, Any]: - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) + endpoint = await client.get_endpoint(name=endpoint_name) payload = { "model": model, @@ -111,9 +111,9 @@ async def call_completions(client: Serverless, *, model: str, prompt: str, **kwa resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"]) return resp["response"] -async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs) -> Dict[str, Any]: +async def call_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs) -> Dict[str, Any]: - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) + endpoint = await client.get_endpoint(name=endpoint_name) payload = { "model": model, @@ -128,9 +128,9 @@ async def call_chat_completions(client: Serverless, *, model: str, messages: Lis return resp["response"] # ---- Streaming variants ---- -async def stream_completions(client: Serverless, *, model: str, prompt: str, **kwargs): +async def stream_completions(client: Serverless, *, model: str, prompt: str, endpoint_name: str, **kwargs): - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) + endpoint = await client.get_endpoint(name=endpoint_name) payload = { "model": model, @@ -144,9 +144,9 @@ async def stream_completions(client: Serverless, *, model: str, prompt: str, **k resp = await endpoint.request("/v1/completions", payload, cost=payload["max_tokens"], stream=True) return resp["response"] # async generator -async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], **kwargs): +async def stream_chat_completions(client: Serverless, *, model: str, messages: List[Dict[str, Any]], endpoint_name: str, **kwargs): - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) + endpoint = await client.get_endpoint(name=endpoint_name) payload = { "model": model, @@ -166,9 +166,10 @@ async def stream_chat_completions(client: Serverless, *, model: str, messages: L class APIDemo: """Demo and testing functionality for the API client""" - def __init__(self, client: Serverless, model: str, tool_manager: Optional[ToolManager] = None): + def __init__(self, client: Serverless, model: str, endpoint_name: str, tool_manager: Optional[ToolManager] = None): self.client = client self.model = model + self.endpoint_name = endpoint_name self.tool_manager = tool_manager or ToolManager() # ----- Streaming handler ----- @@ -177,10 +178,15 @@ class APIDemo: reasoning_content = "" printed_reasoning = False printed_answer = False + finish_reason = None async for chunk in stream: choice = (chunk.get("choices") or [{}])[0] delta = choice.get("delta", {}) + + # Track finish reason + if choice.get("finish_reason"): + finish_reason = choice.get("finish_reason") # reasoning tokens rc = delta.get("reasoning_content") @@ -211,6 +217,8 @@ class APIDemo: print(f"Reasoning tokens: {len(reasoning_content.split())}") if printed_answer: print(f"Response tokens: {len(full_response.split())}") + if finish_reason: + print(f"Finish reason: {finish_reason}") return full_response @@ -223,6 +231,7 @@ class APIDemo: client=self.client, model=self.model, prompt=COMPLETIONS_PROMPT, + endpoint_name=self.endpoint_name, max_tokens=MAX_TOKENS, temperature=DEFAULT_TEMPERATURE, ) @@ -241,6 +250,7 @@ class APIDemo: client=self.client, model=self.model, messages=messages, + endpoint_name=self.endpoint_name, max_tokens=MAX_TOKENS, temperature=DEFAULT_TEMPERATURE ) @@ -253,6 +263,7 @@ class APIDemo: client=self.client, model=self.model, messages=messages, + endpoint_name=self.endpoint_name, max_tokens=MAX_TOKENS, temperature=DEFAULT_TEMPERATURE ) @@ -279,6 +290,7 @@ class APIDemo: client=self.client, model=self.model, messages=messages, + endpoint_name=self.endpoint_name, tools=minimal_tool, tool_choice="none", max_tokens=10 @@ -304,6 +316,7 @@ class APIDemo: client=self.client, model=self.model, messages=messages, + endpoint_name=self.endpoint_name, tools=self.tool_manager.get_ls_tool_definition(), tool_choice="auto", max_tokens=MAX_TOKENS, @@ -381,6 +394,7 @@ class APIDemo: client=self.client, model=self.model, messages=messages, + endpoint_name=self.endpoint_name, max_tokens=MAX_TOKENS, temperature=DEFAULT_TEMPERATURE, ) @@ -419,7 +433,6 @@ class APIDemo: print("=" * 60) print("INTERACTIVE STREAMING CHAT") print("=" * 60) - print(f"Using model: {self.model}") print("Type 'quit' to exit, 'clear' to clear history") print() @@ -445,7 +458,8 @@ class APIDemo: stream = await stream_chat_completions( client=self.client, model=self.model, - messages=messages, + messages=messages, + endpoint_name=self.endpoint_name, max_tokens=MAX_TOKENS, temperature=0.7 ) @@ -465,8 +479,8 @@ class APIDemo: # ---------------------- CLI ---------------------- def build_arg_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser(description="Vast vLLM Demo (Serverless SDK)") - p.add_argument("--model", required=True, help="Model to use for requests (required)") - p.add_argument("--endpoint", default="my-vllm-endpoint", help="Vast endpoint name (default: my-vllm-endpoint)") + p.add_argument("--model", default=DEFAULT_MODEL, help=f"Model to use for requests (default: {DEFAULT_MODEL})") + p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})") modes = p.add_mutually_exclusive_group(required=False) modes.add_argument("--completion", action="store_true", help="Test completions endpoint") @@ -494,12 +508,14 @@ async def main_async(): print("Please specify exactly one test mode") sys.exit(1) - print(f"Using model: {args.model}") print("=" * 60) + print(f"Using model: {args.model}") + print(f"Using endpoint: {args.endpoint}") + try: async with Serverless() as client: - demo = APIDemo(client, args.model, ToolManager()) + demo = APIDemo(client, args.model, args.endpoint, ToolManager()) if args.completion: await demo.demo_completions() diff --git a/workers/tgi/README.md b/workers/tgi/README.md index 5cf8488..9147e38 100644 --- a/workers/tgi/README.md +++ b/workers/tgi/README.md @@ -1,19 +1,103 @@ -This is the base PyWorker for TGI, designed to create PyWorkers that can utilize various LLMs. It offers two primary endpoints: +# HuggingFace TGI PyWorker -1. `generate`: Generates the LLM's response to a given prompt in a single request. -2. `generate_stream`: Streams the LLM's response token by token. +This is the base PyWorker for HuggingFace Text Generation Inference (TGI) servers. See the [Serverless documentation](https://docs.vast.ai/serverless) for guides and how-to's. -Both endpoints use the following API payload format: +## Instance Setup + +1. Pick a template + +This worker is compatible with any TGI backend. We have a template you can use or you can create your own. + +- [HuggingFace TGI](https://cloud.vast.ai/?ref_id=62897&creator_id=62897&name=TGI%20(Serverless)) + +The template can be configured via the template interface. You may want to change the model or startup arguments. + +2. Follow the [getting started guide](https://docs.vast.ai/documentation/serverless/quickstart) for help with configuring your serverless setup. For testing, we recommend that you use the default options presented by the web interface. + +## Client Setup (Demo) + +1. Clone the PyWorker repository to your local machine and install the necessary requirements for running the test client. + +```bash +git clone https://github.com/vast-ai/pyworker +cd pyworker +pip install uv +uv venv -p 3.12 +source .venv/bin/activate +uv pip install -r requirements.txt +``` + +## Using the Test Client + +The test client demonstrates both streaming and non-streaming generation using TGI's native API. + +First, set your API key as an environment variable: + +```bash +export VAST_API_KEY= +``` + +The `--endpoint` flag is optional. If not provided, it defaults to `my-tgi-endpoint`. + +### Generate (Streaming) + +Call to `/generate_stream` with streaming response: + +```bash +python -m workers.tgi.client --generate-stream --endpoint +``` + +### Generate (Non-Streaming) + +Call to `/generate` with json response: + +```bash +python -m workers.tgi.client --generate --endpoint +``` + +### Interactive Session (Streaming) + +Interactive session with streaming responses. Type `quit` to exit. + +```bash +python -m workers.tgi.client --interactive --endpoint +``` + +## API Endpoints + +TGI provides two primary endpoints: + +### Generate (Non-Streaming) + +`/generate` - Returns the complete response in a single request. ```json { - "inputs": "PROMPT", + "inputs": "Your prompt here", "parameters": { - "max_new_tokens": 250 + "max_new_tokens": 1024, + "temperature": 0.7, + "return_full_text": false } } ``` -Note that the max_new_tokens parameter, rather than the prompt size, impacts performance. For example, if an -instance is benchmarked to process 100 tokens per second, a request with max_new_tokens = 200 will take -approximately 2 seconds to complete. +### Generate Stream (Streaming) + +`/generate_stream` - Streams the response token by token. + +```json +{ + "inputs": "Your prompt here", + "parameters": { + "max_new_tokens": 1024, + "temperature": 0.7, + "do_sample": true, + "return_full_text": false + } +} +``` + +## Performance Notes + +The `max_new_tokens` parameter (not the prompt size) primarily impacts performance. For example, if an instance is benchmarked to process 100 tokens per second, a request with `max_new_tokens = 200` will take approximately 2 seconds to complete. diff --git a/workers/tgi/client.py b/workers/tgi/client.py index f307602..23b40c2 100644 --- a/workers/tgi/client.py +++ b/workers/tgi/client.py @@ -1,61 +1,222 @@ +import logging +import json +import os +import sys +import argparse + from vastai import Serverless import asyncio -ENDPOINT_NAME = "my-tgi-endpoint" # Change this to match your endpoint name -MAX_TOKENS = 1024 -PROMPT = "Think step by step: Tell me about the Python programming language." +# ---------------------- Logging ---------------------- +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s[%(levelname)-5s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +log = logging.getLogger(__file__) -async def call_generate(client: Serverless) -> None: - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) +# ---------------------- Defaults ---------------------- +DEFAULT_PROMPT = "Think step by step: Tell me about the Python programming language." + +ENDPOINT_NAME = "TGI-Prod2" # change this to your TGI endpoint name +MAX_TOKENS = 1024 +DEFAULT_TEMPERATURE = 0.7 + + +# ---------------------- API Calls ---------------------- +async def call_generate(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs) -> dict: + """Non-streaming generation via /generate endpoint""" + endpoint = await client.get_endpoint(name=endpoint_name) payload = { - "inputs": PROMPT, + "inputs": prompt, "parameters": { - "max_new_tokens": MAX_TOKENS, - "temperature": 0.7, - "return_full_text": False + "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS), + "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE), + "return_full_text": False, } } - - resp = await endpoint.request("/generate", payload, cost=MAX_TOKENS) - - print(resp["response"]["generated_text"]) + log.debug("POST /generate %s", json.dumps(payload)[:500]) + resp = await endpoint.request("/generate", payload, cost=payload["parameters"]["max_new_tokens"]) + return resp["response"] -async def call_generate_stream(client: Serverless) -> None: - endpoint = await client.get_endpoint(name=ENDPOINT_NAME) +async def call_generate_stream(client: Serverless, *, endpoint_name: str, prompt: str, **kwargs): + """Streaming generation via /generate_stream endpoint""" + endpoint = await client.get_endpoint(name=endpoint_name) payload = { - "inputs": PROMPT, + "inputs": prompt, "parameters": { - "max_new_tokens": MAX_TOKENS, - "temperature": 0.7, + "max_new_tokens": kwargs.get("max_tokens", MAX_TOKENS), + "temperature": kwargs.get("temperature", DEFAULT_TEMPERATURE), "do_sample": True, "return_full_text": False, } } - + log.debug("STREAM /generate_stream %s", json.dumps(payload)[:500]) resp = await endpoint.request( "/generate_stream", payload, - cost=MAX_TOKENS, + cost=payload["parameters"]["max_new_tokens"], stream=True, ) - stream = resp["response"] + return resp["response"] # async generator - printed_answer = False - async for event in stream: - tok = (event.get("token") or {}).get("text") - if tok: - if not printed_answer: - printed_answer = True - print("Answer:\n", end="", flush=True) - print(tok, end="", flush=True) -async def main(): - async with Serverless() as client: - await call_generate(client) - await call_generate_stream(client) +# ---------------------- Demo Runner ---------------------- +class APIDemo: + """Demo and testing functionality for the TGI API client""" + + def __init__(self, client: Serverless, endpoint_name: str): + self.client = client + self.endpoint_name = endpoint_name + + async def handle_streaming_response(self, stream) -> str: + """Process streaming response and print tokens""" + full_response = "" + printed_answer = False + + async for event in stream: + tok = (event.get("token") or {}).get("text") + if tok: + if not printed_answer: + printed_answer = True + print("\nšŸ’¬ Response: ", end="", flush=True) + print(tok, end="", flush=True) + full_response += tok + + print() # newline + if printed_answer: + print(f"\nStreaming completed. Response tokens: {len(full_response.split())}") + + return full_response + + async def demo_generate(self) -> None: + """Demo non-streaming generation""" + print("=" * 60) + print("GENERATE DEMO (NON-STREAMING)") + print("=" * 60) + + response = await call_generate( + client=self.client, + endpoint_name=self.endpoint_name, + prompt=DEFAULT_PROMPT, + max_tokens=MAX_TOKENS, + temperature=DEFAULT_TEMPERATURE, + ) + + print(f"\nšŸ’¬ Response: {response.get('generated_text', '')}") + print(f"\nFull Response:\n{json.dumps(response, indent=2)}") + + async def demo_generate_stream(self) -> None: + """Demo streaming generation""" + print("=" * 60) + print("GENERATE DEMO (STREAMING)") + print("=" * 60) + + stream = await call_generate_stream( + client=self.client, + endpoint_name=self.endpoint_name, + prompt=DEFAULT_PROMPT, + max_tokens=MAX_TOKENS, + temperature=DEFAULT_TEMPERATURE, + ) + + try: + await self.handle_streaming_response(stream) + except Exception as e: + log.error("\nError during streaming: %s", e, exc_info=True) + + async def interactive_chat(self) -> None: + """Interactive session with streaming generation""" + print("=" * 60) + print("INTERACTIVE STREAMING SESSION") + print("=" * 60) + print(f"Using endpoint: {self.endpoint_name}") + print("Type 'quit' to exit") + print() + + while True: + try: + user_input = input("You: ").strip() + + if user_input.lower() == "quit": + print("šŸ‘‹ Goodbye!") + break + elif not user_input: + continue + + print("Assistant: ", end="", flush=True) + stream = await call_generate_stream( + client=self.client, + endpoint_name=self.endpoint_name, + prompt=user_input, + max_tokens=MAX_TOKENS, + temperature=DEFAULT_TEMPERATURE, + ) + + full_response = "" + async for event in stream: + tok = (event.get("token") or {}).get("text") + if tok: + print(tok, end="", flush=True) + full_response += tok + print() # newline + + except KeyboardInterrupt: + print("\nšŸ‘‹ Session interrupted. Goodbye!") + break + except Exception as e: + log.error("\nError: %s", e) + continue + + +# ---------------------- CLI ---------------------- +def build_arg_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description="Vast TGI Demo (Serverless SDK)") + p.add_argument("--endpoint", default=ENDPOINT_NAME, help=f"Vast endpoint name (default: {ENDPOINT_NAME})") + + modes = p.add_mutually_exclusive_group(required=False) + modes.add_argument("--generate", action="store_true", help="Test generate endpoint (non-streaming)") + modes.add_argument("--generate-stream", action="store_true", help="Test generate endpoint with streaming") + modes.add_argument("--interactive", action="store_true", help="Start interactive streaming session") + return p + + +async def main_async(): + args = build_arg_parser().parse_args() + + selected = sum([args.generate, args.generate_stream, args.interactive]) + if selected == 0: + print("Please specify exactly one test mode:") + print(" --generate : Test generate endpoint (non-streaming)") + print(" --generate-stream : Test generate endpoint with streaming") + print(" --interactive : Start interactive streaming session") + print(f"\nExample: python {os.path.basename(sys.argv[0])} --generate-stream --endpoint my-tgi-endpoint") + sys.exit(1) + elif selected > 1: + print("Please specify exactly one test mode") + sys.exit(1) + + print("=" * 60) + print(f"Using endpoint: {args.endpoint}") + + try: + async with Serverless() as client: + demo = APIDemo(client, args.endpoint) + + if args.generate: + await demo.demo_generate() + elif args.generate_stream: + await demo.demo_generate_stream() + elif args.interactive: + await demo.interactive_chat() + + except Exception as e: + log.error("Error during test: %s", e, exc_info=True) + sys.exit(1) + if __name__ == "__main__": - asyncio.run(main()) + asyncio.run(main_async())