Merge pull request #1 from Nader-gator/main

add pyworker v2
This commit is contained in:
Nader Arbabian
2024-09-04 11:53:45 -07:00
committed by Nader Arbabian
31 changed files with 3000 additions and 1 deletions
+19
View File
@@ -0,0 +1,19 @@
This is the base PyWorker for TGI, designed to create PyWorkers that can utilize various LLMs. It offers two primary endpoints:
1. `generate`: Generates the LLM's response to a given prompt in a single request.
2. `generate_stream`: Streams the LLM's response token by token.
Both endpoints use the following API payload format:
```json
{
"inputs": "PROMPT",
"parameters": {
"max_new_tokens": 250
}
}
```
Note that performance is determined by the `max_new_tokens` parameter rather than by the prompt size. For example,
if an instance is benchmarked at 100 tokens per second, a request with `max_new_tokens = 200` will take
approximately 2 seconds to complete.
View File
+91
View File
@@ -0,0 +1,91 @@
import sys
import json
from urllib.parse import urljoin
import requests
def call_generate(endpoint_group_name: str, api_key: str, server_url: str) -> None:
    """Request a worker from the routing server and call its ``/generate`` endpoint.

    The routing server at ``server_url`` is asked (``POST /route/``) for a
    worker; its JSON reply supplies the worker URL and the signed fields that
    are forwarded to the worker as ``auth_data``. The worker's JSON response
    is printed to stdout.

    Args:
        endpoint_group_name: Sent as ``endpoint`` in the route request.
        api_key: Sent as ``api_key`` in the route request.
        server_url: Base URL of the routing server.

    Raises:
        requests.HTTPError: If the routing server or the worker answers with
            an error status.
        KeyError: If the route reply lacks one of the expected fields.
    """
    WORKER_ENDPOINT = "/generate"
    COST = 100
    route_payload = {
        "endpoint": endpoint_group_name,
        "api_key": api_key,
        "cost": COST,
    }
    response = requests.post(
        urljoin(server_url, "/route/"),
        json=route_payload,
        timeout=4,
    )
    # Surface the HTTP status instead of failing later with an opaque
    # KeyError / JSON decoding error on an error body.
    response.raise_for_status()
    message = response.json()
    url = message["url"]
    auth_data = dict(
        signature=message["signature"],
        cost=message["cost"],
        endpoint=message["endpoint"],
        reqnum=message["reqnum"],
        url=message["url"],
    )
    payload = dict(inputs="tell me about cats", parameters=dict(max_new_tokens=500))
    req_data = dict(payload=payload, auth_data=auth_data)
    url = urljoin(url, WORKER_ENDPOINT)
    print(f"url: {url}")
    # NOTE(review): no timeout is set here; generation time depends on
    # max_new_tokens, so pick a limit deliberately if one is added.
    response = requests.post(
        url,
        json=req_data,
    )
    # An error reply may have an empty body, which would make .json() fail
    # with a misleading decoding error; report the status instead.
    response.raise_for_status()
    res = response.json()
    print(res)
def call_generate_stream(endpoint_group_name: str, api_key: str, server_url: str) -> None:
    """Request a worker from the routing server and stream from ``/generate_stream``.

    The routing server at ``server_url`` is asked (``POST /route/``) for a
    worker; its JSON reply supplies the worker URL and the signed fields that
    are forwarded to the worker as ``auth_data``. Each streamed ``data:`` line
    is parsed as JSON and its ``token.text`` is printed as it arrives.

    Args:
        endpoint_group_name: Sent as ``endpoint`` in the route request.
        api_key: Sent as ``api_key`` in the route request.
        server_url: Base URL of the routing server.

    Raises:
        requests.HTTPError: If the routing server or the worker answers with
            an error status.
        KeyError: If the route reply or a streamed event lacks an expected field.
    """
    WORKER_ENDPOINT = "/generate_stream"
    COST = 100
    DATA_PREFIX = "data:"
    route_payload = {
        "endpoint": endpoint_group_name,
        "api_key": api_key,
        "cost": COST,
    }
    response = requests.post(
        urljoin(server_url, "/route/"),
        json=route_payload,
        timeout=4,
    )
    # Surface the HTTP status instead of failing later with an opaque
    # KeyError / JSON decoding error on an error body.
    response.raise_for_status()
    message = response.json()
    auth_data = dict(
        signature=message["signature"],
        cost=message["cost"],
        endpoint=message["endpoint"],
        reqnum=message["reqnum"],
        url=message["url"],
    )
    payload = dict(inputs="tell me about dogs", parameters=dict(max_new_tokens=500))
    req_data = dict(payload=payload, auth_data=auth_data)
    url = urljoin(message["url"], WORKER_ENDPOINT)
    # Print the full endpoint URL, matching call_generate.
    print(f"url: {url}")
    # The context manager releases the streamed connection even if parsing
    # fails part-way through.
    with requests.post(url, json=req_data, stream=True) as stream:
        stream.raise_for_status()
        for line in stream.iter_lines():
            # removeprefix drops exactly the "data:" marker; str.lstrip("data:")
            # would strip any leading run of the characters d, a, t and ':'.
            event = line.decode().removeprefix(DATA_PREFIX).strip()
            if not event:
                # Blank separator lines between events carry no payload.
                continue
            data = json.loads(event)
            # Flush on every token so output appears as it is streamed.
            print(data["token"]["text"], end="", flush=True)
    print()
if __name__ == "__main__":
    from lib.test_utils import test_args

    # Parse the shared CLI arguments once and reuse them for both demo calls.
    cli = test_args.parse_args()
    common_kwargs = dict(
        api_key=cli.api_key,
        endpoint_group_name=cli.endpoint_group_name,
        server_url=cli.server_url,
    )
    # Non-streaming call first, then the streaming one.
    for demo_call in (call_generate, call_generate_stream):
        demo_call(**common_kwargs)
+73
View File
@@ -0,0 +1,73 @@
import dataclasses
import random
import inspect
from typing import Dict, Any
from transformers import AutoTokenizer
import nltk
from lib.data_types import ApiPayload, JsonDataException
# Fetch the NLTK "words" corpus at import time; it must be available before
# the corpus is read on the next line.
nltk.download("words")
# Word pool that InputData.for_test samples from to build benchmark prompts.
WORD_LIST = nltk.corpus.words.words()
# NOTE(review): loaded at import time but not referenced anywhere in the
# visible source — confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
@dataclasses.dataclass
class InputParameters:
    """Generation parameters that accompany a prompt."""

    max_new_tokens: int = 256

    @classmethod
    def from_json_msg(cls, json_msg: Dict[str, Any]) -> "InputParameters":
        """Build an instance from a decoded JSON object.

        Every constructor parameter must be present in ``json_msg``; keys that
        are not constructor parameters are ignored.

        Raises:
            JsonDataException: Carrying a ``{name: "missing parameter"}``
                mapping when any constructor parameter is absent.
        """
        expected = inspect.signature(cls).parameters
        missing = {
            name: "missing parameter" for name in expected if name not in json_msg
        }
        if missing:
            raise JsonDataException(missing)

        accepted = {}
        for key, value in json_msg.items():
            if key in expected:
                accepted[key] = value
        return cls(**accepted)
@dataclasses.dataclass
class InputData(ApiPayload):
    """Request payload: a prompt plus its generation parameters."""

    inputs: str
    parameters: InputParameters

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "InputData":
        """Build an instance from a dict with ``inputs`` and ``parameters`` keys."""
        params = InputParameters(**data["parameters"])
        return cls(inputs=data["inputs"], parameters=params)

    @classmethod
    def for_test(cls) -> "InputData":
        """Return a payload of 250 random corpus words with default parameters."""
        words = random.choices(WORD_LIST, k=250)
        return cls(inputs=" ".join(words), parameters=InputParameters())

    def generate_payload_json(self) -> Dict[str, Any]:
        """Return this payload, including nested parameters, as a plain dict."""
        return dataclasses.asdict(self)

    def count_workload(self) -> int:
        """Return the workload of this request, i.e. its ``max_new_tokens``."""
        return self.parameters.max_new_tokens

    @classmethod
    def from_json_msg(cls, json_msg: Dict[str, Any]) -> "InputData":
        """Validate a decoded JSON object and build an instance from it.

        Raises:
            JsonDataException: When a top-level field is missing, or with the
                nested error stored under ``"parameters"`` when the parameter
                object is invalid.
        """
        required = inspect.signature(cls).parameters
        missing = {
            name: "missing parameter" for name in required if name not in json_msg
        }
        if missing:
            raise JsonDataException(missing)

        try:
            params = InputParameters.from_json_msg(json_msg["parameters"])
            return cls(inputs=json_msg["inputs"], parameters=params)
        except JsonDataException as exc:
            # No top-level errors exist at this point, so the nested error is
            # the only entry reported.
            raise JsonDataException({"parameters": exc.message})
+115
View File
@@ -0,0 +1,115 @@
import os
import logging
from typing import Union, Type
import dataclasses
from aiohttp import web, ClientResponse
from lib.backend import Backend, LogAction
from lib.data_types import EndpointHandler
from lib.server import start_server
from .data_types import InputData
# URL of the model server, passed to Backend as model_server_url.
MODEL_SERVER_URL = "http://0.0.0.0:5001"
# Log fragment registered with LogAction.ModelLoaded: seeing it in the model
# log marks the model server as loaded. It matches the "Connected" message
# whose target is text_generation_router.
MODEL_SERVER_START_LOG_MSG = '"message":"Connected","target":"text_generation_router"'
# Log fragments registered with LogAction.ModelError.
MODEL_SERVER_ERROR_LOG_MSGS = ["Error: WebserverFailed", "Error: DownloadError"]

# Configure root logging at DEBUG with a timestamped, level-tagged format.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s[%(levelname)-5s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module logger; note that it is named after this file's path (__file__).
log = logging.getLogger(__file__)
@dataclasses.dataclass
class GenerateHandler(EndpointHandler[InputData]):
    """Handler for the non-streaming ``/generate`` endpoint."""

    @property
    def endpoint(self) -> str:
        """Endpoint path handled by this class."""
        return "/generate"

    @classmethod
    def payload_cls(cls) -> Type[InputData]:
        """Payload type accepted by this endpoint."""
        return InputData

    def make_benchmark_payload(self) -> InputData:
        """Return a randomly generated payload for benchmarking."""
        return InputData.for_test()

    async def generate_client_response(
        self, client_request: web.Request, model_response: ClientResponse
    ) -> Union[web.Response, web.StreamResponse]:
        """Turn the model server's reply into the response sent to the client.

        A 200 reply is relayed as JSON; any other status is returned as an
        empty response carrying that status code.
        """
        del client_request  # the request is not needed to build this reply
        status = model_response.status
        if status != 200:
            log.debug("SENDING RESPONSE: ERROR: unknown code")
            return web.Response(status=status)

        log.debug("SUCCESS")
        body = await model_response.json()
        return web.json_response(data=body)
class GenerateStreamHandler(EndpointHandler[InputData]):
    """Handler for the streaming ``/generate_stream`` endpoint."""

    @property
    def endpoint(self) -> str:
        """Endpoint path handled by this class."""
        return "/generate_stream"

    @classmethod
    def payload_cls(cls) -> Type[InputData]:
        """Payload type accepted by this endpoint."""
        return InputData

    def make_benchmark_payload(self) -> InputData:
        """Return a randomly generated payload for benchmarking."""
        return InputData.for_test()

    async def generate_client_response(
        self, client_request: web.Request, model_response: ClientResponse
    ) -> Union[web.Response, web.StreamResponse]:
        """Relay the model server's reply to the client.

        A 200 reply is forwarded chunk by chunk as ``text/event-stream``; any
        other status is returned as an empty response carrying that status.
        """
        status = model_response.status
        if status != 200:
            log.debug("SENDING RESPONSE: ERROR: unknown code")
            return web.Response(status=status)

        log.debug("Streaming response...")
        stream = web.StreamResponse()
        stream.content_type = "text/event-stream"
        # Headers must be sent before any body chunk is written.
        await stream.prepare(client_request)
        async for piece in model_response.content:
            await stream.write(piece)
        await stream.write_eof()
        log.debug("Done streaming response")
        return stream
# Backend wiring: model server location, model log file (taken from the
# MODEL_LOG environment variable), benchmark handler and the log fragments
# mapped to each LogAction.
backend = Backend(
    model_server_url=MODEL_SERVER_URL,
    model_log_file=os.environ["MODEL_LOG"],
    allow_parallel_requests=True,
    benchmark_handler=GenerateHandler(benchmark_runs=3, benchmark_words=256),
    log_actions=[
        (LogAction.ModelLoaded, MODEL_SERVER_START_LOG_MSG),
        (LogAction.Info, '"message":"Download'),
    ]
    + [(LogAction.ModelError, fragment) for fragment in MODEL_SERVER_ERROR_LOG_MSGS],
)
async def handle_ping(_):
    """Answer a ping with a ``pong`` body; the incoming request is ignored."""
    pong = web.Response(body="pong")
    return pong
# POST routes for the two generation endpoints, each wrapped by the backend,
# followed by the plain GET ping route.
routes = [
    web.post(path, backend.create_handler(handler_cls()))
    for path, handler_cls in (
        ("/generate", GenerateHandler),
        ("/generate_stream", GenerateStreamHandler),
    )
]
routes.append(web.get("/ping", handle_ping))

if __name__ == "__main__":
    start_server(backend, routes)
+7
View File
@@ -0,0 +1,7 @@
from lib.test_utils import test_load_cmd, test_args
from .data_types import InputData
# Worker endpoint passed to the load-test command below.
WORKER_ENDPOINT = "/generate"

if __name__ == "__main__":
    # Hand the payload class, the endpoint and the shared argument parser to
    # the project's load-test helper.
    test_load_cmd(InputData, WORKER_ENDPOINT, arg_parser=test_args)