# pyworker/workers/openai/worker.py

import random
import os
import re

from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig

# Ollama model configuration
# Base URL (scheme + host) of the model server.
MODEL_SERVER_URL = "http://127.0.0.1"
# TCP port of the model server.
MODEL_SERVER_PORT = 11434
# Log file handed to the worker configuration as the model log.
MODEL_LOG_FILE = "/var/log/onstart.log"
# Path handed to the worker configuration as the health-check URL.
MODEL_HEALTHCHECK_ENDPOINT = "/"

# Request parsing and benchmark payload helpers
def request_parser(request):
    """Return the payload carried by *request*.

    When the request has an ``"input"`` entry whose value is not ``None``,
    that value is returned; otherwise the request object itself is returned
    unchanged.
    """
    wrapped = request.get("input")
    return request if wrapped is None else wrapped


def _read_benchmark_words(path: str) -> list:
    """Return every run of word characters found in the text file at *path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, so the
    result does not depend on the process locale and a stray non-UTF-8 byte
    cannot abort the read with ``UnicodeDecodeError``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return re.findall(r'\b\w+\b', f.read())


def completions_benchmark_generator() -> dict:
    """Build a randomized ``/v1/completions`` payload for benchmarking.

    The prompt is 250 words drawn at random (with replacement) from a word
    list. The list is taken from ``/usr/share/doc/perl/copyright`` when that
    file is readable and contains at least one word; otherwise it is taken
    from this Python source file.

    Returns:
        A dict with the keys ``model``, ``prompt``, ``temperature`` and
        ``max_tokens``.

    Raises:
        ValueError: If the ``MODEL_NAME`` environment variable is unset or
            empty.
        OSError: If the fallback source file cannot be read.
    """
    # Validate configuration first so a misconfigured worker fails before
    # doing any file I/O.
    model = os.environ.get("MODEL_NAME")
    if not model:
        raise ValueError("MODEL_NAME environment variable not set")

    try:
        word_list = _read_benchmark_words("/usr/share/doc/perl/copyright")
    except OSError:
        # Missing or unreadable primary file: handled by the fallback below.
        word_list = []

    # Also fall back when the primary file yielded no words, since
    # random.choices() raises IndexError on an empty population.
    if not word_list:
        word_list = _read_benchmark_words(__file__)

    prompt = " ".join(random.choices(word_list, k=250))

    return {
        "model": model,
        "prompt": prompt,
        "temperature": 0.7,
        "max_tokens": 500,
    }

# Handler for the plain completions route. A request's workload is its
# "max_tokens" value (0 when the key is absent). This is the only route that
# carries a benchmark: payloads come from completions_benchmark_generator.
_completions_handler = HandlerConfig(
    route="/v1/completions",
    workload_calculator=lambda data: data.get("max_tokens", 0),
    allow_parallel_requests=True,
    request_parser=request_parser,
    max_queue_time=600.0,
    benchmark_config=BenchmarkConfig(
        generator=completions_benchmark_generator,
        concurrency=10,
        runs=3,
    ),
)

# Handler for the chat completions route: same workload rule and request
# parser as above, but no benchmark configuration.
_chat_completions_handler = HandlerConfig(
    route="/v1/chat/completions",
    workload_calculator=lambda data: data.get("max_tokens", 0),
    allow_parallel_requests=True,
    request_parser=request_parser,
    max_queue_time=600.0,
)

# Strings grouped by the action they are registered under.
# NOTE(review): presumably these are matched against lines of the model log
# file -- confirm against the vastai SDK.
_log_actions = LogActionConfig(
    on_load=["llama runner started in "],
    on_error=["Traceback (most recent call last):", "Error:"],
    on_info=["load_tensors:", "llama_context:", "print_info:", "llama_model_loader:"],
)

# Complete worker configuration: model server location, log file, health-check
# path, the two route handlers and the log actions.
worker_config = WorkerConfig(
    model_server_url=MODEL_SERVER_URL,
    model_server_port=MODEL_SERVER_PORT,
    model_log_file=MODEL_LOG_FILE,
    model_healthcheck_url=MODEL_HEALTHCHECK_ENDPOINT,
    handlers=[_completions_handler, _chat_completions_handler],
    log_action_config=_log_actions,
)

# Construct the worker from worker_config and start it. This statement is not
# behind a ``__main__`` guard, so it executes whenever the module is run or
# imported.
Worker(worker_config).run()