"""Worker entry point that fronts a local Ollama model server.

Builds a ``WorkerConfig`` with two handlers (``/v1/completions`` and
``/v1/chat/completions``) pointing at the model server on
``127.0.0.1:11434`` and starts the worker when the module is executed.
"""
import random
import os
import re
import logging

from vastai import Worker, WorkerConfig, HandlerConfig, LogActionConfig, BenchmarkConfig

logging.getLogger().setLevel(logging.WARNING)  # Only show warnings and errors

# Ollama model configuration
MODEL_SERVER_URL = 'http://127.0.0.1'
MODEL_SERVER_PORT = 11434
MODEL_LOG_FILE = '/var/log/onstart.log'
MODEL_HEALTHCHECK_ENDPOINT = "/"

# Number of random words joined together to form one benchmark prompt.
_BENCHMARK_PROMPT_WORDS = 250

# Compiled once; used to split the benchmark text source into words.
_WORD_PATTERN = re.compile(r'\b\w+\b')


def request_parser(request):
    """Return the payload to use for an incoming request.

    If the request mapping carries a non-``None`` ``"input"`` entry, that
    entry is returned; otherwise the request itself is returned unchanged.
    """
    payload = request.get("input")
    return request if payload is None else payload


def _read_words(path):
    """Return every word-like token found in the text file at *path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, so an odd
    byte in the source text cannot abort benchmark generation.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return _WORD_PATTERN.findall(f.read())


def completions_benchmark_generator() -> dict:
    """Build one randomized completions request for benchmarking.

    The prompt is ``_BENCHMARK_PROMPT_WORDS`` words sampled (with
    replacement) from the perl copyright file when it is readable and
    non-empty, otherwise from this module's own source code.

    Returns:
        A dict with ``model``, ``prompt``, ``temperature`` and
        ``max_tokens`` keys.

    Raises:
        ValueError: if the ``MODEL_NAME`` environment variable is unset or
            empty.
        OSError: if the fallback source (this file) cannot be read.
    """
    # Validate configuration first so a misconfigured worker fails before
    # doing any file I/O.
    model = os.environ.get("MODEL_NAME")
    if not model:
        raise ValueError("MODEL_NAME environment variable not set")

    # Try to load from perl copyright file first
    try:
        word_list = _read_words("/usr/share/doc/perl/copyright")
    except OSError:
        word_list = []

    # Fallback to loading from python file. This also covers a primary file
    # that exists but contains no words, which would otherwise make
    # random.choices() fail on an empty population.
    if not word_list:
        word_list = _read_words(__file__)

    prompt = " ".join(random.choices(word_list, k=_BENCHMARK_PROMPT_WORDS))

    benchmark_data = {
        "model": model,
        "prompt": prompt,
        "temperature": 0.7,
        "max_tokens": 500,
    }
    return benchmark_data


worker_config = WorkerConfig(
    model_server_url=MODEL_SERVER_URL,
    model_server_port=MODEL_SERVER_PORT,
    model_log_file=MODEL_LOG_FILE,
    model_healthcheck_url=MODEL_HEALTHCHECK_ENDPOINT,
    handlers=[
        HandlerConfig(
            route="/v1/completions",
            # Workload is the requested max_tokens (0 when not supplied).
            workload_calculator=lambda data: data.get("max_tokens", 0),
            allow_parallel_requests=True,
            request_parser=request_parser,
            max_queue_time=600.0,
            benchmark_config=BenchmarkConfig(
                generator=completions_benchmark_generator,
                concurrency=10,
                runs=3,
            ),
        ),
        HandlerConfig(
            route="/v1/chat/completions",
            # Workload is the requested max_tokens (0 when not supplied).
            workload_calculator=lambda data: data.get("max_tokens", 0),
            allow_parallel_requests=True,
            request_parser=request_parser,
            max_queue_time=600.0,
        ),
    ],
    # Ollama-specific log messages
    log_action_config=LogActionConfig(
        on_load=["llama runner started in "],
        on_error=["Traceback (most recent call last):", "Error:"],
        #on_info=["load_tensors:","llama_context:","print_info:","llama_model_loader:"]
    ),
)

Worker(worker_config).run()