From a12523b1d29c8a1a8ee2e402c67c73992fcb8e22 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 17:41:12 -0800 Subject: [PATCH] Added bad code to tgi server to test --- lib/server.py | 71 ++++++++++++++++++++++++++++--------------- start_server.sh | 45 ++++++++++++++++++++++++++- workers/tgi/server.py | 1 + 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/lib/server.py b/lib/server.py index b21c880..25250ea 100644 --- a/lib/server.py +++ b/lib/server.py @@ -3,38 +3,59 @@ import logging from typing import List import ssl from asyncio import run, gather - +import asyncio from lib.backend import Backend +from lib.metrics import Metrics from aiohttp import web log = logging.getLogger(__file__) def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): - log.debug("getting certificate...") - use_ssl = os.environ.get("USE_SSL", "false") == "true" - if use_ssl is True: - ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - ssl_context.load_cert_chain( - certfile="/etc/instance.crt", - keyfile="/etc/instance.key", - ) - else: - ssl_context = None + try: + log.debug("getting certificate...") + use_ssl = os.environ.get("USE_SSL", "false") == "true" + if use_ssl is True: + ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + raise Exception("Oh no the SSL cert is gone!") + ssl_context.load_cert_chain( + certfile="/etc/instance.crt", + keyfile="/etc/instance.key", + ) + else: + ssl_context = None - async def main(): - log.debug("starting server...") - app = web.Application() - app.add_routes(routes) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite( - runner, - ssl_context=ssl_context, - port=int(os.environ["WORKER_PORT"]), - **kwargs - ) - await gather(site.start(), backend._start_tracking()) + async def main(): + log.debug("starting server...") + app = web.Application() + app.add_routes(routes) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite( + runner, + ssl_context=ssl_context, + port=int(os.environ["WORKER_PORT"]), + **kwargs + ) + await gather(site.start(), backend._start_tracking()) - run(main()) + run(main()) + + except Exception as e: + err_msg = f"PyWorker failed to launch: {e}" + log.error(err_msg) + + async def beacon(): + metrics = Metrics() + metrics._set_version(getattr(backend, "version", "0")) + metrics._set_mtoken(getattr(backend, "mtoken", "")) + try: + while True: + metrics._model_errored(err_msg) + await metrics.__send_metrics_and_reset() + await asyncio.sleep(10) + finally: + await metrics.aclose() + + run(beacon()) diff --git a/start_server.sh b/start_server.sh index edc16a4..87c7702 100755 --- a/start_server.sh +++ b/start_server.sh @@ -128,5 +128,48 @@ echo "launching PyWorker server" # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" -(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & + +# Run the worker in foreground so we can detect non-zero exit and report it +python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" +STATUS=$? + +if [ $STATUS -ne 0 ]; then + echo "PyWorker exited with status $STATUS; notifying autoscaler..." + + ERROR_MSG="PyWorker exited: code ${STATUS}" + MTOKEN="${MASTER_TOKEN:-}" + VERSION="${PYWORKER_VERSION:-0}" + + # Comma-separated REPORT_ADDR is supported + IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" + for addr in "${REPORT_ADDRS[@]}"; do + # minimal, schema-compatible payload + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "$(cat <