From a12523b1d29c8a1a8ee2e402c67c73992fcb8e22 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 17:41:12 -0800 Subject: [PATCH 1/5] Added bad code to tgi server to test --- lib/server.py | 71 ++++++++++++++++++++++++++++--------------- start_server.sh | 45 ++++++++++++++++++++++++++- workers/tgi/server.py | 1 + 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/lib/server.py b/lib/server.py index b21c880..25250ea 100644 --- a/lib/server.py +++ b/lib/server.py @@ -3,38 +3,59 @@ import logging from typing import List import ssl from asyncio import run, gather - +import asyncio from lib.backend import Backend +from lib.metrics import Metrics from aiohttp import web log = logging.getLogger(__file__) def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): - log.debug("getting certificate...") - use_ssl = os.environ.get("USE_SSL", "false") == "true" - if use_ssl is True: - ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - ssl_context.load_cert_chain( - certfile="/etc/instance.crt", - keyfile="/etc/instance.key", - ) - else: - ssl_context = None + try: + log.debug("getting certificate...") + use_ssl = os.environ.get("USE_SSL", "false") == "true" + if use_ssl is True: + ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + raise Exception("Oh no the SSL cert is gone!") + ssl_context.load_cert_chain( + certfile="/etc/instance.crt", + keyfile="/etc/instance.key", + ) + else: + ssl_context = None - async def main(): - log.debug("starting server...") - app = web.Application() - app.add_routes(routes) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite( - runner, - ssl_context=ssl_context, - port=int(os.environ["WORKER_PORT"]), - **kwargs - ) - await gather(site.start(), backend._start_tracking()) + async def main(): + log.debug("starting server...") + app = web.Application() + app.add_routes(routes) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite( + runner, + ssl_context=ssl_context, + port=int(os.environ["WORKER_PORT"]), + **kwargs + ) + await gather(site.start(), backend._start_tracking()) - run(main()) + run(main()) + + except Exception as e: + err_msg = f"PyWorker failed to launch: {e}" + log.error(err_msg) + + async def beacon(): + metrics = Metrics() + metrics._set_version(getattr(backend, "version", "0")) + metrics._set_mtoken(getattr(backend, "mtoken", "")) + try: + while True: + metrics._model_errored(err_msg) + await metrics.__send_metrics_and_reset() + await asyncio.sleep(10) + finally: + await metrics.aclose() + + run(beacon()) diff --git a/start_server.sh b/start_server.sh index edc16a4..87c7702 100755 --- a/start_server.sh +++ b/start_server.sh @@ -128,5 +128,48 @@ echo "launching PyWorker server" # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" -(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & + +# Run the worker in foreground so we can detect non-zero exit and report it +python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" +STATUS=$? + +if [ $STATUS -ne 0 ]; then + echo "PyWorker exited with status $STATUS; notifying autoscaler..." + + ERROR_MSG="PyWorker exited: code ${STATUS}" + MTOKEN="${MASTER_TOKEN:-}" + VERSION="${PYWORKER_VERSION:-0}" + + # Comma-separated REPORT_ADDR is supported + IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" + for addr in "${REPORT_ADDRS[@]}"; do + # minimal, schema-compatible payload + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "$(cat < Date: Tue, 11 Nov 2025 17:49:34 -0800 Subject: [PATCH 2/5] fix --- lib/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/server.py b/lib/server.py index 25250ea..def7340 100644 --- a/lib/server.py +++ b/lib/server.py @@ -53,7 +53,7 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): try: while True: metrics._model_errored(err_msg) - await metrics.__send_metrics_and_reset() + await metrics._Metrics__send_metrics_and_reset() await asyncio.sleep(10) finally: await metrics.aclose() From de9b50abb9d5043cd0b08d5cc556d3dba2616f80 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 17:53:36 -0800 Subject: [PATCH 3/5] use set +e --- start_server.sh | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/start_server.sh b/start_server.sh index 87c7702..4763895 100755 --- a/start_server.sh +++ b/start_server.sh @@ -129,21 +129,19 @@ echo "launching PyWorker server" [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" -# Run the worker in foreground so we can detect non-zero exit and report it +set +e python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" -STATUS=$? +PY_STATUS=${PIPESTATUS[0]} +set -e -if [ $STATUS -ne 0 ]; then - echo "PyWorker exited with status $STATUS; notifying autoscaler..." - - ERROR_MSG="PyWorker exited: code ${STATUS}" +if [ "${PY_STATUS}" -ne 0 ]; then + echo "PyWorker exited with status ${PY_STATUS}; notifying autoscaler..." + ERROR_MSG="PyWorker exited: code ${PY_STATUS}" MTOKEN="${MASTER_TOKEN:-}" VERSION="${PYWORKER_VERSION:-0}" - # Comma-separated REPORT_ADDR is supported IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" for addr in "${REPORT_ADDRS[@]}"; do - # minimal, schema-compatible payload curl -sS -X POST -H 'Content-Type: application/json' \ -d "$(cat < Date: Tue, 11 Nov 2025 17:57:08 -0800 Subject: [PATCH 4/5] dont exit on pyworker fail --- start_server.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/start_server.sh b/start_server.sh index 4763895..09c33d8 100755 --- a/start_server.sh +++ b/start_server.sh @@ -166,8 +166,6 @@ if [ "${PY_STATUS}" -ne 0 ]; then JSON )" "${addr%/}/worker_status/" || true done - - exit "${PY_STATUS}" fi echo "launching PyWorker server done" \ No newline at end of file From a47c9d1ed0821aab48ec904a2dba863df3feff1a Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 18:13:46 -0800 Subject: [PATCH 5/5] remove test bugs --- lib/server.py | 1 - workers/tgi/server.py | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/server.py b/lib/server.py index def7340..0029311 100644 --- a/lib/server.py +++ b/lib/server.py @@ -18,7 +18,6 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): use_ssl = os.environ.get("USE_SSL", "false") == "true" if use_ssl is True: ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - raise Exception("Oh no the SSL cert is gone!") ssl_context.load_cert_chain( certfile="/etc/instance.crt", keyfile="/etc/instance.key", diff --git a/workers/tgi/server.py b/workers/tgi/server.py index 9ce8374..99fc810 100644 --- a/workers/tgi/server.py +++ b/workers/tgi/server.py @@ -127,5 +127,4 @@ routes = [ ] if __name__ == "__main__": - blips = blorps start_server(backend, routes)