From a12523b1d29c8a1a8ee2e402c67c73992fcb8e22 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 17:41:12 -0800 Subject: [PATCH 1/8] Added bad code to tgi server to test --- lib/server.py | 71 ++++++++++++++++++++++++++++--------------- start_server.sh | 45 ++++++++++++++++++++++++++- workers/tgi/server.py | 1 + 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/lib/server.py b/lib/server.py index b21c880..25250ea 100644 --- a/lib/server.py +++ b/lib/server.py @@ -3,38 +3,59 @@ import logging from typing import List import ssl from asyncio import run, gather - +import asyncio from lib.backend import Backend +from lib.metrics import Metrics from aiohttp import web log = logging.getLogger(__file__) def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): - log.debug("getting certificate...") - use_ssl = os.environ.get("USE_SSL", "false") == "true" - if use_ssl is True: - ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - ssl_context.load_cert_chain( - certfile="/etc/instance.crt", - keyfile="/etc/instance.key", - ) - else: - ssl_context = None + try: + log.debug("getting certificate...") + use_ssl = os.environ.get("USE_SSL", "false") == "true" + if use_ssl is True: + ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + raise Exception("Oh no the SSL cert is gone!") + ssl_context.load_cert_chain( + certfile="/etc/instance.crt", + keyfile="/etc/instance.key", + ) + else: + ssl_context = None - async def main(): - log.debug("starting server...") - app = web.Application() - app.add_routes(routes) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite( - runner, - ssl_context=ssl_context, - port=int(os.environ["WORKER_PORT"]), - **kwargs - ) - await gather(site.start(), backend._start_tracking()) + async def main(): + log.debug("starting server...") + app = web.Application() + app.add_routes(routes) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite( + 
runner, + ssl_context=ssl_context, + port=int(os.environ["WORKER_PORT"]), + **kwargs + ) + await gather(site.start(), backend._start_tracking()) - run(main()) + run(main()) + + except Exception as e: + err_msg = f"PyWorker failed to launch: {e}" + log.error(err_msg) + + async def beacon(): + metrics = Metrics() + metrics._set_version(getattr(backend, "version", "0")) + metrics._set_mtoken(getattr(backend, "mtoken", "")) + try: + while True: + metrics._model_errored(err_msg) + await metrics.__send_metrics_and_reset() + await asyncio.sleep(10) + finally: + await metrics.aclose() + + run(beacon()) diff --git a/start_server.sh b/start_server.sh index edc16a4..87c7702 100755 --- a/start_server.sh +++ b/start_server.sh @@ -128,5 +128,48 @@ echo "launching PyWorker server" # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" -(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & + +# Run the worker in foreground so we can detect non-zero exit and report it +python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" +STATUS=$? + +if [ $STATUS -ne 0 ]; then + echo "PyWorker exited with status $STATUS; notifying autoscaler..." 
+ + ERROR_MSG="PyWorker exited: code ${STATUS}" + MTOKEN="${MASTER_TOKEN:-}" + VERSION="${PYWORKER_VERSION:-0}" + + # Comma-separated REPORT_ADDR is supported + IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" + for addr in "${REPORT_ADDRS[@]}"; do + # minimal, schema-compatible payload + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "$(cat < Date: Tue, 11 Nov 2025 17:49:34 -0800 Subject: [PATCH 2/8] fix --- lib/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/server.py b/lib/server.py index 25250ea..def7340 100644 --- a/lib/server.py +++ b/lib/server.py @@ -53,7 +53,7 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): try: while True: metrics._model_errored(err_msg) - await metrics.__send_metrics_and_reset() + await metrics._Metrics__send_metrics_and_reset() await asyncio.sleep(10) finally: await metrics.aclose() From de9b50abb9d5043cd0b08d5cc556d3dba2616f80 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 17:53:36 -0800 Subject: [PATCH 3/8] use set +e --- start_server.sh | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/start_server.sh b/start_server.sh index 87c7702..4763895 100755 --- a/start_server.sh +++ b/start_server.sh @@ -129,21 +129,19 @@ echo "launching PyWorker server" [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" -# Run the worker in foreground so we can detect non-zero exit and report it +set +e python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" -STATUS=$? +PY_STATUS=${PIPESTATUS[0]} +set -e -if [ $STATUS -ne 0 ]; then - echo "PyWorker exited with status $STATUS; notifying autoscaler..." - - ERROR_MSG="PyWorker exited: code ${STATUS}" +if [ "${PY_STATUS}" -ne 0 ]; then + echo "PyWorker exited with status ${PY_STATUS}; notifying autoscaler..." 
+ ERROR_MSG="PyWorker exited: code ${PY_STATUS}" MTOKEN="${MASTER_TOKEN:-}" VERSION="${PYWORKER_VERSION:-0}" - # Comma-separated REPORT_ADDR is supported IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" for addr in "${REPORT_ADDRS[@]}"; do - # minimal, schema-compatible payload curl -sS -X POST -H 'Content-Type: application/json' \ -d "$(cat < Date: Tue, 11 Nov 2025 17:57:08 -0800 Subject: [PATCH 4/8] dont exit on pyworker fail --- start_server.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/start_server.sh b/start_server.sh index 4763895..09c33d8 100755 --- a/start_server.sh +++ b/start_server.sh @@ -166,8 +166,6 @@ if [ "${PY_STATUS}" -ne 0 ]; then JSON )" "${addr%/}/worker_status/" || true done - - exit "${PY_STATUS}" fi echo "launching PyWorker server done" \ No newline at end of file From a47c9d1ed0821aab48ec904a2dba863df3feff1a Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 11 Nov 2025 18:13:46 -0800 Subject: [PATCH 5/8] remove test bugs --- lib/server.py | 1 - workers/tgi/server.py | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/server.py b/lib/server.py index def7340..0029311 100644 --- a/lib/server.py +++ b/lib/server.py @@ -18,7 +18,6 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): use_ssl = os.environ.get("USE_SSL", "false") == "true" if use_ssl is True: ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - raise Exception("Oh no the SSL cert is gone!") ssl_context.load_cert_chain( certfile="/etc/instance.crt", keyfile="/etc/instance.key", diff --git a/workers/tgi/server.py b/workers/tgi/server.py index 9ce8374..99fc810 100644 --- a/workers/tgi/server.py +++ b/workers/tgi/server.py @@ -127,5 +127,4 @@ routes = [ ] if __name__ == "__main__": - blips = blorps start_server(backend, routes) From 45e0c7d9caf62805495fa3d499a91909ba64363f Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Mon, 24 Nov 2025 15:02:33 -0800 Subject: [PATCH 6/8] Move model log rotate to top --- start_server.sh | 12 
+++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/start_server.sh b/start_server.sh index edc16a4..dd57b5a 100755 --- a/start_server.sh +++ b/start_server.sh @@ -41,6 +41,14 @@ echo_var DEBUG_LOG echo_var PYWORKER_LOG echo_var MODEL_LOG +# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines +# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only +if [ -e "$MODEL_LOG" ]; then + echo "Rotating model log at $MODEL_LOG to $MODEL_LOG.old" + cat "$MODEL_LOG" >> "$MODEL_LOG.old" + : > "$MODEL_LOG" +fi + # Populate /etc/environment with quoted values if ! grep -q "VAST" /etc/environment; then env -0 | grep -zEv "^(HOME=|SHLVL=)|CONDA" | while IFS= read -r -d '' line; do @@ -124,9 +132,7 @@ cd "$SERVER_DIR" echo "launching PyWorker server" -# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines -# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only -[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" +# Model log line used to be here ! (python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & echo "launching PyWorker server done" From 9c6ab7850343a2ac6d83e37f628ea8de61499d38 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Mon, 24 Nov 2025 15:22:23 -0800 Subject: [PATCH 7/8] Move model log line --- start_server.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/start_server.sh b/start_server.sh index dd57b5a..e30a2bc 100755 --- a/start_server.sh +++ b/start_server.sh @@ -132,7 +132,5 @@ cd "$SERVER_DIR" echo "launching PyWorker server" -# Model log line used to be here ! 
- (python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & echo "launching PyWorker server done" From e14316243859f100f040b22f13a0412fda789a51 Mon Sep 17 00:00:00 2001 From: Lucas Armand Date: Tue, 25 Nov 2025 16:01:23 -0800 Subject: [PATCH 8/8] bump pyworker version --- lib/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/backend.py b/lib/backend.py index 19764bd..0d9a273 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -30,7 +30,7 @@ from lib.data_types import ( BenchmarkResult ) -VERSION = "0.2.0" +VERSION = "0.2.1" MSG_HISTORY_LEN = 100 log = logging.getLogger(__file__)