Add intentionally failing code to the TGI server to test failure reporting

This commit is contained in:
Lucas Armand
2025-11-11 17:41:12 -08:00
parent 7db54f3bd7
commit a12523b1d2
3 changed files with 91 additions and 26 deletions
+22 -1
View File
@@ -3,19 +3,22 @@ import logging
from typing import List from typing import List
import ssl import ssl
from asyncio import run, gather from asyncio import run, gather
import asyncio
from lib.backend import Backend from lib.backend import Backend
from lib.metrics import Metrics
from aiohttp import web from aiohttp import web
log = logging.getLogger(__file__) log = logging.getLogger(__file__)
def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
try:
log.debug("getting certificate...") log.debug("getting certificate...")
use_ssl = os.environ.get("USE_SSL", "false") == "true" use_ssl = os.environ.get("USE_SSL", "false") == "true"
if use_ssl is True: if use_ssl is True:
ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
raise Exception("Oh no the SSL cert is gone!")
ssl_context.load_cert_chain( ssl_context.load_cert_chain(
certfile="/etc/instance.crt", certfile="/etc/instance.crt",
keyfile="/etc/instance.key", keyfile="/etc/instance.key",
@@ -38,3 +41,21 @@ def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
await gather(site.start(), backend._start_tracking()) await gather(site.start(), backend._start_tracking())
run(main()) run(main())
except Exception as e:
err_msg = f"PyWorker failed to launch: {e}"
log.error(err_msg)
async def beacon():
metrics = Metrics()
metrics._set_version(getattr(backend, "version", "0"))
metrics._set_mtoken(getattr(backend, "mtoken", ""))
try:
while True:
metrics._model_errored(err_msg)
await metrics.__send_metrics_and_reset()
await asyncio.sleep(10)
finally:
await metrics.aclose()
run(beacon())
+44 -1
View File
@@ -128,5 +128,48 @@ echo "launching PyWorker server"
# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only # from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only
[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" [ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG"
(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") &
# Run the worker in the foreground so a non-zero exit can be detected and
# reported to the autoscaler.
python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG"
# After a pipeline, $? holds the status of the LAST command (tee), which would
# mask a crashing worker. PIPESTATUS[0] is the status of python3 itself; it must
# be read immediately, before another command overwrites the array.
STATUS=${PIPESTATUS[0]}

if [ "$STATUS" -ne 0 ]; then
    echo "PyWorker exited with status $STATUS; notifying autoscaler..."
    ERROR_MSG="PyWorker exited: code ${STATUS}"
    MTOKEN="${MASTER_TOKEN:-}"
    VERSION="${PYWORKER_VERSION:-0}"

    # Minimal, schema-compatible payload. It is identical for every report
    # address, so it is built once, outside the loop.
    # NOTE(review): the key "num_requests_recieved" is kept exactly as written;
    # presumably it matches the receiving schema - confirm before renaming.
    # NOTE(review): MTOKEN and VERSION are interpolated without JSON escaping;
    # a value containing a double quote or backslash would produce invalid JSON.
    PAYLOAD="$(cat <<JSON
{
  "id": ${CONTAINER_ID:-0},
  "mtoken": "${MTOKEN}",
  "version": "${VERSION}",
  "loadtime": 0,
  "new_load": 0,
  "cur_load": 0,
  "rej_load": 0,
  "max_perf": 0,
  "cur_perf": 0,
  "error_msg": "${ERROR_MSG}",
  "num_requests_working": 0,
  "num_requests_recieved": 0,
  "additional_disk_usage": 0,
  "working_request_idxs": [],
  "cur_capacity": 0,
  "max_capacity": 0,
  "url": ""
}
JSON
)"

    # REPORT_ADDR may hold several comma-separated addresses. Defaulting to an
    # empty string keeps this safe when the variable is unset (e.g. under
    # `set -u`); the loop then simply does not run.
    IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR:-}"
    for addr in "${REPORT_ADDRS[@]}"; do
        # Best effort: a failed POST to one address must not prevent the
        # remaining addresses from being notified, hence `|| true`.
        curl -sS -X POST -H 'Content-Type: application/json' \
            -d "$PAYLOAD" "${addr%/}/worker_status/" || true
    done

    # Propagate the worker's exit status so the supervisor/container runtime
    # can handle restarts.
    exit "$STATUS"
fi
echo "launching PyWorker server done" echo "launching PyWorker server done"
+1
View File
@@ -127,4 +127,5 @@ routes = [
] ]
if __name__ == "__main__": if __name__ == "__main__":
blips = blorps
start_server(backend, routes) start_server(backend, routes)