diff --git a/lib/backend.py b/lib/backend.py index 19764bd..0d9a273 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -30,7 +30,7 @@ from lib.data_types import ( BenchmarkResult ) -VERSION = "0.2.0" +VERSION = "0.2.1" MSG_HISTORY_LEN = 100 log = logging.getLogger(__file__) diff --git a/lib/server.py b/lib/server.py index b21c880..0029311 100644 --- a/lib/server.py +++ b/lib/server.py @@ -3,38 +3,58 @@ import logging from typing import List import ssl from asyncio import run, gather - +import asyncio from lib.backend import Backend +from lib.metrics import Metrics from aiohttp import web log = logging.getLogger(__file__) def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): - log.debug("getting certificate...") - use_ssl = os.environ.get("USE_SSL", "false") == "true" - if use_ssl is True: - ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) - ssl_context.load_cert_chain( - certfile="/etc/instance.crt", - keyfile="/etc/instance.key", - ) - else: - ssl_context = None + try: + log.debug("getting certificate...") + use_ssl = os.environ.get("USE_SSL", "false") == "true" + if use_ssl is True: + ssl_context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + ssl_context.load_cert_chain( + certfile="/etc/instance.crt", + keyfile="/etc/instance.key", + ) + else: + ssl_context = None - async def main(): - log.debug("starting server...") - app = web.Application() - app.add_routes(routes) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite( - runner, - ssl_context=ssl_context, - port=int(os.environ["WORKER_PORT"]), - **kwargs - ) - await gather(site.start(), backend._start_tracking()) + async def main(): + log.debug("starting server...") + app = web.Application() + app.add_routes(routes) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite( + runner, + ssl_context=ssl_context, + port=int(os.environ["WORKER_PORT"]), + **kwargs + ) + await gather(site.start(), backend._start_tracking()) - run(main()) + run(main()) + + except Exception as e: + err_msg = f"PyWorker failed to launch: {e}" + log.error(err_msg) + + async def beacon(): + metrics = Metrics() + metrics._set_version(getattr(backend, "version", "0")) + metrics._set_mtoken(getattr(backend, "mtoken", "")) + try: + while True: + metrics._model_errored(err_msg) + await metrics._Metrics__send_metrics_and_reset() + await asyncio.sleep(10) + finally: + await metrics.aclose() + + run(beacon()) diff --git a/start_server.sh b/start_server.sh index edc16a4..4b07e01 100755 --- a/start_server.sh +++ b/start_server.sh @@ -41,6 +41,14 @@ echo_var DEBUG_LOG echo_var PYWORKER_LOG echo_var MODEL_LOG +# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines +# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only +if [ -e "$MODEL_LOG" ]; then + echo "Rotating model log at $MODEL_LOG to $MODEL_LOG.old" + cat "$MODEL_LOG" >> "$MODEL_LOG.old" + : > "$MODEL_LOG" +fi + # Populate /etc/environment with quoted values if ! grep -q "VAST" /etc/environment; then env -0 | grep -zEv "^(HOME=|SHLVL=)|CONDA" | while IFS= read -r -d '' line; do @@ -124,9 +132,43 @@ cd "$SERVER_DIR" echo "launching PyWorker server" -# if instance is rebooted, we want to clear out the log file so pyworker doesn't read lines -# from the run prior to reboot. past logs are saved in $MODEL_LOG.old for debugging only -[ -e "$MODEL_LOG" ] && cat "$MODEL_LOG" >> "$MODEL_LOG.old" && : > "$MODEL_LOG" +set +e +python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG" +PY_STATUS=${PIPESTATUS[0]} +set -e -(python3 -m "workers.$BACKEND.server" |& tee -a "$PYWORKER_LOG") & -echo "launching PyWorker server done" +if [ "${PY_STATUS}" -ne 0 ]; then + echo "PyWorker exited with status ${PY_STATUS}; notifying autoscaler..." + ERROR_MSG="PyWorker exited: code ${PY_STATUS}" + MTOKEN="${MASTER_TOKEN:-}" + VERSION="${PYWORKER_VERSION:-0}" + + IFS=',' read -r -a REPORT_ADDRS <<< "${REPORT_ADDR}" + for addr in "${REPORT_ADDRS[@]}"; do + curl -sS -X POST -H 'Content-Type: application/json' \ + -d "$(cat <