From c772e1651b7e4ab81dd87565ec58425dd2a4eb05 Mon Sep 17 00:00:00 2001
From: Colter Downing
Date: Mon, 24 Nov 2025 18:21:35 -0800
Subject: [PATCH] debug logs

---
Review notes (this section is ignored by `git am`):
- Layout was reconstructed from a whitespace-collapsed copy of the patch;
  verify context-line indentation against the tree before applying.
- Added lines were edited during review, so the post-image blob hashes on
  the `index` lines no longer describe the result of applying this patch.

 lib/backend.py |  2 ++
 lib/metrics.py |  5 +++++
 lib/server.py  | 31 +++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/lib/backend.py b/lib/backend.py
index d555c03..1137655 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -256,6 +256,7 @@ class Backend:
             self.backend_errored(str(e))
 
     async def _start_tracking(self) -> None:
         task_names = ["read_logs", "send_metrics_loop", "healthcheck", "send_delete_requests_loop"]
+        log.info(f"Starting tracking tasks ({', '.join(task_names)})")
         results = await gather(
             self.__read_logs(),
@@ -265,6 +266,7 @@ class Backend:
             return_exceptions=True
         )
         # If we get here, one or more tasks exited (they should run forever)
+        log.error(f"CRITICAL: _start_tracking gather returned! This should never happen. Results: {results}")
         for name, result in zip(task_names, results):
             if isinstance(result, Exception):
                 log.error(f"Tracking task '{name}' crashed with exception: {result}", exc_info=result)
diff --git a/lib/metrics.py b/lib/metrics.py
index 48774fe..8b6a7c4 100644
--- a/lib/metrics.py
+++ b/lib/metrics.py
@@ -119,9 +119,14 @@ class Metrics:
             await self.__send_delete_requests_and_reset()
 
     async def _send_metrics_loop(self) -> Awaitable[NoReturn]:
+        loop_count = 0
         while True:
             await sleep(METRICS_UPDATE_INTERVAL)
+            loop_count += 1
             elapsed = time.time() - self.last_metric_update
+            # Log a heartbeat every 30 iterations (30s only if METRICS_UPDATE_INTERVAL is 1s) to confirm the loop is running
+            if loop_count % 30 == 0:
+                log.debug(f"[heartbeat] metrics loop alive, loop_count={loop_count}, model_loaded={self.system_metrics.model_is_loaded}")
             if self.system_metrics.model_is_loaded is False and elapsed >= 10:
                 log.debug(f"sending loading model metrics after {int(elapsed)}s wait")
                 await self.__send_metrics_and_reset()
diff --git a/lib/server.py b/lib/server.py
index 0029311..52f30c3 100644
--- a/lib/server.py
+++ b/lib/server.py
@@ -1,5 +1,7 @@
 import os
 import logging
+import signal
+import sys
 from typing import List
 import ssl
 from asyncio import run, gather
@@ -12,7 +14,36 @@ from aiohttp import web
 log = logging.getLogger(__file__)
 
 
+def _setup_signal_handlers():
+    """Install handlers that log termination signals before the process exits.
+
+    Best-effort: signals missing on this platform (e.g. SIGHUP on Windows)
+    or whose handler cannot be installed are skipped with a debug log.
+    """
+    def signal_handler(signum, frame):
+        sig_name = signal.Signals(signum).name
+        log.error(f"SIGNAL RECEIVED: {sig_name} ({signum}) - process is being terminated")
+        sys.stdout.flush()
+        sys.stderr.flush()
+        # Conventional exit status for termination by signal: 128 + signal number
+        sys.exit(128 + signum)
+
+    # Look signals up by name: signal.SIGHUP does not exist on every platform,
+    # and plain attribute access would raise AttributeError outside the try
+    for name in ("SIGTERM", "SIGINT", "SIGHUP"):
+        sig = getattr(signal, name, None)
+        if sig is None:
+            log.debug(f"{name} is not available on this platform, skipping")
+            continue
+        try:
+            signal.signal(sig, signal_handler)
+        except (OSError, ValueError) as e:
+            # ValueError: not called from the main thread; OSError: rejected by the OS
+            log.debug(f"could not install handler for {name}: {e}")
+
+
 def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
+    _setup_signal_handlers()
     try:
         log.debug("getting certificate...")
         use_ssl = os.environ.get("USE_SSL", "false") == "true"