debug logs

This commit is contained in:
Colter Downing
2025-11-24 18:21:35 -08:00
parent ecc6a3ce0d
commit c772e1651b
3 changed files with 27 additions and 0 deletions
+2
View File
@@ -256,6 +256,7 @@ class Backend:
self.backend_errored(str(e)) self.backend_errored(str(e))
async def _start_tracking(self) -> None: async def _start_tracking(self) -> None:
log.info("Starting tracking tasks (read_logs, send_metrics_loop, healthcheck, send_delete_requests_loop)")
task_names = ["read_logs", "send_metrics_loop", "healthcheck", "send_delete_requests_loop"] task_names = ["read_logs", "send_metrics_loop", "healthcheck", "send_delete_requests_loop"]
results = await gather( results = await gather(
self.__read_logs(), self.__read_logs(),
@@ -265,6 +266,7 @@ class Backend:
return_exceptions=True return_exceptions=True
) )
# If we get here, one or more tasks exited (they should run forever) # If we get here, one or more tasks exited (they should run forever)
log.error(f"CRITICAL: _start_tracking gather returned! This should never happen. Results: {results}")
for name, result in zip(task_names, results): for name, result in zip(task_names, results):
if isinstance(result, Exception): if isinstance(result, Exception):
log.error(f"Tracking task '{name}' crashed with exception: {result}", exc_info=result) log.error(f"Tracking task '{name}' crashed with exception: {result}", exc_info=result)
+5
View File
@@ -119,9 +119,14 @@ class Metrics:
await self.__send_delete_requests_and_reset() await self.__send_delete_requests_and_reset()
async def _send_metrics_loop(self) -> Awaitable[NoReturn]: async def _send_metrics_loop(self) -> Awaitable[NoReturn]:
loop_count = 0
while True: while True:
await sleep(METRICS_UPDATE_INTERVAL) await sleep(METRICS_UPDATE_INTERVAL)
loop_count += 1
elapsed = time.time() - self.last_metric_update elapsed = time.time() - self.last_metric_update
# Log a heartbeat every 30 loop iterations (i.e. every 30 * METRICS_UPDATE_INTERVAL seconds) to confirm the loop is running
if loop_count % 30 == 0:
log.debug(f"[heartbeat] metrics loop alive, loop_count={loop_count}, model_loaded={self.system_metrics.model_is_loaded}")
if self.system_metrics.model_is_loaded is False and elapsed >= 10: if self.system_metrics.model_is_loaded is False and elapsed >= 10:
log.debug(f"sending loading model metrics after {int(elapsed)}s wait") log.debug(f"sending loading model metrics after {int(elapsed)}s wait")
await self.__send_metrics_and_reset() await self.__send_metrics_and_reset()
+20
View File
@@ -1,5 +1,7 @@
import os import os
import logging import logging
import signal
import sys
from typing import List from typing import List
import ssl import ssl
from asyncio import run, gather from asyncio import run, gather
@@ -12,7 +14,25 @@ from aiohttp import web
log = logging.getLogger(__file__) log = logging.getLogger(__file__)
def _setup_signal_handlers():
    """Install handlers that log termination signals before the process exits.

    One shared handler is registered for SIGTERM, SIGINT and SIGHUP. When one
    of these signals is delivered, the handler logs it at error level, flushes
    stdout and stderr, and exits with status ``128 + signum``.

    Registration is best-effort: a signal that the ``signal`` module does not
    define on the current platform is skipped, and so is any signal for which
    ``signal.signal`` raises ``OSError`` or ``ValueError``.
    """

    def signal_handler(signum, frame):
        sig_name = signal.Signals(signum).name
        log.error(f"SIGNAL RECEIVED: {sig_name} ({signum}) - process is being terminated")
        # Flush explicitly so the message is not lost when the process exits.
        sys.stdout.flush()
        sys.stderr.flush()
        sys.exit(128 + signum)

    # Resolve the signals by name rather than by attribute access: not every
    # platform defines all of them (SIGHUP is missing on Windows), and a direct
    # ``signal.SIGHUP`` reference would raise AttributeError before any handler
    # had been installed.
    for sig_attr in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, sig_attr, None)
        if sig is None:
            continue  # Signal not defined on this platform
        try:
            signal.signal(sig, signal_handler)
        except (OSError, ValueError):
            # Deliberate best-effort: the handler cannot be installed for this
            # signal in the current context, so leave the existing one alone.
            pass
def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs): def start_server(backend: Backend, routes: List[web.RouteDef], **kwargs):
_setup_signal_handlers()
try: try:
log.debug("getting certificate...") log.debug("getting certificate...")
use_ssl = os.environ.get("USE_SSL", "false") == "true" use_ssl = os.environ.get("USE_SSL", "false") == "true"