more logs
This commit is contained in:
@@ -235,10 +235,14 @@ class Backend:
|
|||||||
log.debug("No healthcheck endpoint defined, skipping healthcheck")
|
log.debug("No healthcheck endpoint defined, skipping healthcheck")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
first_healthcheck = True
|
||||||
while True:
|
while True:
|
||||||
await sleep(10)
|
await sleep(10)
|
||||||
if self.__start_healthcheck is False:
|
if self.__start_healthcheck is False:
|
||||||
continue
|
continue
|
||||||
|
if first_healthcheck:
|
||||||
|
log.info(f"[healthcheck] First healthcheck starting (model is now loaded)")
|
||||||
|
first_healthcheck = False
|
||||||
try:
|
try:
|
||||||
log.debug(f"Performing healthcheck on {health_check_url}")
|
log.debug(f"Performing healthcheck on {health_check_url}")
|
||||||
async with self.healthcheck_session.get(health_check_url) as response:
|
async with self.healthcheck_session.get(health_check_url) as response:
|
||||||
@@ -412,10 +416,12 @@ class Backend:
|
|||||||
# await sleep(5)
|
# await sleep(5)
|
||||||
try:
|
try:
|
||||||
max_throughput = await run_benchmark()
|
max_throughput = await run_benchmark()
|
||||||
|
log.info(f"[benchmark] Benchmark complete, max_throughput={max_throughput}, setting healthcheck=True")
|
||||||
self.__start_healthcheck = True
|
self.__start_healthcheck = True
|
||||||
self.metrics._model_loaded(
|
self.metrics._model_loaded(
|
||||||
max_throughput=max_throughput,
|
max_throughput=max_throughput,
|
||||||
)
|
)
|
||||||
|
log.info(f"[benchmark] _model_loaded() called, returning from handle_log_line")
|
||||||
except ClientConnectorError as e:
|
except ClientConnectorError as e:
|
||||||
log.debug(
|
log.debug(
|
||||||
f"failed to connect to model api during benchmark"
|
f"failed to connect to model api during benchmark"
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
@@ -17,6 +18,14 @@ DELETE_REQUESTS_INTERVAL = 1
|
|||||||
log = logging.getLogger(__file__)
|
log = logging.getLogger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def _flush_logs():
|
||||||
|
"""Force flush all log handlers and stdout/stderr."""
|
||||||
|
for handler in logging.root.handlers:
|
||||||
|
handler.flush()
|
||||||
|
sys.stdout.flush()
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def get_url() -> str:
|
def get_url() -> str:
|
||||||
use_ssl = os.environ.get("USE_SSL", "false") == "true"
|
use_ssl = os.environ.get("USE_SSL", "false") == "true"
|
||||||
@@ -120,6 +129,7 @@ class Metrics:
|
|||||||
|
|
||||||
async def _send_metrics_loop(self) -> Awaitable[NoReturn]:
|
async def _send_metrics_loop(self) -> Awaitable[NoReturn]:
|
||||||
loop_count = 0
|
loop_count = 0
|
||||||
|
first_loaded_send_done = False
|
||||||
while True:
|
while True:
|
||||||
await sleep(METRICS_UPDATE_INTERVAL)
|
await sleep(METRICS_UPDATE_INTERVAL)
|
||||||
loop_count += 1
|
loop_count += 1
|
||||||
@@ -127,19 +137,32 @@ class Metrics:
|
|||||||
# Log heartbeat every 30 seconds to confirm loop is running
|
# Log heartbeat every 30 seconds to confirm loop is running
|
||||||
if loop_count % 30 == 0:
|
if loop_count % 30 == 0:
|
||||||
log.debug(f"[heartbeat] metrics loop alive, loop_count={loop_count}, model_loaded={self.system_metrics.model_is_loaded}")
|
log.debug(f"[heartbeat] metrics loop alive, loop_count={loop_count}, model_loaded={self.system_metrics.model_is_loaded}")
|
||||||
|
_flush_logs()
|
||||||
|
# Extra logging for first few iterations after model loads
|
||||||
|
if self.system_metrics.model_is_loaded and not first_loaded_send_done:
|
||||||
|
log.info(f"[transition] First iteration with model_loaded=True, loop_count={loop_count}, elapsed={elapsed:.1f}")
|
||||||
|
_flush_logs()
|
||||||
if self.system_metrics.model_is_loaded is False and elapsed >= 10:
|
if self.system_metrics.model_is_loaded is False and elapsed >= 10:
|
||||||
log.debug(f"sending loading model metrics after {int(elapsed)}s wait")
|
log.debug(f"sending loading model metrics after {int(elapsed)}s wait")
|
||||||
await self.__send_metrics_and_reset()
|
await self.__send_metrics_and_reset()
|
||||||
elif self.update_pending or elapsed > 10:
|
elif self.update_pending or elapsed > 10:
|
||||||
log.debug(f"sending loaded model metrics after {int(elapsed)}s wait")
|
log.debug(f"sending loaded model metrics after {int(elapsed)}s wait")
|
||||||
await self.__send_metrics_and_reset()
|
await self.__send_metrics_and_reset()
|
||||||
|
if self.system_metrics.model_is_loaded and not first_loaded_send_done:
|
||||||
|
first_loaded_send_done = True
|
||||||
|
log.info(f"[transition] First loaded metrics send complete, continuing to next iteration...")
|
||||||
|
_flush_logs()
|
||||||
|
|
||||||
def _model_loaded(self, max_throughput: float) -> None:
|
def _model_loaded(self, max_throughput: float) -> None:
|
||||||
|
log.info(f"MODEL LOADED: Setting model_is_loaded=True, max_throughput={max_throughput}")
|
||||||
|
_flush_logs()
|
||||||
self.system_metrics.model_loading_time = (
|
self.system_metrics.model_loading_time = (
|
||||||
time.time() - self.system_metrics.model_loading_start
|
time.time() - self.system_metrics.model_loading_start
|
||||||
)
|
)
|
||||||
self.system_metrics.model_is_loaded = True
|
self.system_metrics.model_is_loaded = True
|
||||||
self.model_metrics.max_throughput = max_throughput
|
self.model_metrics.max_throughput = max_throughput
|
||||||
|
log.info(f"MODEL LOADED: model_loading_time={self.system_metrics.model_loading_time}")
|
||||||
|
_flush_logs()
|
||||||
|
|
||||||
def _model_errored(self, error_msg: str) -> None:
|
def _model_errored(self, error_msg: str) -> None:
|
||||||
self.model_metrics.set_errored(error_msg)
|
self.model_metrics.set_errored(error_msg)
|
||||||
@@ -276,6 +299,7 @@ class Metrics:
|
|||||||
###########
|
###########
|
||||||
|
|
||||||
self.system_metrics.update_disk_usage()
|
self.system_metrics.update_disk_usage()
|
||||||
|
had_loadtime = loadtime_snapshot is not None and loadtime_snapshot > 0
|
||||||
|
|
||||||
sent = False
|
sent = False
|
||||||
for report_addr in self.report_addr:
|
for report_addr in self.report_addr:
|
||||||
@@ -284,8 +308,14 @@ class Metrics:
|
|||||||
break
|
break
|
||||||
|
|
||||||
if sent:
|
if sent:
|
||||||
|
if had_loadtime:
|
||||||
|
log.info(f"FIRST LOADTIME METRICS SENT SUCCESSFULLY! loadtime={loadtime_snapshot}")
|
||||||
|
_flush_logs()
|
||||||
# clear the one-shot loadtime only if we actually sent *this* value
|
# clear the one-shot loadtime only if we actually sent *this* value
|
||||||
self.system_metrics.reset(expected=loadtime_snapshot)
|
self.system_metrics.reset(expected=loadtime_snapshot)
|
||||||
self.update_pending = False
|
self.update_pending = False
|
||||||
self.model_metrics.reset()
|
self.model_metrics.reset()
|
||||||
self.last_metric_update = time.time()
|
self.last_metric_update = time.time()
|
||||||
|
if had_loadtime:
|
||||||
|
log.info(f"POST-SEND: reset complete, last_metric_update={self.last_metric_update}, continuing loop...")
|
||||||
|
_flush_logs()
|
||||||
|
|||||||
Reference in New Issue
Block a user