From 9ebf1924ea5ecd34a803b313685131e0223a8110 Mon Sep 17 00:00:00 2001 From: Nader Arbabian Date: Wed, 11 Jun 2025 15:25:57 -0700 Subject: [PATCH] don't healthcheck endpoints until model is loaded and benchmarks have run --- lib/backend.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/lib/backend.py b/lib/backend.py index 2a92bd0..e01f7c4 100644 --- a/lib/backend.py +++ b/lib/backend.py @@ -60,6 +60,7 @@ class Backend: self.metrics = Metrics() self._total_pubkey_fetch_errors = 0 self._pubkey = self._fetch_pubkey() + self.__start_healthcheck: bool = False @property def pubkey(self) -> Optional[RSA.RsaKey]: @@ -191,23 +192,26 @@ class Backend: if health_check_url is None: log.debug("No healthcheck endpoint defined, skipping healthcheck") return - await sleep(5) - try: - log.debug(f"Performing healthcheck on {health_check_url}") - async with self.session.get(health_check_url) as response: - if response.status == 200: - log.debug("Healthcheck successful") - elif response.status == 503: - log.debug(f"Healthcheck failed with status: {response.status}") - self.backend_errored( - f"Healthcheck failed with status: {response.status}" - ) - else: - # endpoint not ready yet so bail - log.debug(f"Healthcheck Endpoint not ready: {response.status}") - except Exception as e: - log.debug(f"Healthcheck failed with exception: {e}") - self.backend_errored(str(e)) + while True: + await sleep(10) + if self.__start_healthcheck is False: + continue + try: + log.debug(f"Performing healthcheck on {health_check_url}") + async with self.session.get(health_check_url) as response: + if response.status == 200: + log.debug("Healthcheck successful") + elif response.status == 503: + log.debug(f"Healthcheck failed with status: {response.status}") + self.backend_errored( + f"Healthcheck failed with status: {response.status}" + ) + else: + # endpoint not ready yet so bail + log.debug(f"Healthcheck Endpoint not ready: {response.status}") + except Exception as e: + log.debug(f"Healthcheck failed with exception: {e}") + self.backend_errored(str(e)) async def _start_tracking(self) -> None: await gather( @@ -331,6 +335,7 @@ class Backend: await sleep(5) try: max_throughput = await run_benchmark() + self.__start_healthcheck = True self.metrics._model_loaded( max_throughput=max_throughput, )