Compare commits

...

6 Commits

Author SHA1 Message Date
Colter Downing 25db78e39d Fix healthcheck with separate session 2025-10-01 18:04:31 -07:00
Colter-Downing 1b37054dec Merge pull request #28 from vast-ai/bugfix/backend-timeout-infinite
Bugfix/backend timeout infinite
2025-08-28 11:22:33 -07:00
Colter-Downing 1a1e4174b8 Merge pull request #29 from vast-ai/bugfix/comfyui-json-cost-fix
Set cost to 100
2025-08-28 11:22:21 -07:00
Rob Ballantyne 1e4fa87437 Prevent timeout and allow long running connections 2025-08-28 15:48:57 +01:00
Rob Ballantyne 4c5fa03c7b adds import for ClientTimeout 2025-08-27 20:54:27 +01:00
Rob Ballantyne a8fe74f771 Remove default 300s timeout 2025-08-27 18:34:45 +01:00
+21 -4
View File
@@ -11,7 +11,7 @@ from functools import cached_property
from distutils.util import strtobool
from anyio import open_file
from aiohttp import web, ClientResponse, ClientSession, ClientConnectorError
from aiohttp import web, ClientResponse, ClientSession, ClientConnectorError, ClientTimeout, TCPConnector
import requests
from Crypto.Signature import pkcs1_15
@@ -75,7 +75,13 @@ class Backend:
@cached_property
def session(self):
# Builds the aiohttp ClientSession bound to self.model_server_url. Because this is a
# cached_property, the session is created on first access and reused afterwards.
log.debug(f"starting session with {self.model_server_url}")
# NOTE(review): this text is a diff view with its +/- markers stripped. The
# one-argument return just below appears to be the pre-change line that the
# connector/timeout lines after it replace — confirm against the raw diff.
return ClientSession(self.model_server_url)
# Post-change construction: a connector that closes each connection after use.
connector = TCPConnector(
force_close=True, # Required for long running jobs
enable_cleanup_closed=True,
)
# total=None removes the overall per-request time limit (see aiohttp ClientTimeout docs),
# so long-running requests to the model server are not cut off by the client.
timeout = ClientTimeout(total=None)
return ClientSession(self.model_server_url, timeout=timeout, connector=connector)
def create_handler(
self,
@@ -184,18 +190,30 @@ class Backend:
log.debug(f"Exception in main handler loop {e}")
return web.Response(status=500)
@cached_property
def healthcheck_session(self):
    """Return the client session reserved for healthcheck requests.

    Kept separate from the API session so healthcheck traffic does not share
    connections or timeout settings with it. The session has no base URL, so
    callers pass the full healthcheck URL. Created on first access and cached
    on the instance.
    """
    log.debug("creating dedicated healthcheck session")
    return ClientSession(
        # Each healthcheck uses a fresh connection that is closed afterwards,
        # keeping it isolated from any other traffic.
        connector=TCPConnector(force_close=True, enable_cleanup_closed=True),
        # Healthchecks should answer quickly; give up after 10 seconds total.
        timeout=ClientTimeout(total=10),
    )
async def __healthcheck(self):
health_check_url = self.benchmark_handler.healthcheck_endpoint
if health_check_url is None:
log.debug("No healthcheck endpoint defined, skipping healthcheck")
return
while True:
await sleep(10)
if self.__start_healthcheck is False:
continue
try:
log.debug(f"Performing healthcheck on {health_check_url}")
async with self.session.get(health_check_url) as response:
async with self.healthcheck_session.get(health_check_url) as response:
if response.status == 200:
log.debug("Healthcheck successful")
elif response.status == 503:
@@ -204,7 +222,6 @@ class Backend:
f"Healthcheck failed with status: {response.status}"
)
else:
# endpoint not ready yet so bail
log.debug(f"Healthcheck Endpoint not ready: {response.status}")
except Exception as e:
log.debug(f"Healthcheck failed with exception: {e}")