Trying to avoid the random timeout. (#2929)

* Trying to avoid the random timeout.

* More read timeout ?

* Longer timeout ?

* Remove legacy ENV directive.

* Remove the dummy test, only increase the read timeout.

* Wat?
This commit is contained in:
Nicolas Patry 2025-01-21 11:06:10 +01:00 committed by GitHub
parent 17367438f3
commit bdb3e488e4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 3 additions and 2 deletions

View File

@ -58,7 +58,7 @@ ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH /opt/conda/bin:$PATH
ENV PATH=/opt/conda/bin:$PATH
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \

View File

@ -562,6 +562,7 @@ def launcher(event_loop):
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
]
client.api.timeout = 1000
container = client.containers.run(
DOCKER_IMAGE,
command=args,
@ -573,7 +574,7 @@ def launcher(event_loop):
devices=devices,
volumes=volumes,
ports={"80/tcp": port},
healthcheck={"timeout": int(60 * 1e9), "retries": 2}, # 60s
healthcheck={"timeout": int(180 * 1e9), "retries": 2}, # 60s
shm_size="1G",
)