update dockerfile

This commit is contained in:
fxmarty 2024-06-20 15:36:46 +00:00
parent dccab72549
commit 65506e19bf
5 changed files with 325 additions and 689 deletions

View File

@ -2,3 +2,4 @@ aml
target target
server/transformers server/transformers
server/flash-attention server/flash-attention
hf_cache/

876
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -72,17 +72,18 @@ RUN chmod +x ~/mambaforge.sh && \
# Install pytorch # Install pytorch
# On arm64 we exit with an error code # On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \ # RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \ # "linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \ # *) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \ # /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
esac && \ # esac && \
/opt/conda/bin/conda clean -ya # /opt/conda/bin/conda clean -ya
RUN pip install torch --index-url https://download.pytorch.org/whl/cu121
# CUDA kernels builder image # CUDA kernels builder image
FROM pytorch-install as kernel-builder FROM pytorch-install as kernel-builder
ARG MAX_JOBS=8 # ARG MAX_JOBS=64
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ninja-build cmake \ ninja-build cmake \
@ -106,64 +107,66 @@ WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2 # Build specific version of flash attention v2
ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
RUN make build-flash-attention-v2-cuda RUN make build-flash-attention-v2-cuda
# Build Transformers exllama kernels # # Build Transformers exllama kernels
FROM kernel-builder as exllama-kernels-builder # FROM kernel-builder as exllama-kernels-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/exllama_kernels/ . # COPY server/exllama_kernels/ .
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build # RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers exllama kernels # Build Transformers exllama kernels
FROM kernel-builder as exllamav2-kernels-builder # FROM kernel-builder as exllamav2-kernels-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/exllamav2_kernels/ . # COPY server/exllamav2_kernels/ .
# Build specific version of transformers # Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build # RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# Build Transformers awq kernels # Build Transformers awq kernels
FROM kernel-builder as awq-kernels-builder # FROM kernel-builder as awq-kernels-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/Makefile-awq Makefile # COPY server/Makefile-awq Makefile
# Build specific version of transformers # # Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq # RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
# Build eetq kernels # # Build eetq kernels
FROM kernel-builder as eetq-kernels-builder # FROM kernel-builder as eetq-kernels-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/Makefile-eetq Makefile # COPY server/Makefile-eetq Makefile
# Build specific version of transformers # # Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq # RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
# Build Transformers CUDA kernels # # Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder # FROM kernel-builder as custom-kernels-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/custom_kernels/ . # COPY server/custom_kernels/ .
# Build specific version of transformers # # Build specific version of transformers
RUN python setup.py build # RUN python setup.py build
# Build vllm CUDA kernels # Build vllm CUDA kernels
FROM kernel-builder as vllm-builder FROM kernel-builder as vllm-builder
WORKDIR /usr/src WORKDIR /usr/src
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
COPY server/Makefile-vllm Makefile COPY server/Makefile-vllm Makefile
# Build specific version of vllm # Build specific version of vllm
RUN make build-vllm-cuda RUN make build-vllm-cuda
# Build mamba kernels # # Build mamba kernels
FROM kernel-builder as mamba-builder # FROM kernel-builder as mamba-builder
WORKDIR /usr/src # WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile # COPY server/Makefile-selective-scan Makefile
RUN make build-all # RUN make build-all
# Text Generation Inference base image # Text Generation Inference base image
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base FROM pytorch-install
# Conda env # Conda env
ENV PATH=/opt/conda/bin:$PATH \ ENV PATH=/opt/conda/bin:$PATH \
@ -184,7 +187,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Copy conda with PyTorch installed # Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda # COPY --from=pytorch-install /opt/conda /opt/conda
# Copy build artifacts from flash attention builder # Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
@ -194,23 +197,23 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.lin
# Copy build artifacts from flash attention v2 builder # Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from custom kernels builder # # Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder # # Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder # # Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from awq kernels builder # # Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from eetq kernels builder # # Copy build artifacts from eetq kernels builder
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy builds artifacts from vllm builder # Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from mamba builder # # Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages # COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages # COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
# Install flash-attention dependencies # Install flash-attention dependencies
RUN pip install einops --no-cache-dir RUN pip install einops --no-cache-dir
@ -237,18 +240,18 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# AWS Sagemaker compatible image # AWS Sagemaker compatible image
FROM base as sagemaker # FROM base as sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh # COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh # RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"] # ENTRYPOINT ["./entrypoint.sh"]
# Final image # # Final image
FROM base # FROM base
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh # COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh # RUN chmod +x /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"] # ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"] # CMD ["--json-output"]

View File

@ -48,7 +48,7 @@ minijinja = { git = "https://github.com/mitsuhiko/minijinja.git", rev = "5cd4efb
futures-util = "0.3.30" futures-util = "0.3.30"
regex = "1.10.3" regex = "1.10.3"
once_cell = "1.19.0" once_cell = "1.19.0"
image = "0.25.1" image = "0.24.9"
base64 = "0.22.0" base64 = "0.22.0"
[build-dependencies] [build-dependencies]

View File

@ -55,7 +55,7 @@ def initialize_torch_distributed():
backend = "nccl" backend = "nccl"
options = ProcessGroupNCCL.Options() options = ProcessGroupNCCL.Options()
options.is_high_priority_stream = True options.is_high_priority_stream = True
options._timeout = timedelta(seconds=60) options._timeout = timedelta(seconds=30)
else: else:
try: try:
import oneccl_bindings_for_pytorch import oneccl_bindings_for_pytorch
@ -79,7 +79,7 @@ def initialize_torch_distributed():
backend=backend, backend=backend,
world_size=WORLD_SIZE, world_size=WORLD_SIZE,
rank=RANK, rank=RANK,
timeout=timedelta(seconds=60), timeout=timedelta(seconds=30),
pg_options=options, pg_options=options,
) )
else: else: