diff --git a/Dockerfile_amd b/Dockerfile_amd
index f6dffac5..81050343 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -114,6 +114,14 @@ ARG BUILD_CAFFE2="0" \
 
 RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
 
+# Download a patched HIP runtime (authenticated via GITHUB_TOKEN) and preload it in place of the system libamdhip64.
+ARG GITHUB_TOKEN
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends wget && \
+    rm -rf /var/lib/apt/lists/* && \
+    wget --header "Authorization: token ${GITHUB_TOKEN}" -O /libamdhip64.so.6.2.41130 https://raw.githubusercontent.com/fxmarty/patched_hipruntime/main/libamdhip64.so.6.2.41130
+
+ENV LD_PRELOAD="/libamdhip64.so.6.2.41130"
+
 # Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
 # Disabled for now as it is currently not stable with ROCm 6.1.
 # ENV HIP_FORCE_DEV_KERNARG=1
diff --git a/Dockerfile_amd_nightly_no_patch b/Dockerfile_amd_nightly_no_patch
deleted file mode 100644
index 9d99fc16..00000000
--- a/Dockerfile_amd_nightly_no_patch
+++ /dev/null
@@ -1,199 +0,0 @@
-# Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
-WORKDIR /usr/src
-
-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
-FROM chef as planner
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo chef prepare --recipe-path recipe.json
-
-FROM chef AS builder
-
-ARG GIT_SHA
-ARG DOCKER_LABEL
-
-RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
-    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
-    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
-    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
-    rm -f $PROTOC_ZIP
-
-COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
-
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo build --release
-
-# Text Generation Inference base image for RoCm
-FROM rocm/dev-ubuntu-22.04:6.1 as base
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    ca-certificates \
-    ccache \
-    curl \
-    git \
-    make \
-    libssl-dev \
-    g++ \
-    # Needed to build VLLM & flash.
-    rocthrust-dev \
-    hipsparse-dev \
-    hipblas-dev \
-    hipblaslt-dev \
-    rocblas-dev \
-    hiprand-dev \
-    rocrand-dev \
-    miopen-hip-dev \
-    hipfft-dev \
-    hipcub-dev \
-    hipsolver-dev \
-    rccl-dev \
-    cmake \
-    python3-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-# Keep in sync with `server/pyproject.toml
-ARG MAMBA_VERSION=23.1.0-1
-ARG PYTORCH_VERSION='2.3.0'
-ARG ROCM_VERSION='6.0.2'
-ARG PYTHON_VERSION='3.10.10'
-# Automatically set by buildx
-ARG TARGETPLATFORM
-ENV PATH /opt/conda/bin:$PATH
-
-# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
-# Install mamba
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
-    *) MAMBA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    mamba init && \
-    rm ~/mambaforge.sh
-
-# Install flash-attention, torch dependencies
-RUN pip install numpy einops ninja --no-cache-dir
-
-RUN conda install intel::mkl-static intel::mkl-include
-
-RUN pip install --pre torch==2.4.0.dev20240506 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
-
-RUN pip uninstall -y triton && \
-    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
-    cd triton/python && \
-    pip install .
-
-# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
-# Disabled for now as it is currently not stable with ROCm 6.1.
-# ENV HIP_FORCE_DEV_KERNARG=1
-
-FROM base AS kernel-builder
-
-# # Build vllm kernels
-FROM kernel-builder AS vllm-builder
-WORKDIR /usr/src
-
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm-rocm
-
-# Build Flash Attention v2 kernels
-FROM kernel-builder AS flash-att-v2-builder
-WORKDIR /usr/src
-
-COPY server/Makefile-flash-att-v2 Makefile
-
-# Build specific version of flash attention v2
-RUN make build-flash-attention-v2-rocm
-
-# Build Transformers CUDA kernels (gpt-neox and bloom)
-FROM kernel-builder as custom-kernels-builder
-WORKDIR /usr/src
-COPY server/custom_kernels/ .
-RUN python setup.py build
-
-# Build exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-WORKDIR /usr/src
-COPY server/exllama_kernels/ .
-
-RUN python setup.py build
-
-# Build exllama v2 kernels
-FROM kernel-builder as exllamav2-kernels-builder
-WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
-
-RUN python setup.py build
-
-FROM base as base-copy
-
-# Text Generation Inference base env
-ENV HUGGINGFACE_HUB_CACHE=/data \
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
-    PORT=80
-
-# Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-
-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
-
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements_rocm.txt
-    #pip install ".[accelerate, peft, outlines]" --no-cache-dir
-
-# AWS Sagemaker compatible image
-FROM base as sagemaker
-
-COPY sagemaker-entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Final image
-FROM base-copy
-
-# COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
-# RUN chmod +x /tgi-entrypoint.sh
-
-# ENTRYPOINT ["/tgi-entrypoint.sh"]
-# CMD ["--json-output"]
diff --git a/Dockerfile_amd_rocm60 b/Dockerfile_amd_rocm60
deleted file mode 100644
index e3f38fbd..00000000
--- a/Dockerfile_amd_rocm60
+++ /dev/null
@@ -1,241 +0,0 @@
-# Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
-WORKDIR /usr/src
-
-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
-FROM chef as planner
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo chef prepare --recipe-path recipe.json
-
-FROM chef AS builder
-
-ARG GIT_SHA
-ARG DOCKER_LABEL
-
-RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
-    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
-    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
-    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
-    rm -f $PROTOC_ZIP
-
-COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
-
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo build --release
-
-# Text Generation Inference base image for RoCm
-FROM rocm/dev-ubuntu-22.04:6.0.2 as base
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    ca-certificates \
-    ccache \
-    curl \
-    git \
-    make \
-    libssl-dev \
-    g++ \
-    # Needed to build VLLM & flash.
-    rocthrust-dev \
-    hipsparse-dev \
-    hipblas-dev \
-    hipblaslt-dev \
-    rocblas-dev \
-    hiprand-dev \
-    rocrand-dev \
-    miopen-hip-dev \
-    hipfft-dev \
-    hipcub-dev \
-    hipsolver-dev \
-    rccl-dev \
-    wget \
-    python3-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-# Keep in sync with `server/pyproject.toml
-ARG MAMBA_VERSION=23.1.0-1
-ARG PYTORCH_VERSION='2.3.0'
-ARG ROCM_VERSION='6.0.2'
-ARG PYTHON_VERSION='3.10.10'
-# Automatically set by buildx
-ARG TARGETPLATFORM
-ENV PATH /opt/conda/bin:$PATH
-
-# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
-# Install mamba
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
-    *) MAMBA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    mamba init && \
-    rm ~/mambaforge.sh
-
-# Install flash-attention, torch dependencies
-RUN pip install numpy einops ninja --no-cache-dir
-
-ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-
-# Install cmake >= 3.25.2
-RUN wget https://cmake.org/files/v3.29/cmake-3.29.3.tar.gz && \
-    tar -xf cmake-3.29.3.tar.gz && \
-    cd cmake-3.29.3 && \
-    ./configure && \
-    make && \
-    make install
-
-RUN conda install intel::mkl-static=2024.1.0 intel::mkl-include=2024.1.0
-
-# Build HipblasLt
-RUN apt-get purge -y hipblaslt hipblaslt-dev && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends libmsgpack-dev && \
-    rm -rf /var/lib/apt/lists/* && \
-    mkdir -p libs && \
-    cd libs && \
-    git clone https://github.com/ROCm/hipBLASLt && \
-    cd hipBLASLt && \
-    git checkout 560c7e8f73788af47c2135425f7b6e4fa965b311 && \
-    pip install -r tensilelite/requirements.txt --no-cache-dir && \
-    SCCACHE_IDLE_TIMEOUT=1800 ./install.sh -i --architecture ${PYTORCH_ROCM_ARCH} && \
-    cd .. && rm -rf hipBLASLt && \
-    sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status && \
-    sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
-
-RUN pip uninstall -y triton && \
-    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
-    cd triton/python && \
-    pip install .
-
-RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
-
-ARG _GLIBCXX_USE_CXX11_ABI="1"
-ARG CMAKE_PREFIX_PATH="/opt/conda"
-ARG BUILD_CAFFE2="0" \
-    BUILD_CAFFE2_OPS="0" \
-    USE_CUDA="0" \
-    USE_ROCM="1" \
-    BUILD_TEST="0" \
-    USE_FBGEMM="0" \
-    USE_NNPACK="0" \
-    USE_QNNPACK="0" \
-    USE_XNNPACK="0" \
-    USE_FLASH_ATTENTION="1" \
-    USE_MEM_EFF_ATTENTION="0"
-
-RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
-
-# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
-# Disabled for now as it is currently not stable with ROCm 6.1.
-# ENV HIP_FORCE_DEV_KERNARG=1
-
-FROM base AS kernel-builder
-
-# # Build vllm kernels
-FROM kernel-builder AS vllm-builder
-WORKDIR /usr/src
-
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm-rocm
-
-# Build Flash Attention v2 kernels
-FROM kernel-builder AS flash-att-v2-builder
-WORKDIR /usr/src
-
-COPY server/Makefile-flash-att-v2 Makefile
-
-# Build specific version of flash attention v2
-RUN make build-flash-attention-v2-rocm
-
-# Build Transformers CUDA kernels (gpt-neox and bloom)
-FROM kernel-builder as custom-kernels-builder
-WORKDIR /usr/src
-COPY server/custom_kernels/ .
-RUN python setup.py build
-
-# Build exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-WORKDIR /usr/src
-COPY server/exllama_kernels/ .
-
-RUN python setup.py build
-
-# Build exllama v2 kernels
-FROM kernel-builder as exllamav2-kernels-builder
-WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
-
-RUN python setup.py build
-
-FROM base as base-copy
-
-# Text Generation Inference base env
-ENV HUGGINGFACE_HUB_CACHE=/data \
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
-    PORT=80
-
-# Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-
-# Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-
-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
-
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements_rocm.txt
-    #pip install ".[accelerate, peft, outlines]" --no-cache-dir
-
-# AWS Sagemaker compatible image
-FROM base as sagemaker
-
-COPY sagemaker-entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Final image
-FROM base-copy
-
-# COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
-# RUN chmod +x /tgi-entrypoint.sh
-
-# ENTRYPOINT ["/tgi-entrypoint.sh"]
-# CMD ["--json-output"]
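
Usage note for the Dockerfile_amd change above: the patched libamdhip64 runtime is fetched with an Authorization header, which suggests it lives in a private repository, so GITHUB_TOKEN must be supplied as a build argument. A minimal sketch of a build invocation (the image tag and token placeholder are arbitrary, not part of this patch):

    docker build -f Dockerfile_amd --build-arg GITHUB_TOKEN=<token> -t tgi-rocm .

Since values passed via --build-arg can end up recorded in the image history, a short-lived, minimally scoped token is advisable. At runtime, LD_PRELOAD makes the dynamic loader resolve symbols from /libamdhip64.so.6.2.41130 ahead of the ROCm-provided libamdhip64, which applies the patched HIP runtime to PyTorch and the compiled kernels without rebuilding them.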