From 59ef177d5f8638b967f3811d542a416b2e4b54b4 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 12 Feb 2025 12:26:15 +0100
Subject: [PATCH] Torch 2.6, fork of rotary, eetq updated.

---
 Dockerfile           | 22 +++++++++++-----------
 server/Makefile-eetq |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 63d11845..f5e764ad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,7 +48,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
 WORKDIR /usr/src/
 
 # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
-ARG PYTORCH_VERSION=2.5.1
+ARG PYTORCH_VERSION=2.6
 ARG PYTHON_VERSION=3.11
 
 # Keep in sync with `server/pyproject.toml
@@ -87,7 +87,7 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att Makefile
 
 # Build specific version of flash attention
-RUN make build-flash-attention
+RUN . .venv/bin/activate && make build-flash-attention
 
 # Build Flash Attention v2 CUDA kernels
 FROM kernel-builder AS flash-att-v2-builder
@@ -97,14 +97,14 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile
 
 # Build specific version of flash attention v2
-RUN make build-flash-attention-v2-cuda
+RUN . .venv/bin/activate && make build-flash-attention-v2-cuda
 
 # Build Transformers exllama kernels
 FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
 
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build Transformers exllama kernels
 FROM kernel-builder AS exllamav2-kernels-builder
@@ -112,47 +112,47 @@ WORKDIR /usr/src
 COPY server/Makefile-exllamav2/ Makefile
 
 # Build specific version of transformers
-RUN make build-exllamav2
+RUN . .venv/bin/activate && make build-exllamav2
 
 # Build Transformers awq kernels
 FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 # Build specific version of transformers
-RUN make build-awq
+RUN . .venv/bin/activate && make build-awq
 
 # Build eetq kernels
 FROM kernel-builder AS eetq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-eetq Makefile
 # Build specific version of transformers
-RUN make build-eetq
+RUN . .venv/bin/activate && make build-eetq
 
 # Build Lorax Punica kernels
 FROM kernel-builder AS lorax-punica-builder
 WORKDIR /usr/src
 COPY server/Makefile-lorax-punica Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
 
 # Build Transformers CUDA kernels
 FROM kernel-builder AS custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
 # Build specific version of transformers
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build mamba kernels
 FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src
 COPY server/Makefile-selective-scan Makefile
-RUN make build-all
+RUN . .venv/bin/activate && make build-all
 
 # Build flashinfer
 FROM kernel-builder AS flashinfer-builder
 WORKDIR /usr/src
 COPY server/Makefile-flashinfer Makefile
-RUN make install-flashinfer
+RUN . .venv/bin/activate && make install-flashinfer
 
 # Text Generation Inference base image
 FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base
diff --git a/server/Makefile-eetq b/server/Makefile-eetq
index 5b53122b..1b97f6fd 100644
--- a/server/Makefile-eetq
+++ b/server/Makefile-eetq
@@ -1,4 +1,4 @@
-eetq_commit := 81e0b14d64088d58ef6acd2c8f3e788d59324407
+eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491
 
 eetq:
 	# Clone eetq
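
Illustrative sketch, not part of the patch: the recurring change across the kernel-builder stages is sourcing the build virtualenv before each compile step, so that `python`, `pip`, and the Torch 2.6 install resolve from the venv rather than any system interpreter. The pattern looks like the following, assuming (as the patch implies but does not show) that the shared `kernel-builder` stage creates a virtualenv at /usr/src/.venv; the stage and Makefile names here are hypothetical.

    # Hypothetical builder stage; only the shape of the RUN line matters.
    FROM kernel-builder AS example-kernels-builder
    WORKDIR /usr/src
    COPY server/Makefile-example Makefile
    # Docker's default shell is /bin/sh, which lacks `source`, so the POSIX
    # `.` builtin is used. Activation prepends /usr/src/.venv/bin to PATH for
    # this RUN only (each RUN is a fresh shell), so the make-invoked build
    # compiles against the venv's Torch 2.6.
    RUN . .venv/bin/activate && make build-example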