diff --git a/Dockerfile b/Dockerfile index 168f2f97..70c60132 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile # Build specific version of flash attention v2 RUN make build-flash-attention-v2 +# Build Transformers exllama kernels +FROM kernel-builder as exllama-kernels-builder + +WORKDIR /usr/src + +COPY server/exllama_kernels/ . + + +# Build the exllama kernels for the CUDA architectures in TORCH_CUDA_ARCH_LIST +RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build + # Build Transformers CUDA kernels FROM kernel-builder as custom-kernels-builder @@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86 # Copy build artifacts from custom kernels builder COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages +# Copy build artifacts from exllama kernels builder +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages # Copy builds artifacts from vllm builder COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages diff --git a/server/custom_kernels/setup.py b/server/custom_kernels/setup.py index a070f9db..43b8ee4e 100644 --- a/server/custom_kernels/setup.py +++ b/server/custom_kernels/setup.py @@ -14,17 +14,6 @@ setup( sources=["custom_kernels/fused_attention_cuda.cu"], extra_compile_args=["-arch=compute_80", "-std=c++17"], ), - CUDAExtension( - name="custom_kernels.exllama", - sources=[ - "custom_kernels/exllama/exllama_ext.cpp", - "custom_kernels/exllama/cuda_buffers.cu", - "custom_kernels/exllama/cuda_func/column_remap.cu", - "custom_kernels/exllama/cuda_func/q4_matmul.cu", - "custom_kernels/exllama/cuda_func/q4_matrix.cu" - ], - sources=["custom_kernels/fused_attention_cuda.cu"], - ) ], cmdclass={"build_ext": BuildExtension}, ) diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_buffers.cu 
b/server/exllama_kernels/exllama_kernels/cuda_buffers.cu similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_buffers.cu rename to server/exllama_kernels/exllama_kernels/cuda_buffers.cu diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_buffers.cuh b/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_buffers.cuh rename to server/exllama_kernels/exllama_kernels/cuda_buffers.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_compat.cuh b/server/exllama_kernels/exllama_kernels/cuda_compat.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_compat.cuh rename to server/exllama_kernels/exllama_kernels/cuda_compat.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/column_remap.cu b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_func/column_remap.cu rename to server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/column_remap.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_func/column_remap.cuh rename to server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matmul.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matmul.cu rename to server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh similarity index 100% rename from 
server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matmul.cuh rename to server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matrix.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matrix.cu rename to server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu diff --git a/server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/cuda_func/q4_matrix.cuh rename to server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/exllama_ext.cpp b/server/exllama_kernels/exllama_kernels/exllama_ext.cpp similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/exllama_ext.cpp rename to server/exllama_kernels/exllama_kernels/exllama_ext.cpp diff --git a/server/custom_kernels/custom_kernels/exllama/matrix.cuh b/server/exllama_kernels/exllama_kernels/matrix.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/matrix.cuh rename to server/exllama_kernels/exllama_kernels/matrix.cuh diff --git a/server/custom_kernels/custom_kernels/exllama/tuning.h b/server/exllama_kernels/exllama_kernels/tuning.h similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/tuning.h rename to server/exllama_kernels/exllama_kernels/tuning.h diff --git a/server/custom_kernels/custom_kernels/exllama/util.cuh b/server/exllama_kernels/exllama_kernels/util.cuh similarity index 100% rename from server/custom_kernels/custom_kernels/exllama/util.cuh rename to server/exllama_kernels/exllama_kernels/util.cuh diff --git a/server/exllama_kernels/setup.py b/server/exllama_kernels/setup.py new file mode 100644 index 00000000..f06a72bd 
--- /dev/null +++ b/server/exllama_kernels/setup.py @@ -0,0 +1,19 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name="exllama_kernels", + ext_modules=[ + CUDAExtension( + name="exllama_kernels",  # module name imported by text_generation_server/utils/gptq/exllama.py + sources=[  # paths are relative to this setup.py's directory + "exllama_kernels/exllama_ext.cpp", + "exllama_kernels/cuda_buffers.cu", + "exllama_kernels/cuda_func/column_remap.cu", + "exllama_kernels/cuda_func/q4_matmul.cu", + "exllama_kernels/cuda_func/q4_matrix.cu" + ], + ) + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/utils/gptq/exllama.py index f1c23cca..512e3fcf 100644 --- a/server/text_generation_server/utils/gptq/exllama.py +++ b/server/text_generation_server/utils/gptq/exllama.py @@ -1,6 +1,6 @@ import torch -from custom_kernels.exllama import make_q4, q4_matmul, prepare_buffers, set_tuning_params +from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension none_tensor = torch.empty((1, 1), device = "meta")