mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
Separate build process.
This commit is contained in:
parent
c6e702fb2f
commit
3ec3adde2f
13
Dockerfile
13
Dockerfile
@ -108,6 +108,17 @@ COPY server/Makefile-flash-att-v2 Makefile
|
|||||||
# Build specific version of flash attention v2
|
# Build specific version of flash attention v2
|
||||||
RUN make build-flash-attention-v2
|
RUN make build-flash-attention-v2
|
||||||
|
|
||||||
|
# Build Transformers exllama kernels
|
||||||
|
FROM kernel-builder as exllama-kernels-builder
|
||||||
|
|
||||||
|
WORKDIR /usr/src
|
||||||
|
|
||||||
|
COPY server/exllama_kernels/ .
|
||||||
|
|
||||||
|
|
||||||
|
# Build specific version of transformers
|
||||||
|
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
|
||||||
|
|
||||||
# Build Transformers CUDA kernels
|
# Build Transformers CUDA kernels
|
||||||
FROM kernel-builder as custom-kernels-builder
|
FROM kernel-builder as custom-kernels-builder
|
||||||
|
|
||||||
@ -161,6 +172,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
|
|||||||
|
|
||||||
# Copy build artifacts from custom kernels builder
|
# Copy build artifacts from custom kernels builder
|
||||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||||
|
# Copy build artifacts from exllama kernels builder
|
||||||
|
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||||
|
|
||||||
# Copy builds artifacts from vllm builder
|
# Copy builds artifacts from vllm builder
|
||||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||||
|
@ -14,17 +14,6 @@ setup(
|
|||||||
sources=["custom_kernels/fused_attention_cuda.cu"],
|
sources=["custom_kernels/fused_attention_cuda.cu"],
|
||||||
extra_compile_args=["-arch=compute_80", "-std=c++17"],
|
extra_compile_args=["-arch=compute_80", "-std=c++17"],
|
||||||
),
|
),
|
||||||
CUDAExtension(
|
|
||||||
name="custom_kernels.exllama",
|
|
||||||
sources=[
|
|
||||||
"custom_kernels/exllama/exllama_ext.cpp",
|
|
||||||
"custom_kernels/exllama/cuda_buffers.cu",
|
|
||||||
"custom_kernels/exllama/cuda_func/column_remap.cu",
|
|
||||||
"custom_kernels/exllama/cuda_func/q4_matmul.cu",
|
|
||||||
"custom_kernels/exllama/cuda_func/q4_matrix.cu"
|
|
||||||
],
|
|
||||||
sources=["custom_kernels/fused_attention_cuda.cu"],
|
|
||||||
)
|
|
||||||
],
|
],
|
||||||
cmdclass={"build_ext": BuildExtension},
|
cmdclass={"build_ext": BuildExtension},
|
||||||
)
|
)
|
||||||
|
19
server/exllama_kernels/setup.py
Normal file
19
server/exllama_kernels/setup.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from setuptools import setup
|
||||||
|
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
||||||
|
|
||||||
|
setup(
|
||||||
|
name="exllama_kernels",
|
||||||
|
ext_modules=[
|
||||||
|
CUDAExtension(
|
||||||
|
name="exllama_kernels",
|
||||||
|
sources=[
|
||||||
|
"exllama_kernels/exllama_ext.cpp",
|
||||||
|
"exllama_kernels/cuda_buffers.cu",
|
||||||
|
"exllama_kernels/cuda_func/column_remap.cu",
|
||||||
|
"exllama_kernels/cuda_func/q4_matmul.cu",
|
||||||
|
"exllama_kernels/cuda_func/q4_matrix.cu"
|
||||||
|
],
|
||||||
|
)
|
||||||
|
],
|
||||||
|
cmdclass={"build_ext": BuildExtension},
|
||||||
|
)
|
@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from custom_kernels.exllama import make_q4, q4_matmul, prepare_buffers, set_tuning_params
|
from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params
|
||||||
|
|
||||||
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
|
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
|
||||||
none_tensor = torch.empty((1, 1), device = "meta")
|
none_tensor = torch.empty((1, 1), device = "meta")
|
||||||
|
Loading…
Reference in New Issue
Block a user