Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-21 23:12:07 +00:00
fix: add lora kernel to dockerfile, support running without kernels and refactors
parent d6cf63ca53
commit aa88c4fd3a
Dockerfile

@@ -144,6 +144,13 @@ COPY server/Makefile-marlin Makefile
 # Build specific version of transformers
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin
 
+# Build Lorax Punica kernels
+FROM kernel-builder as lorax-punica-builder
+WORKDIR /usr/src
+COPY server/Makefile-lorax-punica Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
@@ -214,6 +221,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
 COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from marlin kernels builder
 COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
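The new builder stage compiles the Punica kernels with the same TORCH_CUDA_ARCH_LIST as the other kernel builders, and the final image picks the built extension up from site-packages via the COPY above. A quick way to confirm the extension actually landed there; a minimal sketch, assuming the compiled module is importable as punica_kernels (the module name is an assumption, not shown in this diff):

# Illustrative import probe, not part of the repo; run inside the final image.
# Assumes the compiled extension is importable as "punica_kernels".
import importlib.util

spec = importlib.util.find_spec("punica_kernels")
if spec is None:
    print("punica_kernels missing: the server must use the no-kernel fallbacks")
else:
    print(f"punica_kernels found at {spec.origin}")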
server/Makefile-lorax-punica

@@ -1,9 +1,12 @@
 lorax_punica_commit := c71861a653412267dc27ec86013dd945ce3474bc
 
-lorax-punica: install-lorax-punica
-	git clone --no-checkout https://github.com/predibase/lorax.git
+build-lorax-punica:
+	if [ ! -d 'lorax-punica' ]; then \
+		git clone --no-checkout https://github.com/predibase/lorax.git lorax-punica; \
+	fi
+	cd lorax-punica && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
+	cd lorax-punica && git submodule update --init --recursive
+	cd lorax-punica/server/punica_kernels && python setup.py build
 
-install-lorax-punica:
-	cd lorax && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
-	cd lorax && git submodule update --init --recursive
-	cd lorax/server/punica_kernels && python setup.py install
+install-lorax-punica: build-lorax-punica
+	cd lorax-punica/server/punica_kernels && python setup.py install
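Splitting the old lorax-punica target into build-lorax-punica and install-lorax-punica lets the Dockerfile's builder stage invoke only `make build-lorax-punica` and copy the resulting build/lib.linux-* artifacts into the final image, while `make install-lorax-punica` still performs a full build-then-install for local setups; the `if [ ! -d 'lorax-punica' ]` guard also makes the clone idempotent, so rerunning the target no longer fails on an existing checkout.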
server/text_generation_server/models/model.py

@@ -90,18 +90,6 @@ class Model(ABC):
         self.loaded_adapters = set()
         self.static_adapter_id = adapter_id
 
-        # TODO: review moving adapter loading to the model
-        if adapter_id and adapter_id != BASE_MODEL_ADAPTER_ID:
-            pass
-            # download_adapter(adapter_id, adapter_source, api_token=None)
-            # self.load_adapter(
-            #     AdapterParameters(adapter_ids=[adapter_id]),
-            #     adapter_source,
-            #     adapter_index=0,
-            #     api_token=None,
-            #     dynamic=False,
-            # )
-
         if speculate is None:
             speculate = get_speculate()
         self.speculate = speculate
server/text_generation_server/utils/sgmv.py

@@ -136,6 +136,10 @@ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
     return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
 
 
+def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
+    return torch.empty((size,), dtype=torch.uint8, device=device)
+
+
 def get_tmp_expand_size(size: int) -> int:
     return _kernels.sgmv_cutlass_tmp_size(size)
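get_tmp_tensor_for_size derives tmp_size through the compiled Punica extension (the same _kernels module that get_tmp_expand_size calls), so it cannot run when the extension is absent; the new _no_kernels variant sizes the buffer directly from `size` and depends only on torch. The usual companion to such a fallback is a guarded import that sets an availability flag; a minimal sketch of that pattern, assuming the extension module is named punica_kernels and that a DISABLE_SGMV environment variable can force it off (both names are assumptions, not shown in this diff):

# Sketch of a guarded import with an availability flag (assumed names, not
# verbatim from the repo); has_sgmv() is what the next hunk gates on.
import os

try:
    import punica_kernels as _kernels  # compiled extension; may be absent

    HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", ""))
except ImportError:
    _kernels = None
    HAS_SGMV = False


def has_sgmv() -> bool:
    return HAS_SGMV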
@@ -143,12 +147,12 @@
 def get_tmp_tensors(
     nsegments: int, lora_rank: int, device: torch.device
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    if use_cutlass_shrink(lora_rank):
+    if use_cutlass_shrink(lora_rank) and has_sgmv():
         tmp = get_tmp_tensor_for_size(nsegments, device)
         return tmp, tmp
     else:
         tmp_shrink = get_tmp_tensor(device)
-        tmp_expand = get_tmp_tensor_for_size(nsegments, device)
+        tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
         return tmp_shrink, tmp_expand
 
 
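With that flag in place, get_tmp_tensors only takes the cutlass path when the kernels are actually importable; otherwise both scratch tensors come from plain torch allocations. A self-contained sketch of the patched branch with the kernel-dependent pieces stubbed out (the stubs and the rank threshold are illustrative, not the repo's values):

from typing import Tuple

import torch


def has_sgmv() -> bool:
    return False  # stub: pretend the punica extension is not installed


def use_cutlass_shrink(lora_rank: int) -> bool:
    return lora_rank < 16  # stub threshold, illustrative only


def get_tmp_tensor(device: torch.device) -> torch.Tensor:
    return torch.empty((1,), dtype=torch.uint8, device=device)


def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
    # The real version asks the punica extension for its workspace size.
    raise NotImplementedError("requires the compiled punica kernels")


def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
    return torch.empty((size,), dtype=torch.uint8, device=device)


def get_tmp_tensors(
    nsegments: int, lora_rank: int, device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Same shape as the patched function: without kernels, has_sgmv() is False,
    # so we always land in the torch-only branch below.
    if use_cutlass_shrink(lora_rank) and has_sgmv():
        tmp = get_tmp_tensor_for_size(nsegments, device)
        return tmp, tmp
    else:
        tmp_shrink = get_tmp_tensor(device)
        tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
        return tmp_shrink, tmp_expand


shrink, expand = get_tmp_tensors(4, 8, torch.device("cpu"))
print(shrink.shape, expand.shape)  # torch.Size([1]) torch.Size([4])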