huggingface/text-generation-inference
Commit 6545383861: Merge branch 'main' into fix_local_load_for_medusa
@@ -74,7 +74,7 @@ curl 127.0.0.1:8080/generate \
     -H 'Content-Type: application/json'
 ```
 
-**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
 
 **Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.3-rocm --model-id $model` instead of the command above.
 
@@ -13,7 +13,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 
 <Tip warning={true}>
 
-To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
 
 </Tip>
 
@@ -39,7 +39,7 @@ text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
 
 ## Supported Hardware
 
-TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
+TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
 
 TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future:
 * Quantization (GPTQ, AWQ, etc.)
@@ -227,7 +227,7 @@ def download_weights(
         except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
             pass
 
-    else:
+    elif (Path(model_id) / "adapter_config.json").exists():
         # Try to load as a local PEFT model
         try:
             utils.download_and_unload_peft(
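The change above makes `download_weights` treat a local directory as a PEFT adapter only when it actually ships an `adapter_config.json`, instead of falling into the local-adapter branch unconditionally. A minimal sketch of that check, not TGI's actual code; `merge_adapter` is a hypothetical callable standing in for `utils.download_and_unload_peft`:

```python
# Sketch of the local-adapter detection introduced above. `merge_adapter` is a
# hypothetical callable standing in for utils.download_and_unload_peft.
from pathlib import Path
from typing import Callable


def maybe_merge_local_adapter(model_id: str, merge_adapter: Callable[[str], None]) -> bool:
    """Merge adapter weights when `model_id` is a local PEFT checkout; return True if merged."""
    if (Path(model_id) / "adapter_config.json").exists():
        # Only adapter weights live here; fold them into the base model so the
        # rest of the weight-download pipeline sees an ordinary checkpoint.
        merge_adapter(model_id)
        return True
    return False


if __name__ == "__main__":
    # Toy usage: nothing is merged because the path has no adapter_config.json.
    print(maybe_merge_local_adapter("/tmp/not-an-adapter", merge_adapter=print))
```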
@@ -23,10 +23,15 @@ try:
     try:
         import flash_attn_2_cuda
     except ImportError:
+        architecture_suffix = ""
+        if IS_CUDA_SYSTEM:
+            architecture_suffix = "-cuda"
+        elif IS_ROCM_SYSTEM:
+            architecture_suffix = "-rocm"
         raise ImportError(
             "Flash Attention V2 is not installed.\n"
             "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-            "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
+            f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
         )
     if not (is_sm8x or is_sm90):
         raise ImportError(
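The added lines make the install hint architecture-aware: the Makefile target named in the error message gains a `-cuda` or `-rocm` suffix depending on the detected platform. A standalone rendering of that message construction, with the platform flags hard-coded here purely as an assumption for the example:

```python
# Standalone rendering of the architecture-aware install hint added above.
# IS_CUDA_SYSTEM / IS_ROCM_SYSTEM are hard-coded stand-ins for TGI's
# import-time platform detection.
IS_CUDA_SYSTEM = True
IS_ROCM_SYSTEM = False


def flash_attention_v2_install_hint() -> str:
    architecture_suffix = ""
    if IS_CUDA_SYSTEM:
        architecture_suffix = "-cuda"
    elif IS_ROCM_SYSTEM:
        architecture_suffix = "-rocm"
    return (
        "Flash Attention V2 is not installed.\n"
        "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
        f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
    )


print(flash_attention_v2_install_hint())  # ends with ...install-flash-attention-v2-cuda`
```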
@@ -10,8 +10,7 @@ from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
 def download_and_unload_peft(model_id, revision, trust_remote_code):
     torch_dtype = torch.float16
 
-    logger.info("Peft model detected.")
-    logger.info("Loading the model it might take a while without feedback")
+    logger.info("Trying to load a Peft model. It might take a while without feedback")
     try:
         model = AutoPeftModelForCausalLM.from_pretrained(
             model_id,
@@ -28,7 +27,7 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
             trust_remote_code=trust_remote_code,
             low_cpu_mem_usage=True,
         )
-    logger.info(f"Loaded.")
+    logger.info("Peft model detected.")
     logger.info(f"Merging the lora weights.")
 
     base_model_id = model.peft_config["default"].base_model_name_or_path
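For context, the surrounding function (only partially visible in these hunks) loads the adapter together with its base model, merges the LoRA weights, and writes out a plain checkpoint. A condensed sketch of that flow under those assumptions; the exact save paths and options in TGI may differ:

```python
# Condensed sketch of the merge flow around the logging changes above; paths
# and save options are illustrative, not TGI's exact behaviour.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer


def merge_and_save_adapter(adapter_dir: str, output_dir: str) -> None:
    # Load the adapter plus its base model (the base is downloaded if needed).
    model = AutoPeftModelForCausalLM.from_pretrained(
        adapter_dir,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    base_model_id = model.peft_config["default"].base_model_name_or_path

    # Fold the LoRA deltas into the base weights and drop the PEFT wrappers.
    merged = model.merge_and_unload()

    # Persist plain weights and the base model's tokenizer so the directory can
    # be loaded like any ordinary checkpoint afterwards.
    merged.save_pretrained(output_dir, safe_serialization=True)
    AutoTokenizer.from_pretrained(base_model_id).save_pretrained(output_dir)
```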