Merge branch 'main' into fix_local_load_for_medusa

This commit is contained in:
PYNing 2024-01-10 10:53:00 +08:00
commit 6545383861
6 changed files with 12 additions and 8 deletions

View File

@ -74,7 +74,7 @@ curl 127.0.0.1:8080/generate \
-H 'Content-Type: application/json' -H 'Content-Type: application/json'
``` ```
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.3-rocm --model-id $model` instead of the command above. **Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.3-rocm --model-id $model` instead of the command above.

View File

@ -13,7 +13,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
<Tip warning={true}> <Tip warning={true}>
To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
</Tip> </Tip>

View File

@ -39,7 +39,7 @@ text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
## Supported Hardware ## Supported Hardware
TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future: TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future:
* Quantization (GPTQ, AWQ, etc.) * Quantization (GPTQ, AWQ, etc.)

View File

@ -227,7 +227,7 @@ def download_weights(
except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
pass pass
else: elif (Path(model_id) / "adapter_config.json").exists():
# Try to load as a local PEFT model # Try to load as a local PEFT model
try: try:
utils.download_and_unload_peft( utils.download_and_unload_peft(

View File

@ -23,10 +23,15 @@ try:
try: try:
import flash_attn_2_cuda import flash_attn_2_cuda
except ImportError: except ImportError:
architecture_suffix = ""
if IS_CUDA_SYSTEM:
architecture_suffix = "-cuda"
elif IS_ROCM_SYSTEM:
architecture_suffix = "-rocm"
raise ImportError( raise ImportError(
"Flash Attention V2 is not installed.\n" "Flash Attention V2 is not installed.\n"
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
"or install flash attention v2 with `cd server && make install install-flash-attention-v2`" f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
) )
if not (is_sm8x or is_sm90): if not (is_sm8x or is_sm90):
raise ImportError( raise ImportError(

View File

@ -10,8 +10,7 @@ from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
def download_and_unload_peft(model_id, revision, trust_remote_code): def download_and_unload_peft(model_id, revision, trust_remote_code):
torch_dtype = torch.float16 torch_dtype = torch.float16
logger.info("Peft model detected.") logger.info("Trying to load a Peft model. It might take a while without feedback")
logger.info("Loading the model it might take a while without feedback")
try: try:
model = AutoPeftModelForCausalLM.from_pretrained( model = AutoPeftModelForCausalLM.from_pretrained(
model_id, model_id,
@ -28,7 +27,7 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
low_cpu_mem_usage=True, low_cpu_mem_usage=True,
) )
logger.info(f"Loaded.") logger.info("Peft model detected.")
logger.info(f"Merging the lora weights.") logger.info(f"Merging the lora weights.")
base_model_id = model.peft_config["default"].base_model_name_or_path base_model_id = model.peft_config["default"].base_model_name_or_path