diff --git a/README.md b/README.md index d99b7306..1b3041d3 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ curl 127.0.0.1:8080/generate \ -H 'Content-Type: application/json' ``` -**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. +**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. **Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.3-rocm --model-id $model` instead of the command above. diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 03ea03bc..e9a33f04 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -13,7 +13,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf -To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. +To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 0708c729..dce4f2f9 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -39,7 +39,7 @@ text-generation-launcher --model-id ## Supported Hardware -TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. +TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future: * Quantization (GPTQ, AWQ, etc.) diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 35542295..8624cef7 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -227,7 +227,7 @@ def download_weights( except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): pass - else: + elif (Path(model_id) / "adapter_config.json").exists(): # Try to load as a local PEFT model try: utils.download_and_unload_peft( diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 02f01e65..48f8ef70 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -23,10 +23,15 @@ try: try: import flash_attn_2_cuda except ImportError: + architecture_suffix = "" + if IS_CUDA_SYSTEM: + architecture_suffix = "-cuda" + elif IS_ROCM_SYSTEM: + architecture_suffix = "-rocm" raise ImportError( "Flash Attention V2 is not installed.\n" "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - "or install flash attention v2 with `cd server && make install install-flash-attention-v2`" + f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" ) if not (is_sm8x or is_sm90): raise ImportError( diff --git a/server/text_generation_server/utils/peft.py b/server/text_generation_server/utils/peft.py index 45e23320..48ca264b 100644 --- a/server/text_generation_server/utils/peft.py +++ b/server/text_generation_server/utils/peft.py @@ -10,8 +10,7 @@ from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM def download_and_unload_peft(model_id, revision, trust_remote_code): torch_dtype = torch.float16 - logger.info("Peft model detected.") - logger.info("Loading the model it might take a while without feedback") + logger.info("Trying to load a Peft model. It might take a while without feedback") try: model = AutoPeftModelForCausalLM.from_pretrained( model_id, @@ -28,7 +27,7 @@ def download_and_unload_peft(model_id, revision, trust_remote_code): trust_remote_code=trust_remote_code, low_cpu_mem_usage=True, ) - logger.info(f"Loaded.") + logger.info("Peft model detected.") logger.info(f"Merging the lora weights.") base_model_id = model.peft_config["default"].base_model_name_or_path