From 3f9b3f4539a25e64071527d5b6e4306644dd46a8 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Tue, 9 Jan 2024 14:28:55 +0100
Subject: [PATCH 1/3] docs: update required CUDA version to 12.2

---
 README.md                       | 2 +-
 docs/source/quicktour.md        | 2 +-
 docs/source/supported_models.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index d99b7306..1b3041d3 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ curl 127.0.0.1:8080/generate \
     -H 'Content-Type: application/json'
 ```

-**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

 **Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.3-rocm --model-id $model` instead of the command above.

diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md
index 03ea03bc..e9a33f04 100644
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -13,7 +13,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf

-To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.

diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index 0708c729..dce4f2f9 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -39,7 +39,7 @@ text-generation-launcher --model-id

 ## Supported Hardware

-TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
+TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.

 TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future:
 * Quantization (GPTQ, AWQ, etc.)
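Since this patch moves the documented driver recommendation from CUDA 11.8 to 12.2, a quick local check can catch an outdated driver before launching the container. The sketch below is not part of TGI; it assumes the `nvidia-ml-py` (pynvml) package is installed and simply compares the driver-reported CUDA version against 12.2.

```python
# Hypothetical pre-flight check, not part of TGI: verify that the NVIDIA
# driver exposes CUDA 12.2 or newer before starting the Docker container.
import pynvml  # provided by the nvidia-ml-py package

REQUIRED = (12, 2)

pynvml.nvmlInit()
try:
    # Returns an encoded integer, e.g. 12020 for CUDA 12.2
    raw = pynvml.nvmlSystemGetCudaDriverVersion()
    major, minor = raw // 1000, (raw % 1000) // 10
    if (major, minor) >= REQUIRED:
        print(f"Driver CUDA {major}.{minor}: meets the 12.2+ recommendation.")
    else:
        print(f"Driver CUDA {major}.{minor}: older than 12.2, consider upgrading the NVIDIA driver.")
finally:
    pynvml.nvmlShutdown()
```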
From 564f2a3b755edee70518cb5ab30b1ffda717e105 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Tue, 9 Jan 2024 15:21:00 +0100
Subject: [PATCH 2/3] fix: fix local loading for .bin models (#1419)

---
 server/text_generation_server/cli.py        | 2 +-
 server/text_generation_server/utils/peft.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index 1d67d7eb..403f46e7 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -198,7 +198,7 @@ def download_weights(
             if not extension == ".safetensors" or not auto_convert:
                 raise e

-    else:
+    elif (Path(model_id) / "adapter_config.json").exists():
         # Try to load as a local PEFT model
         try:
             utils.download_and_unload_peft(
diff --git a/server/text_generation_server/utils/peft.py b/server/text_generation_server/utils/peft.py
index 45e23320..48ca264b 100644
--- a/server/text_generation_server/utils/peft.py
+++ b/server/text_generation_server/utils/peft.py
@@ -10,8 +10,7 @@ from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
 def download_and_unload_peft(model_id, revision, trust_remote_code):
     torch_dtype = torch.float16

-    logger.info("Peft model detected.")
-    logger.info("Loading the model it might take a while without feedback")
+    logger.info("Trying to load a Peft model. It might take a while without feedback")
     try:
         model = AutoPeftModelForCausalLM.from_pretrained(
             model_id,
@@ -28,7 +27,7 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
             trust_remote_code=trust_remote_code,
             low_cpu_mem_usage=True,
         )

-    logger.info(f"Loaded.")
+    logger.info("Peft model detected.")
     logger.info(f"Merging the lora weights.")
     base_model_id = model.peft_config["default"].base_model_name_or_path
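The cli.py change above narrows the fallback branch: a local directory is only routed through the PEFT loading path when it actually contains an `adapter_config.json`, so plain `.bin` checkpoints no longer hit `download_and_unload_peft`. A rough standalone sketch of that dispatch follows; names such as `classify_local_model` are illustrative and not part of TGI.

```python
# Minimal sketch (not the TGI code itself) of how a local model directory can
# be classified before loading, mirroring the adapter_config.json check.
from pathlib import Path

def classify_local_model(model_id: str) -> str:
    model_dir = Path(model_id)
    if not model_dir.is_dir():
        return "remote"        # would be resolved through the Hugging Face Hub instead
    if (model_dir / "adapter_config.json").exists():
        return "local-peft"    # PEFT adapter: merge it into the base model first
    if any(model_dir.glob("*.safetensors")) or any(model_dir.glob("*.bin")):
        return "local-weights" # plain weights: load (and convert) directly
    return "empty"

print(classify_local_model("/data/my-model"))  # hypothetical local path
```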
From 91d72675342e34c314a0d7cc9bb9ca9d8f5aa295 Mon Sep 17 00:00:00 2001
From: "R. P. Ruiz" <42214371+deepily@users.noreply.github.com>
Date: Tue, 9 Jan 2024 10:19:31 -0500
Subject: [PATCH 3/3] Fix missing make target platform for local install: 'install-flash-attention-v2' (#1414)

---
 server/text_generation_server/utils/flash_attn.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
index 02f01e65..48f8ef70 100644
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
@@ -23,10 +23,15 @@ try:
     try:
         import flash_attn_2_cuda
     except ImportError:
+        architecture_suffix = ""
+        if IS_CUDA_SYSTEM:
+            architecture_suffix = "-cuda"
+        elif IS_ROCM_SYSTEM:
+            architecture_suffix = "-rocm"
         raise ImportError(
             "Flash Attention V2 is not installed.\n"
             "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-            "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
+            f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
         )
     if not (is_sm8x or is_sm90):
         raise ImportError(
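The patch above only changes the error message so that the suggested make target carries the right platform suffix. A minimal sketch of that selection outside TGI, assuming the same platform detection as the flags it uses (`torch.version.cuda` / `torch.version.hip` standing in for `IS_CUDA_SYSTEM` / `IS_ROCM_SYSTEM`), prints the command a local install would need:

```python
# Sketch of the platform-suffix selection: choose the flash-attention-v2 make
# target based on whether the local PyTorch build is CUDA or ROCm.
import torch

IS_CUDA_SYSTEM = torch.version.cuda is not None
IS_ROCM_SYSTEM = torch.version.hip is not None

architecture_suffix = ""
if IS_CUDA_SYSTEM:
    architecture_suffix = "-cuda"
elif IS_ROCM_SYSTEM:
    architecture_suffix = "-rocm"

# e.g. "cd server && make install install-flash-attention-v2-cuda" on a CUDA box
print(f"cd server && make install install-flash-attention-v2{architecture_suffix}")
```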