From 146e72c3be4f5bfe88b9baae4c2ff6a2ce41eda9 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Tue, 30 May 2023 12:52:18 +0200
Subject: [PATCH 1/3] fix(launcher): parse num cuda devices from
 CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES

---
 launcher/src/main.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index dc12c90f..a863ea4a 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -455,11 +455,11 @@ fn shutdown_shards(shutdown: Arc<Mutex<bool>>, shutdown_receiver: &mpsc::Receive
 }
 
 fn num_cuda_devices() -> Option<usize> {
-    if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") {
-        let n_devices = cuda_visible_devices.split(',').count();
-        return Some(n_devices);
-    }
-    None
+    let devices = env::var("CUDA_VISIBLE_DEVICES")
+        .map_err(|_| env::var("NVIDIA_VISIBLE_DEVICES"))
+        .ok()?;
+    let n_devices = devices.split(',').count();
+    Some(n_devices)
 }
 
 #[derive(Deserialize)]
@@ -509,9 +509,9 @@ fn find_num_shards(sharded: Option<bool>, num_shard: Option<usize>) -> usize {
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES");
-            let n_devices =
-                num_cuda_devices().expect("--num-shard and CUDA_VISIBLE_DEVICES are not set");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            let n_devices = num_cuda_devices()
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
             if n_devices <= 1 {
                 panic!("`sharded` is true but only found {n_devices} CUDA devices");
             }

From 49a6c8c1b28742e806dd95a36af82db5b45d181d Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Tue, 30 May 2023 13:27:48 +0200
Subject: [PATCH 2/3] fix(launcher): parse num cuda devices from
 CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES

---
 launcher/src/main.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index a863ea4a..0810d979 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -455,9 +455,10 @@ fn shutdown_shards(shutdown: Arc<Mutex<bool>>, shutdown_receiver: &mpsc::Receive
 }
 
 fn num_cuda_devices() -> Option<usize> {
-    let devices = env::var("CUDA_VISIBLE_DEVICES")
-        .map_err(|_| env::var("NVIDIA_VISIBLE_DEVICES"))
-        .ok()?;
+    let devices = match env::var("CUDA_VISIBLE_DEVICES") {
+        Ok(devices) => devices,
+        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+    };
     let n_devices = devices.split(',').count();
     Some(n_devices)
 }

From bf7f1d54345f9e6e091d4b756e8b8381fe767d89 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Tue, 30 May 2023 13:56:03 +0200
Subject: [PATCH 3/3] fix(server): fix quantization

---
 server/text_generation_server/models/bloom.py     | 14 ++++++--------
 server/text_generation_server/models/galactica.py | 14 ++++++--------
 server/text_generation_server/models/gpt_neox.py  | 14 ++++++--------
 server/text_generation_server/models/t5.py        | 14 ++++++--------
 4 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
index 088a1457..90db59fc 100644
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -245,14 +245,12 @@ class BLOOMSharded(BLOOM):
                                 return linear
 
                             module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")
 
                     module._parameters[param_name] = tensor
                     if name == "word_embeddings.weight":
diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py
index 0a3f341b..954421f0 100644
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -364,14 +364,12 @@ class GalacticaSharded(Galactica):
                                 return linear
 
                             module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")
 
                     module._parameters[param_name] = tensor
                     if name == "model.decoder.embed_tokens.weight":
diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py
index e4a85082..c0e8170d 100644
--- a/server/text_generation_server/models/gpt_neox.py
+++ b/server/text_generation_server/models/gpt_neox.py
@@ -210,14 +210,12 @@ class GPTNeoxSharded(CausalLM):
                                 return linear
 
                             module.linear = replace_linear(state)
-                        elif quantize == "gptq":
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq":
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None:
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")
 
                     if current_parameter_tensor is not None:
                         module._parameters[param_name] = tensor
diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py
index 17cc50e0..32dcb806 100644
--- a/server/text_generation_server/models/t5.py
+++ b/server/text_generation_server/models/t5.py
@@ -223,14 +223,12 @@ class T5Sharded(Seq2SeqLM):
 
                             module.linear = replace_linear(state)
 
-                        elif quantize == "gptq" and not module_name.endswith("wo"):
-                            raise NotImplementedError(
-                                "`gptq` is not implemented for now"
-                            )
-                        elif quantize is None:
-                            tensor = tensor.to(device)
-                        else:
-                            raise ValueError(f"Unexpected quantize `{quantize}`")
+                    elif quantize == "gptq" and not module_name.endswith("wo"):
+                        raise NotImplementedError("`gptq` is not implemented for now")
+                    elif quantize is None or module_name.endswith("wo"):
+                        tensor = tensor.to(device)
+                    else:
+                        raise ValueError(f"Unexpected quantize `{quantize}`")
 
                     if current_parameter_tensor is not None:
                         module._parameters[param_name] = tensor