From 686cc6671705c666b767fffe71b2ed9c9b6fccd1 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Fri, 30 Dec 2022 10:30:42 -0800
Subject: [PATCH 1/2] fix(server): Check for device type correctly when determining initial padding (#16)

AFAIK there is no torch device type called "gpu".
---
 server/text_generation/models/causal_lm.py  | 2 +-
 server/text_generation/models/seq2seq_lm.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation/models/causal_lm.py b/server/text_generation/models/causal_lm.py
index aeecf127..3ad36219 100644
--- a/server/text_generation/models/causal_lm.py
+++ b/server/text_generation/models/causal_lm.py
@@ -65,7 +65,7 @@ class CausalLMBatch:
             )
             all_logprobs.append(None)
 
-        pad_to_multiple_of = 8 if "gpu" in str(device) else None
+        pad_to_multiple_of = 8 if device.type == "cuda" else None
         tokenized_inputs = tokenizer(
             inputs,
             return_tensors="pt",
diff --git a/server/text_generation/models/seq2seq_lm.py b/server/text_generation/models/seq2seq_lm.py
index fc80c60c..4095db92 100644
--- a/server/text_generation/models/seq2seq_lm.py
+++ b/server/text_generation/models/seq2seq_lm.py
@@ -77,7 +77,7 @@ class Seq2SeqLMBatch:
             decoder_logprobs.append(None)
 
         # Tokenize batch
-        pad_to_multiple_of = 8 if "gpu" in str(device) else None
+        pad_to_multiple_of = 8 if device.type == "cuda" else None
         tokenized_inputs = tokenizer(
             inputs,
             return_tensors="pt",

From 3efa5bbbfd5868695da4d5d9ad23d81f48f1e5a8 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Fri, 30 Dec 2022 10:31:44 -0800
Subject: [PATCH 2/2] fix(router): Include special tokens when tokenizing (#14)

There's currently a discrepancy in the tokenization between the router
and Python server code. The latter includes special tokens but the
former does not.

This results in a token count mismatch for seq2seq models such as mt0,
where the tokenizer emits an EOS token at the end. This in turn results
in some unexpected/incorrect output, in particular when batch
concatenation is involved, because the Python code uses the input
length passed from the router for each row.

As far as I can tell, it is better to include this token in the encoder
`input_ids`, so I guess it's best to just adjust on the router side.
---
 router/src/validation.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/router/src/validation.rs b/router/src/validation.rs
index ff659b3e..f6da1913 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -131,7 +131,7 @@ fn validation_worker(
         }
 
         // Get the number of tokens in the input
-        match tokenizer.encode(request.inputs.clone(), false) {
+        match tokenizer.encode(request.inputs.clone(), true) {
             Ok(inputs) => {
                 let input_length = inputs.len();
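
Editor's note: below is a minimal, illustrative Python sketch (not part of either patch) of the behaviour the two fixes rely on. The torch portion runs as-is with PyTorch installed and needs no GPU; the commented-out tokenizer portion assumes the transformers library plus network access, and the checkpoint name "bigscience/mt0-small" is only an assumption for illustration.

    # Patch 1: torch device types are "cpu", "cuda", "mps", etc., never "gpu",
    # so the old substring check could never enable padding to a multiple of 8.
    import torch

    device = torch.device("cuda:0")      # constructing the device needs no GPU
    print("gpu" in str(device))          # False: the old check never triggered
    print(device.type == "cuda")         # True: the new check detects GPU placement

    # Patch 2: the Python server tokenizes with special tokens included (the
    # transformers default), so the router must count them too or the two
    # components disagree on input length. Hypothetical checkpoint; requires a
    # tokenizer download to run:
    # from transformers import AutoTokenizer
    # tok = AutoTokenizer.from_pretrained("bigscience/mt0-small")
    # with_special = tok("Hello world", add_special_tokens=True)["input_ids"]
    # without_special = tok("Hello world", add_special_tokens=False)["input_ids"]
    # assert len(with_special) == len(without_special) + 1  # the trailing EOS token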