From 686cc6671705c666b767fffe71b2ed9c9b6fccd1 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Fri, 30 Dec 2022 10:30:42 -0800
Subject: [PATCH 1/2] fix(server): Check for device type correctly when determining initial padding (#16)

AFAIK there is no torch device type called "gpu".
---
 server/text_generation/models/causal_lm.py  | 2 +-
 server/text_generation/models/seq2seq_lm.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation/models/causal_lm.py b/server/text_generation/models/causal_lm.py
index aeecf127..3ad36219 100644
--- a/server/text_generation/models/causal_lm.py
+++ b/server/text_generation/models/causal_lm.py
@@ -65,7 +65,7 @@ class CausalLMBatch:
             )
             all_logprobs.append(None)
 
-        pad_to_multiple_of = 8 if "gpu" in str(device) else None
+        pad_to_multiple_of = 8 if device.type == "cuda" else None
         tokenized_inputs = tokenizer(
             inputs,
             return_tensors="pt",
diff --git a/server/text_generation/models/seq2seq_lm.py b/server/text_generation/models/seq2seq_lm.py
index fc80c60c..4095db92 100644
--- a/server/text_generation/models/seq2seq_lm.py
+++ b/server/text_generation/models/seq2seq_lm.py
@@ -77,7 +77,7 @@ class Seq2SeqLMBatch:
             decoder_logprobs.append(None)
 
         # Tokenize batch
-        pad_to_multiple_of = 8 if "gpu" in str(device) else None
+        pad_to_multiple_of = 8 if device.type == "cuda" else None
         tokenized_inputs = tokenizer(
             inputs,
             return_tensors="pt",

From 3efa5bbbfd5868695da4d5d9ad23d81f48f1e5a8 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Fri, 30 Dec 2022 10:31:44 -0800
Subject: [PATCH 2/2] fix(router): Include special tokens when tokenizing (#14)

There's currently a discrepancy in the tokenization between the router
and Python server code. The latter includes special tokens but the
former does not.

This results in a token count mismatch for seq2seq models such as mt0,
where the tokenizer emits an EOS token at the end. This in turn results
in some unexpected/incorrect output, in particular when batch
concatenation is involved, because the Python code uses the input
length passed from the router for each row.

As far as I can tell, it is better to include this token in the encoder
`input_ids`, so I guess it's best to just adjust on the router side.
---
 router/src/validation.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/router/src/validation.rs b/router/src/validation.rs
index ff659b3e..f6da1913 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -131,7 +131,7 @@ fn validation_worker(
         }
 
         // Get the number of tokens in the input
-        match tokenizer.encode(request.inputs.clone(), false) {
+        match tokenizer.encode(request.inputs.clone(), true) {
             Ok(inputs) => {
                 let input_length = inputs.len();
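
Editor's note: below is a minimal, illustrative Python sketch (not part of either patch) of the behaviour the two fixes rely on. The torch portion runs as-is with PyTorch installed and needs no GPU; the commented-out tokenizer portion assumes the transformers library plus network access, and the checkpoint name "bigscience/mt0-small" is only an assumption for illustration.

    # Patch 1: torch device types are "cpu", "cuda", "mps", etc., never "gpu",
    # so the old substring check could never enable padding to a multiple of 8.
    import torch

    device = torch.device("cuda:0")      # constructing the device needs no GPU
    print("gpu" in str(device))          # False: the old check never triggered
    print(device.type == "cuda")         # True: the new check detects GPU placement

    # Patch 2: the Python server tokenizes with special tokens included (the
    # transformers default), so the router must count them too or the two
    # components disagree on input length. Hypothetical checkpoint; requires a
    # tokenizer download to run:
    # from transformers import AutoTokenizer
    # tok = AutoTokenizer.from_pretrained("bigscience/mt0-small")
    # with_special = tok("Hello world", add_special_tokens=True)["input_ids"]
    # without_special = tok("Hello world", add_special_tokens=False)["input_ids"]
    # assert len(with_special) == len(without_special) + 1  # the trailing EOS token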