Transparent tokenizer uses explicit int32 (#31) (#60)

Co-authored-by: Adam Stachowicz <105052242+astachowiczhabana@users.noreply.github.com>
This commit is contained in:
jkaniecki 2024-02-21 14:24:41 +01:00 committed by GitHub
parent a4d3a00d98
commit 6b6dec9ea1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -385,8 +385,8 @@ def make_tokenizer_optional(tokenizer):
return int(i)
all_tokens = [[str_token_to_int(i.strip()) for i in inner_text.split(',')]
for inner_text in text]
-return {"input_ids": torch.tensor([[tokenizer.pad_token_id] * (max_length-len(tokens)) + tokens for tokens in all_tokens]),
-"attention_mask": torch.tensor([[0] * (max_length-len(tokens)) + [1]*len(tokens) for tokens in all_tokens])}
+return {"input_ids": torch.tensor([[tokenizer.pad_token_id] * (max_length-len(tokens)) + tokens for tokens in all_tokens], dtype=torch.int32),
+"attention_mask": torch.tensor([[0] * (max_length-len(tokens)) + [1]*len(tokens) for tokens in all_tokens], dtype=torch.int32)}
def decode(
self,