diff --git a/server/Makefile-selective-scan b/server/Makefile-selective-scan
index d98d881c..b93b517d 100644
--- a/server/Makefile-selective-scan
+++ b/server/Makefile-selective-scan
@@ -26,6 +26,3 @@ install-selective-scan: install-causal-conv1d build-selective-scan
 	cd mamba && pip install .
 
 build-all: build-causal-conv1d build-selective-scan
-
-install-ssm: install-causal-conv1d install-selective-scan
-	@echo "Selective scan model installed"
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 966f1c7a..212ab7a9 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -609,16 +609,15 @@ class CausalLM(Model):
             truncation_side="left",
             trust_remote_code=trust_remote_code,
         )
-        device_map = (
-            "auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None
-        )
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map=device_map,
+            device_map=(
+                "auto"
+                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                else None
+            ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
@@ -629,11 +628,6 @@ class CausalLM(Model):
         ):
             model = model.cuda()
 
-        # if device_map is "auto", it's unclear which device the model is on
-        # therefore, we need to get the device the model is on after loading
-        if device_map is not None:
-            device = next(model.parameters()).device
-
         if tokenizer.pad_token_id is None:
             if model.config.pad_token_id is not None:
                 tokenizer.pad_token_id = model.config.pad_token_id
diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
index 3f086852..79c001b0 100644
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -639,28 +639,21 @@ class Seq2SeqLM(Model):
             device = torch.device("cpu")
             dtype = torch.float32 if dtype is None else dtype
 
-        device_map = (
-            "auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None
-        )
-
         model = AutoModelForSeq2SeqLM.from_pretrained(
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map=(
+            device_map=(
+                "auto"
+                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                else None
+            ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
         if torch.cuda.is_available() and torch.cuda.device_count() == 1:
             model = model.cuda()
 
-        # if device_map is "auto", it's unclear which device the model is on
-        # therefore, we need to get the device the model is on after loading
-        if device_map is not None:
-            device = next(model.parameters()).device
-
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
             revision=revision,
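
Both Python hunks converge on the same loading pattern: the multi-GPU check is passed inline as device_map rather than bound to a local variable, and the model is moved to CUDA manually only in the single-GPU case. A minimal standalone sketch of that pattern follows; load_model is a hypothetical helper for illustration, not part of the TGI codebase.

# Sketch of the loading pattern used in causal_lm.py and seq2seq_lm.py
# after this change. `load_model` is a hypothetical wrapper, not TGI's API.
from typing import Optional

import torch
from transformers import AutoModelForCausalLM


def load_model(model_id: str, dtype: torch.dtype, quantize: Optional[str] = None):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        # Shard across GPUs with "auto" only when more than one is available;
        # otherwise load normally and move to the single GPU below.
        device_map=(
            "auto"
            if torch.cuda.is_available() and torch.cuda.device_count() > 1
            else None
        ),
        load_in_8bit=quantize == "bitsandbytes",
    )
    if torch.cuda.is_available() and torch.cuda.device_count() == 1:
        model = model.cuda()
    return model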