diff --git a/server/Makefile b/server/Makefile
index a4ce6d8b..adea6b31 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -24,7 +24,6 @@ install: gen-server install-torch
 	pip install -e ".[bnb, accelerate]"
 
 run-dev:
-	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
-
+	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes
 export-requirements:
 	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index b12a9751..38e00d2d 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -141,10 +141,12 @@ def download_weights(
         if not extension == ".safetensors" or not auto_convert:
             raise e
 
+    logger.warning("attempting to load local model")
     # Try to see if there are local pytorch weights
     try:
         # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
         local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        print(local_pt_files)
 
     # No local pytorch weights
     except utils.LocalEntryNotFoundError:
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index cbdf4808..bffbfa64 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -3,7 +3,7 @@ import inspect
 
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
 from typing import Optional, Tuple, List, Type, Dict
 
 from text_generation_server.models import Model
@@ -481,11 +481,12 @@ class CausalLM(Model):
             device_map="auto"
             if torch.cuda.is_available() and torch.cuda.device_count() > 1
             else None,
-            load_in_8bit=quantize == "bitsandbytes",
+            load_in_4bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
-            model = model.cuda()
+        ## ValueError: Calling `cuda()` is not supported for `4-bit` or `8-bit` quantized models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.
+        # if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+        #     model = model.cuda()
 
         if tokenizer.pad_token_id is None:
             if model.config.pad_token_id is not None:
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 67137aaa..28c671ba 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -109,6 +109,7 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         generations, next_batch = self.model.generate_token(batch)
         self.cache.set(next_batch)
 
+        print(generations)
         return generate_pb2.DecodeResponse(
             generations=[generation.to_pb() for generation in generations],
             batch=next_batch.to_pb() if next_batch else None,
diff --git a/server/text_generation_server/utils/__init__.py b/server/text_generation_server/utils/__init__.py
index 08ba808d..5c2b11e8 100644
--- a/server/text_generation_server/utils/__init__.py
+++ b/server/text_generation_server/utils/__init__.py
@@ -1,7 +1,7 @@
 from text_generation_server.utils.convert import convert_file, convert_files
 from text_generation_server.utils.dist import initialize_torch_distributed
 from text_generation_server.utils.weights import Weights
-from text_generation_server.utils.peft import download_and_unload_peft
+from text_generation_server.utils.peft import download_and_unload_peft, load_local_peft
 from text_generation_server.utils.hub import (
     weight_files,
     weight_hub_files,
@@ -28,6 +28,7 @@ __all__ = [
     "weight_hub_files",
     "download_weights",
     "download_and_unload_peft",
+    "load_local_peft",
    "EntryNotFoundError",
     "HeterogeneousNextTokenChooser",
     "LocalEntryNotFoundError",
diff --git a/server/text_generation_server/utils/peft.py b/server/text_generation_server/utils/peft.py
index be1f9444..c4eabae7 100644
--- a/server/text_generation_server/utils/peft.py
+++ b/server/text_generation_server/utils/peft.py
@@ -6,6 +6,8 @@ import torch
 from transformers import AutoTokenizer
 from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
 
+def load_local_peft(model_id, revision, trust_remote_code):
+    return model_id
 
 def download_and_unload_peft(model_id, revision, trust_remote_code):
     torch_dtype = torch.float16
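
Note: the causal_lm.py hunk above flips CausalLM from 8-bit to 4-bit bitsandbytes loading and comments out the explicit `.cuda()` call, because transformers raises the quoted ValueError when `.cuda()` is called on an already-quantized model. For reference, a minimal standalone sketch of an equivalent 4-bit load, using the `BitsAndBytesConfig` API rather than the bare `load_in_4bit=` kwarg; the model path is taken from the Makefile change above, and the fp16 compute dtype is an assumption, not something this diff sets:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Local model path from the run-dev target above.
model_id = "/mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf"

# Equivalent of load_in_4bit=True, expressed via the explicit config object.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # assumption: fp16 compute dtype
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # bitsandbytes places weights; do NOT call model.cuda() afterwards
)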