diff --git a/backends/gaudi/server/text_generation_server/models/__init__.py b/backends/gaudi/server/text_generation_server/models/__init__.py
index dc2cb21c..502e4d8c 100644
--- a/backends/gaudi/server/text_generation_server/models/__init__.py
+++ b/backends/gaudi/server/text_generation_server/models/__init__.py
@@ -7,9 +7,8 @@ from transformers.models.auto import modeling_auto
 from huggingface_hub import hf_hub_download, HfApi
 from typing import Optional
 from pathlib import Path
-from typing import Optional, List, Dict
+from typing import List, Dict
 
 # Needed to properly setup habana_frameworks
-import text_generation_server.habana_quantization_env as hq_env
 from text_generation_server.utils.speculate import get_speculate, set_speculate
 from text_generation_server.models.model import Model
@@ -31,6 +30,7 @@ from text_generation_server.utils.adapter import (
     load_and_merge_adapters,
     AdapterInfo,
 )
+from text_generation_server.adapters.lora import LoraWeights
 
 from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
 
diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py
index 92f7a806..3844d89f 100644
--- a/backends/gaudi/server/text_generation_server/models/causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py
@@ -737,7 +737,7 @@ class CausalLM(Model):
         else:
             if LAZY_MODE == 0:
                 # It is said that "keep_input_mutations" is safe for inference to be done
-                dbg_trace("TORCH COMPILE", f"Torch compiling of model")
+                dbg_trace("TORCH COMPILE", "Torch compiling of model")
                 model.model = torch.compile(
                     model.model,
                     backend="hpu_backend",
@@ -932,7 +932,7 @@ class CausalLM(Model):
         if self.has_position_ids:
             kwargs["position_ids"] = position_ids
 
-        if bypass_hpu_graph != None:
+        if bypass_hpu_graph is not None:
             kwargs["bypass_hpu_graphs"] = bypass_hpu_graph
 
         kwargs.update(self.kwargs)
@@ -1303,7 +1303,7 @@ class CausalLM(Model):
         try:
             # max prefill batch size warmup
            _, prefill_batch, _ = self.generate_token([batch])
-        except:
+        except Exception:
            raise RuntimeError(
                 f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                 f"You need to decrease `--max-batch-prefill-tokens`"
@@ -1331,7 +1331,7 @@ class CausalLM(Model):
             for seq_len in prefill_seqlen_list:
                 batch = self.generate_warmup_batch(request, seq_len - 1, batch_size)
                 _, prefill_batch, _ = self.generate_token([batch])
-        except:
+        except Exception:
             prefill_batch_size_list.sort()
             prefill_seqlen_list.sort()
             raise RuntimeError(
@@ -1384,7 +1384,7 @@ class CausalLM(Model):
             del decode_batch
             batches.clear()
 
-        except:
+        except Exception:
             raise RuntimeError(
                 f"Not enough memory to warmup decode batch_sizes({decode_batch_size_list})."
                 f"You need to decrease `--max-batch-total-tokens`"
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py
index 1ef55019..6d6675ad 100644
--- a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -14,13 +14,11 @@
 # limitations under the License.
 """ PyTorch Llava-NeXT model."""
 
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional
 
 import torch
 import torch.utils.checkpoint
-from torch import nn
 
-from transformers.activations import ACT2FN
 from transformers.models.llava_next.modeling_llava_next import (
     unpad_image,
 )
diff --git a/backends/gaudi/server/text_generation_server/models/model.py b/backends/gaudi/server/text_generation_server/models/model.py
index 04172c74..c691f15d 100644
--- a/backends/gaudi/server/text_generation_server/models/model.py
+++ b/backends/gaudi/server/text_generation_server/models/model.py
@@ -1,5 +1,4 @@
 import inspect
-from loguru import logger
 
 import torch
 from abc import ABC, abstractmethod
@@ -13,7 +12,6 @@ from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.pb.generate_pb2 import InfoResponse
 from text_generation_server.adapters.weights import LayerAdapterWeights
 from text_generation_server.pb import generate_pb2
-import time
 
 
 BASE_MODEL_ADAPTER_ID = "__base_model__"
diff --git a/backends/gaudi/server/text_generation_server/models/starcoder.py b/backends/gaudi/server/text_generation_server/models/starcoder.py
index 98e7939a..bb13503c 100644
--- a/backends/gaudi/server/text_generation_server/models/starcoder.py
+++ b/backends/gaudi/server/text_generation_server/models/starcoder.py
@@ -1,7 +1,5 @@
-from loguru import logger
 import torch
 from dataclasses import dataclass
-import os
 from typing import List, Optional, Type
 
 from text_generation_server.models import CausalLM
diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
index 5c9955c2..c5d9eda5 100644
--- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
+++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py
@@ -5,8 +5,6 @@ import time
 import math
 from PIL import Image
 from io import BytesIO
-import base64
-import numpy
 from opentelemetry import trace
 from loguru import logger
 from typing import Iterable, Optional, Tuple, List, Type, Dict
@@ -15,7 +13,6 @@ import tempfile
 import copy
 from text_generation_server.models import Model
 from transformers import PreTrainedTokenizerBase
-from transformers.image_processing_utils import select_best_resolution
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.causal_lm import (
@@ -34,7 +31,6 @@ import text_generation_server.habana_quantization_env as hq_env
 from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
 from text_generation_server.utils import (
     HeterogeneousNextTokenChooser,
-    StoppingCriteria,
     make_tokenizer_optional,
     is_tokenizer_transparent,
     pad_next_token_chooser_parameters,
@@ -47,8 +43,6 @@ from optimum.habana.checkpoint_utils import get_ds_injection_policy
 
 from transformers import (
     AutoTokenizer,
-    AutoModel,
-    PreTrainedTokenizerBase,
     AutoConfig,
 )
 from optimum.habana.checkpoint_utils import (
@@ -59,7 +53,6 @@ from optimum.habana.checkpoint_utils import (
 
 from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.models.types import (
-    Batch,
     Tokens,
     Generation,
     GeneratedText,
@@ -116,7 +109,6 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
     elif config.model_type == "llava_next":
         height, width = image_input["image_sizes"][image_id]
         num_features = get_number_of_features(height, width, config)
-        from loguru import logger
         return "<image>" * num_features
 
     elif config.model_type == "paligemma":
@@ -604,7 +596,7 @@ class VlmCausalLM(Model):
             if LAZY_MODE == 0:
                 # It is said that "keep_input_mutations" is safe for inference to be done
                 dbg_trace(
-                    "TORCH COMPILE", f'Torch compiling of model')
+                    "TORCH COMPILE", 'Torch compiling of model')
                 model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True})
 
             model = hq_env.setup_quantization(model)
@@ -790,7 +782,7 @@ class VlmCausalLM(Model):
         if self.has_position_ids:
             kwargs["position_ids"] = position_ids
 
-        if bypass_hpu_graph != None:
+        if bypass_hpu_graph is not None:
             hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph
 
         kwargs.update(self.kwargs)
@@ -1118,7 +1110,7 @@ class VlmCausalLM(Model):
         try:
             # max prefill batch size warmup
             _, prefill_batch, _ = self.generate_token([batch], is_warmup)
-        except:
+        except Exception:
             raise RuntimeError(
                 f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                 f"You need to decrease `--max-batch-prefill-tokens`"
@@ -1158,7 +1150,7 @@ class VlmCausalLM(Model):
 
                 DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size)
 
-        except:
+        except Exception:
             raise RuntimeError(
                 f"Not enough memory to handle following prefill and decode warmup."
                 f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}"
@@ -1209,7 +1201,7 @@ class VlmCausalLM(Model):
                 DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size)
                 max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS
                 MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens
-            except :
+            except Exception:
                 raise RuntimeError(
                     f"Not enough memory to handle batch_size({batch_size}) decode warmup."
                     f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}"
diff --git a/backends/gaudi/server/text_generation_server/server.py b/backends/gaudi/server/text_generation_server/server.py
index 7e15c0cf..61b0f27f 100644
--- a/backends/gaudi/server/text_generation_server/server.py
+++ b/backends/gaudi/server/text_generation_server/server.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import os
-import sys
 import torch
 import time
 import signal
diff --git a/backends/gaudi/server/text_generation_server/tgi_service.py b/backends/gaudi/server/text_generation_server/tgi_service.py
index f0f13126..714b3566 100644
--- a/backends/gaudi/server/text_generation_server/tgi_service.py
+++ b/backends/gaudi/server/text_generation_server/tgi_service.py
@@ -1,10 +1,8 @@
 import os
 from pathlib import Path
 from loguru import logger
-import sys
 from text_generation_server import server
 import argparse
-from typing import List
 from text_generation_server.utils.adapter import parse_lora_adapters
 
 
diff --git a/backends/gaudi/server/text_generation_server/utils/__init__.py b/backends/gaudi/server/text_generation_server/utils/__init__.py
index 565a7c3c..ead0e1f2 100644
--- a/backends/gaudi/server/text_generation_server/utils/__init__.py
+++ b/backends/gaudi/server/text_generation_server/utils/__init__.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
 
-import text_generation_server.habana_quantization_env
 from text_generation_server.utils.convert import convert_file, convert_files
 from text_generation_server.utils.dist import initialize_torch_distributed
 from text_generation_server.utils.weights import Weights
@@ -21,9 +20,6 @@ from text_generation_server.utils.tokens import (
     FinishReason,
     Sampling,
     Greedy,
-    make_tokenizer_optional,
-    is_tokenizer_transparent,
-    pad_next_token_chooser_parameters,
 )
 
 __all__ = [
diff --git a/backends/gaudi/server/text_generation_server/utils/dist.py b/backends/gaudi/server/text_generation_server/utils/dist.py
index d370a3d5..cf8acacc 100644
--- a/backends/gaudi/server/text_generation_server/utils/dist.py
+++ b/backends/gaudi/server/text_generation_server/utils/dist.py
@@ -44,9 +44,7 @@ class FakeGroup:
 
 
 def initialize_torch_distributed():
-    import habana_frameworks.torch.core as htcore
-
     rank = int(os.getenv("RANK", "0"))
     world_size = int(os.getenv("WORLD_SIZE", "1"))
 
     options = None
@@ -69,7 +67,7 @@ def initialize_torch_distributed():
         raise ValueError(f"WORLD_SIZE ({world_size}) is higher than the number of available HPUs ({n_hpus}).")
     else:
         try:
-            import oneccl_bindings_for_pytorch
+            import oneccl_bindings_for_pytorch  # noqa: F401
 
             backend = "ccl"
             if os.getenv("CCL_WORKER_COUNT", None) is None:
diff --git a/backends/gaudi/server/text_generation_server/utils/tokens.py b/backends/gaudi/server/text_generation_server/utils/tokens.py
index aa4d1fdb..56f5b8c7 100644
--- a/backends/gaudi/server/text_generation_server/utils/tokens.py
+++ b/backends/gaudi/server/text_generation_server/utils/tokens.py
@@ -705,8 +705,8 @@ def make_tokenizer_optional(tokenizer):
         ):
             assert return_tensors == "pt", "inccorrect input arguments when calling TransparentTokenizer"
             assert padding == "max_length" or padding == "longest", "inccorrect input arguments when calling TransparentTokenizer"
-            assert return_token_type_ids == False, "inccorrect input arguments when calling TransparentTokenizer"
-            assert truncation == True, "inccorrect input arguments when calling TransparentTokenizer"
+            assert not return_token_type_ids, "inccorrect input arguments when calling TransparentTokenizer"
+            assert truncation, "inccorrect input arguments when calling TransparentTokenizer"
 
             def str_token_to_int(i):
                 if i == '?':
@@ -727,7 +727,8 @@ def make_tokenizer_optional(tokenizer):
             clean_up_tokenization_spaces: bool = None,
             **kwargs,
         ) -> str:
-            return ','.join(str(i) for i in to_py_obj(token_ids))
+            # I don't think this method is used anywhere and should be removed when doing refactoring
+            return ','.join(str(i) for i in to_py_obj(token_ids))  # noqa: F821
 
     import os
     if os.getenv("SKIP_TOKENIZER_IN_TGI", "false").lower() == "true":
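
Note on the exception-handling changes: the warmup paths in causal_lm.py and vlm_causal_lm.py now catch `Exception` instead of using a bare `except:`, so KeyboardInterrupt and SystemExit still propagate if the server is stopped mid-warmup. A minimal sketch of the pattern, with placeholder names (warmup_prefill, max_prefill_tokens) that are not the actual TGI symbols:

    def warmup_prefill(generate_token, batch, max_prefill_tokens: int):
        try:
            # A failure at this point is almost always an out-of-memory error
            # while warming up the prefill shape.
            generate_token([batch])
        except Exception as e:
            # Narrowing to Exception keeps Ctrl-C working during warmup; chaining
            # with `from e` would additionally preserve the original traceback,
            # which the current hunks still drop.
            raise RuntimeError(
                f"Not enough memory to handle {max_prefill_tokens} prefill tokens. "
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e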
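Note on the `# noqa: F401` added in utils/dist.py: oneccl_bindings_for_pytorch is imported purely for its side effect of registering the "ccl" backend with torch.distributed, so the name itself is never referenced and linters would otherwise flag it as an unused import. A sketch of the pattern; the helper name and the gloo fallback are illustrative and not part of this patch:

    def select_distributed_backend() -> str:
        try:
            # Importing the bindings registers the "ccl" backend; the module name
            # is intentionally unused, hence the noqa marker in dist.py.
            import oneccl_bindings_for_pytorch  # noqa: F401
        except ImportError:
            # Hypothetical fallback for environments without the oneCCL bindings.
            return "gloo"
        return "ccl"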
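Note on the `# noqa: F821` added in utils/tokens.py: the undefined name is `to_py_obj`, which is never imported in that module. If the decode() stub is kept rather than removed as the new comment suggests, importing the helper would resolve the warning instead of suppressing it. A sketch, assuming `to_py_obj` is the transformers utility of the same name:

    from transformers.utils import to_py_obj

    def decode_ids_as_csv(token_ids) -> str:
        # Same behaviour as the TransparentTokenizer.decode() stub:
        # render the token ids as a comma-separated string.
        return ",".join(str(i) for i in to_py_obj(token_ids))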