diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 3e1fa2d3..1292c1b3 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -26,8 +26,6 @@ from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
-from loguru import logger
-
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.layers import (
     TensorParallelRowLinear,
@@ -44,7 +42,6 @@ if IS_CUDA_SYSTEM:
 elif IS_ROCM_SYSTEM:
     from vllm import layernorm_ops
 
-torch.set_printoptions(threshold=10000000, sci_mode=True)
 
 class LlamaConfig(PretrainedConfig):
     def __init__(
diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
index 18c0cc99..b0f7394c 100644
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
@@ -70,7 +70,6 @@ def attention(
     softmax_scale,
     window_size_left=-1,
 ):
-    # logger.info(f"HAS_FLASH_ATTN_V2 {HAS_FLASH_ATTN_V2}")
     if HAS_FLASH_ATTN_V2_CUDA:
         return flash_attn_2_cuda.varlen_fwd(
             q,
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 2521f0a9..23e313ef 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -516,7 +516,6 @@ try:
     class FastLayerNorm(nn.LayerNorm):
         def forward(self, hidden_states, residual=None):
             if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
-                # Mistral does not use RMSNorm.
                 if residual is not None:
                     hidden_states += residual
                 residual = hidden_states