Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-11 20:34:54 +00:00)
fix various merge errors
Parent: c683597b42
Commit: b7e98ba635
@@ -41,9 +41,9 @@ class FastLinearROCm(torch.nn.Module):
         bias,
     ) -> None:
         super().__init__()
-        self.weight = nn.Parameter(weight)
+        self.weight = torch.nn.Parameter(weight)
         if bias is not None:
-            self.bias = nn.Parameter(bias)
+            self.bias = torch.nn.Parameter(bias)
         else:
             self.bias = None

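The hunk above restores the fully qualified `torch.nn.Parameter`: this module imports `torch` but not `torch.nn as nn`, so the bare `nn.Parameter` left by the merge would raise a `NameError`. A minimal, self-contained sketch of the same registration pattern (the class name and shapes below are made up for illustration and are not the real `FastLinearROCm`):

```python
# Illustrative only: registers weight/bias via torch.nn.Parameter, the pattern
# the hunk above restores when `nn` itself is not imported.
from typing import Optional

import torch


class ToyLinear(torch.nn.Module):
    def __init__(self, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> None:
        super().__init__()
        # torch.nn.Parameter registers the tensors with the module, so they
        # show up in .parameters() and .state_dict().
        self.weight = torch.nn.Parameter(weight)
        self.bias = torch.nn.Parameter(bias) if bias is not None else None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.linear(x, self.weight, self.bias)


if __name__ == "__main__":
    layer = ToyLinear(torch.randn(8, 4), torch.zeros(8))
    print(layer(torch.randn(2, 4)).shape)  # torch.Size([2, 8])
```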
@@ -48,6 +48,7 @@ if SYSTEM == "rocm":


 def load_attention(config, prefix, weights):
+    bias = config.attention_bias
     if config.num_attention_heads != config.num_key_value_heads:
         return TensorParallelColumnLinear.load_multi(
             config,
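This hunk re-adds the line that reads the bias flag from the model config inside `load_attention`, so whether the attention projections carry a bias follows `config.attention_bias` rather than a value lost in the merge. A hypothetical, self-contained sketch of that config-driven lookup (`DummyConfig` and `resolve_attention_bias` are illustrative names, not TGI code):

```python
# Illustrative only: shows the config-driven bias lookup the hunk reinstates.
from dataclasses import dataclass


@dataclass
class DummyConfig:
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    attention_bias: bool = False


def resolve_attention_bias(config) -> bool:
    # getattr with a default keeps older configs that lack the field working.
    return bool(getattr(config, "attention_bias", False))


if __name__ == "__main__":
    print(resolve_attention_bias(DummyConfig(attention_bias=True)))  # True
    print(resolve_attention_bias(object()))                          # False
```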
@@ -47,7 +47,7 @@ from text_generation_server.models.custom_modeling.idefics_vision import (
 from text_generation_server.models.custom_modeling.idefics_perceiver import (
     IdeficsPerceiverResampler,
 )
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
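The only change here is the import path: the tensor-parallel layer helpers now come from `text_generation_server.layers` rather than the older `text_generation_server.utils.layers` location. If external code needed to survive both layouts during the transition, a shim like the sketch below is one option (the shim is illustrative and not part of this commit):

```python
# Illustrative compatibility shim, not part of the commit: try the new module
# path first and fall back to the pre-refactor location.
try:
    from text_generation_server.layers import (
        TensorParallelColumnLinear,
        TensorParallelEmbedding,
        TensorParallelRowLinear,
    )
except ImportError:
    from text_generation_server.utils.layers import (
        TensorParallelColumnLinear,
        TensorParallelEmbedding,
        TensorParallelRowLinear,
    )
```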
@@ -1150,8 +1150,6 @@ class FlashCausalLM(Model):
             next_token_texts = []
             left = 0

-            logger.info(f"Accepted ids {n_accepted_ids}")
-
             current_stopped = False
             for j in range(index, index + n_accepted_ids):
                 # Generated token
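This hunk drops a leftover per-token `logger.info` call from the decoding loop in `FlashCausalLM`, where it would fire for every accepted token. If such a diagnostic is ever wanted again, loguru can defer both formatting and evaluation so the hot path pays almost nothing while DEBUG is disabled; a small sketch (the `n_accepted_ids` value is a placeholder):

```python
# Illustrative only: lazy loguru logging, cheap in a hot loop when DEBUG is off.
from loguru import logger

n_accepted_ids = 3  # placeholder value

# The lambda is evaluated only if a sink actually accepts DEBUG records.
logger.opt(lazy=True).debug("Accepted ids {}", lambda: n_accepted_ids)
```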
@@ -5,12 +5,15 @@ from loguru import logger
 import math

 from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.utils.flash_attn_triton import triton_attention

 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
 HAS_FLASH_ATTN = False
 HAS_FLASH_ATTN_V2_CUDA = False
 HAS_FLASH_ATTN_V2_ROCM = False
+ROCM_USE_FLASH_ATTN_V2_CK = False
+ROCM_USE_FLASH_ATTN_V2_TRITON = False

 if SYSTEM == "xpu":
     import intel_extension_for_pytorch as ipex
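The last hunk restores the Triton flash-attention import and the two ROCm backend flags the merge had dropped; both flags default to False and are set later in the module depending on the detected environment. A simplified, self-contained sketch of that module-level flag pattern (the environment-variable selection below is a stand-in for the module's real detection logic, not a copy of it):

```python
# Illustrative only: capability flags with an env-var override, mirroring the
# names added in the hunk above.
import os

HAS_FLASH_ATTN = False
HAS_FLASH_ATTN_V2_CUDA = False
HAS_FLASH_ATTN_V2_ROCM = False
ROCM_USE_FLASH_ATTN_V2_CK = False
ROCM_USE_FLASH_ATTN_V2_TRITON = False

# Stand-in selection: prefer the Triton kernel when explicitly requested,
# otherwise fall back to the composable-kernel (CK) path.
if os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in ("1", "true"):
    ROCM_USE_FLASH_ATTN_V2_TRITON = True
else:
    ROCM_USE_FLASH_ATTN_V2_CK = True

if __name__ == "__main__":
    print("triton:", ROCM_USE_FLASH_ATTN_V2_TRITON, "ck:", ROCM_USE_FLASH_ATTN_V2_CK)
```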