diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 5e60b9b0..62fa413f 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -18,7 +18,6 @@ vllm-rocm:

 build-vllm-rocm: vllm-rocm
 	cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
-	cd vllm && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch
 	cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install

 install-vllm-rocm: build-vllm-rocm
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 0a9d57e6..9a21d043 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -244,7 +244,7 @@ class LlamaMLP(nn.Module):
         )

     def forward(self, hidden_states):
-        if False and IS_ROCM_SYSTEM and self.hidden_act == "silu" and hidden_states.shape[0] == 1:
+        if IS_ROCM_SYSTEM and self.hidden_act == "silu" and hidden_states.shape[0] == 1:
             out = torch.empty(
                 hidden_states.shape[0],
                 self.intermediate_size,
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 330b7408..6635be56 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -366,7 +366,7 @@ class FastLinearROCm(nn.Module):
         weight = self.weight
         bias = self.bias

-        if False and IS_ROCM_SYSTEM and inp.numel() // inp.size(-1) == 1:
+        if IS_ROCM_SYSTEM and inp.numel() // inp.size(-1) == 1:
             batched = False

             if inp.dim() == 3:
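
As context for the gating change (not part of the patch itself): the two `if False and ...` removals re-enable ROCm-only fast paths that fire only when the flattened batch dimension is 1, i.e. a single decode token. Below is a minimal, illustrative sketch of that condition; the helper name and tensor shapes are assumptions for demonstration, not code from the repository.

```python
import torch

# Illustrative only: mirrors the guard `inp.numel() // inp.size(-1) == 1`
# that the patch re-enables on ROCm systems (IS_ROCM_SYSTEM).
def takes_rocm_fast_path(inp: torch.Tensor, is_rocm_system: bool = True) -> bool:
    # numel() // size(-1) is the product of all leading (batch/sequence) dims.
    return is_rocm_system and inp.numel() // inp.size(-1) == 1

decode_input = torch.randn(1, 4096)        # single token  -> fast path taken
prefill_input = torch.randn(1, 128, 4096)  # 128 tokens    -> fast path skipped
print(takes_rocm_fast_path(decode_input))   # True
print(takes_rocm_fast_path(prefill_input))  # False
```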