From 0d3cc033adeed2248437daf8ce8d597fd297492d Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Thu, 6 Jun 2024 07:51:33 +0000
Subject: [PATCH] update vllm commit & fix models using sliding window

---
 server/Makefile-vllm                             | 4 ++--
 server/text_generation_server/models/__init__.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index ded2f5d2..2464e348 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,5 +1,5 @@
 commit_cuda := b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
-commit_rocm := ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
+commit_rocm := e5d1a20b3a8cb744ce562723f5e1dbe907ee18cc
 build-vllm-cuda:
 	if [ ! -d 'vllm' ]; then \
 		pip install -U ninja packaging --no-cache-dir && \
@@ -19,5 +19,5 @@ build-vllm-rocm:
 	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
 
 install-vllm-rocm: build-vllm-rocm
-	cd vllm && git fetch && git checkout $(commit_rocm) && \
+	cd vllm && git fetch && git checkout $(commit_rocm) && \
 	PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index dbe49039..2e280ff4 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -412,9 +412,9 @@ def get_model(
     sliding_window = config_dict.get("sliding_window", -1)
     if sliding_window != -1 and not SUPPORTS_WINDOWING:
         logger.warning(
-            f"Flash attention is available, but doesn't support windowing which is required by model {model_id}"
+            f"Flash attention is available, but doesn't support windowing which is required by model {model_id} for best performance."
         )
-        FLASH_ATTENTION = False
+        # FLASH_ATTENTION = False
 
     if model_type == MAMBA:
         return Mamba(