diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index ded2f5d2..2464e348 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,5 +1,5 @@
 commit_cuda := b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
-commit_rocm := ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
+commit_rocm := e5d1a20b3a8cb744ce562723f5e1dbe907ee18cc
 build-vllm-cuda:
 	if [ ! -d 'vllm' ]; then \
 		pip install -U ninja packaging --no-cache-dir && \
@@ -19,5 +19,5 @@ build-vllm-rocm:
 	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
 
 install-vllm-rocm: build-vllm-rocm
-	cd vllm && git fetch && git checkout $(commit_rocm) && \
+	cd vllm && git fetch && git checkout $(commit_rocm) && \
 	PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index dbe49039..2e280ff4 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -412,9 +412,9 @@ def get_model(
     sliding_window = config_dict.get("sliding_window", -1)
     if sliding_window != -1 and not SUPPORTS_WINDOWING:
         logger.warning(
-            f"Flash attention is available, but doesn't support windowing which is required by model {model_id}"
+            f"Flash attention is available, but doesn't support windowing which is required by model {model_id} for best performance."
         )
-        FLASH_ATTENTION = False
+        # FLASH_ATTENTION = False
 
     if model_type == MAMBA:
         return Mamba(