From 0c90550e9db8b23ea41b01ec74e6248e151f7abf Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 12 Aug 2024 16:23:18 +0200
Subject: [PATCH] Fixing prefix attention.

---
 .../models/custom_modeling/flash_llama_modeling.py | 1 -
 server/text_generation_server/models/globals.py    | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 91a835f3..3253d2dc 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -21,7 +21,6 @@
 from contextlib import contextmanager
 from typing import List, Optional, Tuple
 
-from loguru import logger
 import torch
 import torch.distributed
 
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index cea5d698..fbff1cec 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -6,8 +6,7 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master
 
 PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", False)
-log_master(logger.info, f"Using Attention = {PREFIX_CACHING}")
-
+log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
 ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
 _expected = {"paged", "flashdecoding", "flashinfer"}
 assert (
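
Note: the second hunk touches the attention-backend selection in globals.py. Below is a minimal standalone sketch of that selection logic as it reads after the patch; it is illustrative only. The real module logs through log_master/loguru rather than print, and the final assertion message is paraphrased here because the hunk context is truncated at "assert (".

import os

# Sketch of the env-driven selection that the globals.py hunk adjusts.
# USE_PREFIX_CACHING: any non-empty value is truthy, enabling prefix caching
# and making "flashinfer" the default attention backend; otherwise "paged".
PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", False)
print(f"Using prefix caching = {PREFIX_CACHING}")  # stand-in for log_master(logger.info, ...)

ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
_expected = {"paged", "flashdecoding", "flashinfer"}
# Paraphrased guard; the original assertion body is not visible in the hunk.
assert ATTENTION in _expected, f"Unexpected ATTENTION {ATTENTION!r}, expected one of {_expected}"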