Fixing prefix attention.

2025-09-12 04:44:52 +00:00 · 2024-08-12 16:23:18 +02:00 · 2024-08-12 16:23:18 +02:00 · 0c90550e9d
commit 0c90550e9d
parent 44a77dcb9e
2 changed files with 1 addition and 3 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -21,7 +21,6 @@
 from contextlib import contextmanager
 from typing import List, Optional, Tuple

-from loguru import logger
 import torch
 import torch.distributed

--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -6,8 +6,7 @@ from typing import Dict, Optional
 from text_generation_server.utils.log import log_master

 PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING", False)
-log_master(logger.info, f"Using Attention = {PREFIX_CACHING}")
-
+log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
 ATTENTION = os.getenv("ATTENTION", "flashinfer" if PREFIX_CACHING else "paged")
 _expected = {"paged", "flashdecoding", "flashinfer"}
 assert (