Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-21 14:52:20 +00:00
* Using flash decoding

  - Conditional flashdecoding.
  - Fix max_q.
  - Working kvcache.
  - Working version with flash decoding.
  - Make it work for mistral.
  - Fix after rebase.
  - Less intrusive.
  - Revert changes in modeling.
  - Speedup flashdecoding.
  - Hack to make other models work.
  - Fixing non flash decoding llama path.
  - Router logic knows about page size.
  - Missing 2 models.
  - Missing cohere.
  - Fixing cohere flash decoding.
  - Revamped all this architecture.
  - Fix cohere.
  - Fixing falcon.
  - Enabling custom block size schedule.
  - Update router/src/infer.rs
  - Not sending preallocated output.

* Making it work on non flash decoding.
* Fix Cohere.
* Fix non decoding paths.
* Rebased.
* No need for cache_manager anymore.
* Update?
* "ipex" -> "cpu"
* These do not belong.
* Factoring cu_seqlen_qk for better abstracting over every model.
* Fixing non flash tests/imports.
* Changing return everywhere.
* Update mistral past.
* Fixing Mi{s,x}tral (non functional in Flash Decoding mode though).
* Fixup mistral clamping (had issues with cuda graphs).
* No need to recreate anything actually.
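The toggles described in this change are read from environment variables by the globals module shown below, at import time. The following is a minimal sketch of how those variables might be set and inspected; the import path is assumed from the repository layout (server/text_generation_server/models/globals.py) and the actual launcher integration is not shown.

import os

# Hedged sketch: these variables must be set before the globals module is
# first imported, since they are read at import time.
os.environ["FLASH_DECODING"] = "1"      # switches BLOCK_SIZE from 16 to 256
os.environ["CUDA_GRAPHS"] = "1,2,4,8"   # batch sizes to capture CUDA graphs for

# Assumed import path for the file listed below.
from text_generation_server.models import globals as tgi_globals

print(tgi_globals.FLASH_DECODING)  # True
print(tgi_globals.BLOCK_SIZE)      # 256 when flash decoding is enabled, otherwise 16
print(tgi_globals.CUDA_GRAPHS)     # [8, 4, 2, 1] -- sorted in descending order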
56 lines | 1.4 KiB | Python
import torch
import os

from loguru import logger
from typing import Dict

MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
# This is overridden by the cli
FLASH_DECODING = os.getenv("FLASH_DECODING") in {"1", "true", "True"}
BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
if FLASH_DECODING:
    logger.info("Using FLASH_DECODING")


cuda_graphs = os.getenv("CUDA_GRAPHS")
if cuda_graphs is not None:
    try:
        cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
    except Exception as e:
        raise RuntimeError(
            f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}"
        )
else:
    cuda_graphs = None
# sorting the cuda graphs in descending order helps reduce the
# memory impact and results in less memory usage
if cuda_graphs is not None:
    cuda_graphs.sort(reverse=True)

CUDA_GRAPHS = cuda_graphs

# This is overridden at model loading.
global MODEL_ID
MODEL_ID = None


def set_model_id(model_id: str):
    global MODEL_ID
    MODEL_ID = model_id


# NOTE: eventually we should move this into the router and pass back the
# index in all cases.
global ADAPTER_TO_INDEX
ADAPTER_TO_INDEX: Dict[str, int] = None


def set_adapter_to_index(adapter_to_index: Dict[str, int]):
    global ADAPTER_TO_INDEX
    ADAPTER_TO_INDEX = adapter_to_index


def get_adapter_to_index():
    global ADAPTER_TO_INDEX
    return ADAPTER_TO_INDEX
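For reference, a small hedged sketch of how the setters defined above might be called during server start-up; the real call sites are not shown here, the import path is assumed as before, and the model id and adapter mapping are purely illustrative.

# Hedged usage sketch: values are illustrative, not taken from the real server.
from text_generation_server.models import globals as tgi_globals  # assumed path

tgi_globals.set_model_id("mistralai/Mistral-7B-Instruct-v0.2")   # illustrative model id
tgi_globals.set_adapter_to_index({"my-lora-adapter": 1})          # illustrative adapter mapping

adapter_to_index = tgi_globals.get_adapter_to_index()
assert adapter_to_index["my-lora-adapter"] == 1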