mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
double free
This commit is contained in:
parent
3c4243d627
commit
b1831d5f97
@ -43,8 +43,8 @@ to power LLMs api-inference widgets.
|
||||
- Tensor Parallelism for faster inference on multiple GPUs
|
||||
- Token streaming using Server-Sent Events (SSE)
|
||||
- [Continuous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput
|
||||
- Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) on the most popular architectures
|
||||
- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
|
||||
- Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
|
||||
- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323)
|
||||
- [Safetensors](https://github.com/huggingface/safetensors) weight loading
|
||||
- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
|
||||
- Logits warper (temperature scaling, top-p, top-k, repetition penalty; for more details, see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
|
||||
|
@ -481,6 +481,8 @@ class FlashCausalLMBatch(Batch):
|
||||
block_indices_to_free.extend(self.block_tables[i])
|
||||
# Free blocks
|
||||
CACHE_MANAGER.free(block_indices_to_free)
|
||||
# Needed so that __del__ does not free these blocks a second time when the batch goes out of scope
|
||||
self.block_tables = None
|
||||
|
||||
# Index into tensors
|
||||
input_ids = self.input_ids[indices]
|
||||
@ -675,7 +677,7 @@ class FlashCausalLMBatch(Batch):
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
if self.block_tables is not None:
|
||||
if self.block_tables is not None and self.block_tables:
|
||||
global CACHE_MANAGER
|
||||
# Free blocks
|
||||
CACHE_MANAGER.free(list(itertools.chain.from_iterable(self.block_tables)))
|
||||
|
Loading…
Reference in New Issue
Block a user