mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
Merge branch 'main' into paged-attention-docs
This commit is contained in:
commit
2faf396128
@ -23,6 +23,8 @@
|
|||||||
title: Streaming
|
title: Streaming
|
||||||
- local: conceptual/paged_attention
|
- local: conceptual/paged_attention
|
||||||
title: PagedAttention
|
title: PagedAttention
|
||||||
|
- local: conceptual/safetensors
|
||||||
|
title: Safetensors
|
||||||
- local: conceptual/flash_attention
|
- local: conceptual/flash_attention
|
||||||
title: Flash Attention
|
title: Flash Attention
|
||||||
title: Conceptual Guides
|
title: Conceptual Guides
|
||||||
|
7
docs/source/conceptual/safetensors.md
Normal file
7
docs/source/conceptual/safetensors.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Safetensors
|
||||||
|
|
||||||
|
Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer than other serialization formats, such as pickle (which is used under the hood in many deep learning libraries).
|
||||||
|
|
||||||
|
TGI depends on the safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). When serving a given model repository, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to the safetensors format.
|
||||||
|
|
||||||
|
You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).
|
@ -69,10 +69,11 @@ def create_exllama_buffers():
|
|||||||
TEMP_STATE, TEMP_DQ = temp_state, temp_dq
|
TEMP_STATE, TEMP_DQ = temp_state, temp_dq
|
||||||
|
|
||||||
|
|
||||||
class Ex4bitLinear:
|
class Ex4bitLinear(torch.nn.Module):
|
||||||
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
|
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
|
||||||
|
|
||||||
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
|
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
|
||||||
|
super().__init__()
|
||||||
global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
|
global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
|
||||||
assert bits == 4
|
assert bits == 4
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user