mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 11:54:52 +00:00
fmt
This commit is contained in:
parent
a6057c4076
commit
74c87f5888
@ -146,8 +146,10 @@ def serve(
|
||||
# For which we have the final shapes only after the model has loaded
|
||||
# This will allocate those buffers.
|
||||
from text_generation_server.utils.gptq.exllama import (
|
||||
create_exllama_buffers, set_device
|
||||
create_exllama_buffers,
|
||||
set_device,
|
||||
)
|
||||
|
||||
set_device(model.device)
|
||||
create_exllama_buffers()
|
||||
except ImportError:
|
||||
|
@ -12,7 +12,6 @@ def ext_make_q4(qweight, qzeros, scales, g_idx, device):
|
||||
)
|
||||
|
||||
|
||||
|
||||
def ext_q4_matmul(x, q4, q4_width):
|
||||
"""Matrix multiplication, returns x @ q4"""
|
||||
outshape = x.shape[:-1] + (q4_width,)
|
||||
|
Loading…
Reference in New Issue
Block a user