mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
Allow dtype for bitsandbytes (it works, checked for idefics
9b/llama/80b)
This commit is contained in:
parent
2713b21132
commit
e3c31c9d92
@ -210,6 +210,7 @@ def launcher(event_loop):
|
||||
quantize: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
use_flash_attention: bool = True,
|
||||
dtype: Optional[str] = None
|
||||
):
|
||||
port = random.randint(8000, 10_000)
|
||||
master_port = random.randint(10_000, 20_000)
|
||||
@ -237,6 +238,9 @@ def launcher(event_loop):
|
||||
if quantize is not None:
|
||||
args.append("--quantize")
|
||||
args.append(quantize)
|
||||
if dtype is not None:
|
||||
args.append("--dtype")
|
||||
args.append(dtype)
|
||||
if trust_remote_code:
|
||||
args.append("--trust-remote-code")
|
||||
|
||||
|
@ -3,7 +3,7 @@ import pytest
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def idefics_handle(launcher):
|
||||
with launcher("HuggingFaceM4/idefics-9b-instruct", num_shard=2) as handle:
|
||||
with launcher("HuggingFaceM4/idefics-9b-instruct", num_shard=2, dtype="float16") as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
|
@ -76,7 +76,7 @@ def serve(
|
||||
# Downgrade enum into str for easier management later on
|
||||
quantize = None if quantize is None else quantize.value
|
||||
dtype = None if dtype is None else dtype.value
|
||||
if dtype is not None and quantize is not None:
|
||||
if dtype is not None and quantize not in {None, "bitsandbytes", "bitsandbytes-nf4", "bitsandbytes-fp4"}:
|
||||
raise RuntimeError(
|
||||
"Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user