fix(server): fix exllama buffers (#689)

Close #683
2025-09-08 19:04:52 +00:00 · 2023-07-24 14:25:43 +02:00 · 2023-07-24 14:25:43 +02:00 · 37df6df38e
commit 37df6df38e
parent 73a4d65d26
2 changed files with 22 additions and 13 deletions
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -105,21 +105,21 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
 def serve(
    model_id: str,
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    dtype: Optional[str],
    trust_remote_code: bool,
    uds_path: Path,
 ):
    async def serve_inner(
        model_id: str,
        revision: Optional[str],
-        sharded: bool = False,
+        sharded: bool,
-        quantize: Optional[str] = None,
+        quantize: Optional[str],
-        dtype: Optional[str] = None,
+        dtype: Optional[str],
-        trust_remote_code: bool = False,
+        trust_remote_code: bool,
        uds_path: Path,
 ):
    async def serve_inner(
            model_id: str,
            revision: Optional[str],
            sharded: bool = False,
            quantize: Optional[str] = None,
            dtype: Optional[str] = None,
            trust_remote_code: bool = False,
    ):
        unix_socket_template = "unix://{}-{}"
        if sharded:
@@ -147,8 +147,10 @@ def serve(
                # This will allocate those buffers.
                from text_generation_server.utils.gptq.exllama import (
                    create_exllama_buffers,
                    set_device,
                )
                set_device(model.device)
                create_exllama_buffers()
            except ImportError:
                pass
--- a/server/text_generation_server/utils/gptq/exllama.py
+++ b/server/text_generation_server/utils/gptq/exllama.py
@@ -32,9 +32,16 @@ TEMP_STATE = None
 TEMP_DQ = None
 def set_device(device):
    global DEVICE
    DEVICE = device
 def create_exllama_buffers():
    global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ
    assert DEVICE is not None, "call set_device first"
    if ACT_ORDER:
        # TODO: this should be set to rust side `max_total_tokens`, but TGI
        # does not offer an API to expose this variable to python, as this variable