add parallelization

OlivierDehaene 2023-05-16 21:14:29 +02:00
parent 8ddbdea45b
commit f08a1a50b7
16 changed files with 173 additions and 193 deletions

View File

@@ -213,12 +213,13 @@ jobs:
 sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
 - name: Install
 run: |
+pip install pytest-xdist
 make install-integration-tests
 - name: Run tests
 run: |
 export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
 export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-pytest -s -vv integration-tests
+pytest -s -vv -n 2 --dist loadfile integration-tests
 stop-runner:
 name: Stop self-hosted EC2 runner
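
Note: `-n 2 --dist loadfile` runs the integration suite on two pytest-xdist workers while keeping every test from a given file on the same worker, so each per-file model fixture only starts once. A minimal sketch (not part of this commit) of how a test can observe which worker it landed on, assuming pytest-xdist's documented `PYTEST_XDIST_WORKER` environment variable and the two-worker setting used above:

```python
# Illustration only: with `pytest -n 2 --dist loadfile`, pytest-xdist tags each
# worker process with an id such as "gw0" or "gw1".
import os


def test_reports_its_worker():
    # Falls back to "main" when the suite is run without -n.
    worker = os.environ.get("PYTEST_XDIST_WORKER", "main")
    assert worker in {"main", "gw0", "gw1"}
```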

View File

@@ -1,3 +1,4 @@
+import sys
 import subprocess
 import contextlib
 import pytest
@@ -7,6 +8,7 @@ import docker
 import json
 import math
 import time
+import random
 from docker.errors import NotFound
 from typing import Optional, List, Dict
@@ -205,10 +207,12 @@ def launcher(event_loop):
 def local_launcher(
 model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None
 ):
-port = 9999
-master_port = 19999
-shard_uds_path = f"/tmp/{model_id.replace('/', '--')}-server"
+port = random.randint(8000, 10_000)
+master_port = random.randint(10_000, 20_000)
+shard_uds_path = (
+f"/tmp/tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}-server"
+)
 args = [
 "text-generation-launcher",
@@ -236,7 +240,7 @@ def launcher(event_loop):
 process.wait(60)
 launcher_output = process.stdout.read().decode("utf-8")
-print(launcher_output)
+print(launcher_output, file=sys.stderr)
 process.stdout.close()
 process.stderr.close()
@@ -245,7 +249,7 @@ def launcher(event_loop):
 def docker_launcher(
 model_id: str, num_shard: Optional[int] = None, quantize: Optional[str] = None
 ):
-port = 9999
+port = random.randint(8000, 10_000)
 args = ["--model-id", model_id, "--env"]
@@ -298,7 +302,7 @@ def launcher(event_loop):
 pass
 container_output = container.logs().decode("utf-8")
-print(container_output)
+print(container_output, file=sys.stderr)
 container.remove()
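
With two workers launching servers at the same time, the fixtures now draw the HTTP and shard master ports from random ranges and build a Unix-socket path that is unique per model/shard/quantize combination, so concurrent launches do not collide on the same endpoints. A hypothetical alternative (not used in this commit) that sidesteps even the small chance of two workers drawing the same random port is to let the OS assign one:

```python
# Hypothetical helper, for illustration: binding to port 0 makes the kernel
# pick an unused port, avoiding collisions that random.randint can still hit.
import socket


def find_free_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))  # port 0 means "any free port"
        return sock.getsockname()[1]  # the port the kernel actually chose


if __name__ == "__main__":
    print(find_free_port())
```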

View File

@@ -1,92 +1,4 @@
 [
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.5546875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 363,
"logprob": -1.5322266,
"special": false,
"text": " for"
},
{
"id": 847,
"logprob": -2.5585938,
"special": false,
"text": " /"
},
{
"id": 2754,
"logprob": -2.265625,
"special": false,
"text": "api"
},
{
"id": 29914,
"logprob": -0.034088135,
"special": false,
"text": "/"
},
{
"id": 29894,
"logprob": -0.96240234,
"special": false,
"text": "v"
},
{
"id": 29896,
"logprob": -0.36816406,
"special": false,
"text": "1"
},
{
"id": 29914,
"logprob": -0.013191223,
"special": false,
"text": "/"
},
{
"id": 16418,
"logprob": -3.15625,
"special": false,
"text": "projects"
},
{
"id": 29914,
"logprob": -0.43774414,
"special": false,
"text": "/"
},
{
"id": 29896,
"logprob": -1.9443359,
"special": false,
"text": "1"
}
]
},
"generated_text": "for /api/v1/projects/1"
},
 {
 "details": {
 "best_of_sequences": null,
@@ -263,6 +175,94 @@
 },
 "generated_text": "for /api/v1/projects/1"
 },
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.6875,
"text": "Test"
},
{
"id": 2009,
"logprob": -11.5546875,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 363,
"logprob": -1.5322266,
"special": false,
"text": " for"
},
{
"id": 847,
"logprob": -2.5585938,
"special": false,
"text": " /"
},
{
"id": 2754,
"logprob": -2.265625,
"special": false,
"text": "api"
},
{
"id": 29914,
"logprob": -0.034088135,
"special": false,
"text": "/"
},
{
"id": 29894,
"logprob": -0.96240234,
"special": false,
"text": "v"
},
{
"id": 29896,
"logprob": -0.36816406,
"special": false,
"text": "1"
},
{
"id": 29914,
"logprob": -0.013191223,
"special": false,
"text": "/"
},
{
"id": 16418,
"logprob": -3.15625,
"special": false,
"text": "projects"
},
{
"id": 29914,
"logprob": -0.43774414,
"special": false,
"text": "/"
},
{
"id": 29896,
"logprob": -1.9443359,
"special": false,
"text": "1"
}
]
},
"generated_text": "for /api/v1/projects/1"
},
 {
 "details": {
 "best_of_sequences": null,

View File

@@ -16,7 +16,7 @@
 "id": 926,
 "logprob": -4.3554688,
 "special": false,
-"text": "To"
+"text": " To"
 },
 {
 "id": 18295,

View File

@@ -16,7 +16,7 @@
 "id": 16017,
 "logprob": -1.3505859,
 "special": false,
-"text": "blue"
+"text": " blue"
 },
 {
 "id": 20495,

View File

@@ -1,58 +1,4 @@
 [
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 6,
"prefill": [
{
"id": 0,
"logprob": null,
"text": "<pad>"
}
],
"seed": null,
"tokens": [
{
"id": 259,
"logprob": -1.3789062,
"special": false,
"text": ""
},
{
"id": 39261,
"logprob": -0.36279297,
"special": false,
"text": "Because"
},
{
"id": 609,
"logprob": -1.0966797,
"special": false,
"text": " it"
},
{
"id": 339,
"logprob": -0.8276367,
"special": false,
"text": " is"
},
{
"id": 16017,
"logprob": -1.6845703,
"special": false,
"text": " blue"
},
{
"id": 1,
"logprob": -0.72753906,
"special": true,
"text": "</s>"
}
]
},
"generated_text": "Because it is blue"
},
 {
 "details": {
 "best_of_sequences": null,
@@ -71,7 +17,7 @@
 "id": 259,
 "logprob": -1.3798828,
 "special": false,
-"text": ""
+"text": " "
 },
 {
 "id": 39261,
@@ -125,7 +71,7 @@
 "id": 259,
 "logprob": -1.3789062,
 "special": false,
-"text": ""
+"text": " "
 },
 {
 "id": 39261,
@@ -179,7 +125,61 @@
 "id": 259,
 "logprob": -1.3789062,
 "special": false,
-"text": ""
+"text": " "
},
{
"id": 39261,
"logprob": -0.36279297,
"special": false,
"text": "Because"
},
{
"id": 609,
"logprob": -1.0966797,
"special": false,
"text": " it"
},
{
"id": 339,
"logprob": -0.8276367,
"special": false,
"text": " is"
},
{
"id": 16017,
"logprob": -1.6845703,
"special": false,
"text": " blue"
},
{
"id": 1,
"logprob": -0.72753906,
"special": true,
"text": "</s>"
}
]
},
"generated_text": "Because it is blue"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 6,
"prefill": [
{
"id": 0,
"logprob": null,
"text": "<pad>"
}
],
"seed": null,
"tokens": [
{
"id": 259,
"logprob": -1.3789062,
"special": false,
"text": " "
 },
 {
 "id": 39261,

View File

@@ -146,7 +146,7 @@ fn main() -> Result<(), std::io::Error> {
 sha: None,
 pipeline_tag: None,
 },
-false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or({
+false => get_model_info(&tokenizer_name, &revision, authorization_token).await.unwrap_or_else(|| {
 tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
 HubModelInfo { model_id: tokenizer_name.to_string(), sha: None, pipeline_tag: None }
 }),
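
Switching `unwrap_or` to `unwrap_or_else` makes the fallback lazy: the warning and the placeholder `HubModelInfo` are only built when the hub lookup actually fails, instead of being evaluated eagerly on every start-up. A rough Python analogy (illustration only, not code from this repository):

```python
# Eager vs. lazy fallback, analogous to unwrap_or vs. unwrap_or_else.
import logging


def fallback_info() -> dict:
    logging.warning("Could not retrieve model info from the Hugging Face hub.")
    return {"model_id": "unknown", "sha": None, "pipeline_tag": None}


info = {"model_id": "bigscience/bloom", "sha": "abc123"}

# Eager: fallback_info() runs (and warns) even though the key exists,
# because function arguments are evaluated before the call.
eager = info.get("model_id", fallback_info())

# Lazy: fallback_info() only runs when the lookup comes back empty.
lazy = info.get("model_id") or fallback_info()
```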

View File

@@ -56,7 +56,7 @@ class BLOOM(CausalLM):
 quantize: Optional[str] = None,
 ):
 super(BLOOM, self).__init__(
-model_id=model_id, revision=revision, quantize=quantize, decode_buffer=1
+model_id=model_id, revision=revision, quantize=quantize
 )
 @property
@@ -111,7 +111,6 @@ class BLOOMSharded(BLOOM):
 requires_padding=True,
 dtype=dtype,
 device=device,
-decode_buffer=1,
 rank=rank,
 world_size=world_size,
 )

View File

@@ -81,8 +81,6 @@ class CausalLMBatch(Batch):
 for i, r in enumerate(pb.requests):
 requests_idx_mapping[r.id] = i
 inputs.append(r.inputs)
-# offsets.append(None)
-# token_offsets.append(None)
 next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
 stopping_criteria = StoppingCriteria.from_pb(
 r.stopping_parameters, tokenizer
@@ -102,7 +100,7 @@ class CausalLMBatch(Batch):
 truncation=True,
 max_length=max_truncation,
 ).to(device)
-for i, r in enumerate(pb.requests):
+for _ in pb.requests:
 input_len = tokenized_inputs["input_ids"].shape[1]
 offsets.append(0)
 token_offsets.append(input_len)
@@ -452,7 +450,6 @@ class CausalLM(Model):
 model_id: str,
 revision: Optional[str] = None,
 quantize: Optional[str] = None,
-decode_buffer: int = 4,
 ):
 if torch.cuda.is_available():
 device = torch.device("cuda")
@@ -486,7 +483,6 @@ class CausalLM(Model):
 requires_padding=True,
 dtype=dtype,
 device=device,
-decode_buffer=decode_buffer,
 )
 @property

View File

@@ -108,8 +108,8 @@ class FlashCausalLMBatch(Batch):
 max_seqlen = max(max_seqlen, input_length)
 input_lengths.append(input_length)
-offsets.append(None)
-token_offsets.append(None)
+offsets.append(0)
+token_offsets.append(input_length)
 all_input_ids.append(tokenized_input)
@@ -394,7 +394,6 @@ class FlashCausalLM(Model):
 model_id: str,
 revision: Optional[str] = None,
 quantize: Optional[str] = None,
-decode_buffer: int = 4,
 ):
 if torch.cuda.is_available():
 device = torch.device("cuda")
@@ -410,7 +409,7 @@ class FlashCausalLM(Model):
 revision=revision,
 torch_dtype=dtype,
 load_in_8bit=quantize == "bitsandbytes",
-)
+).to(device)
 super(FlashCausalLM, self).__init__(
 model=model,
@@ -418,7 +417,6 @@ class FlashCausalLM(Model):
 requires_padding=False,
 dtype=dtype,
 device=device,
-decode_buffer=decode_buffer,
 )
 @property

View File

@@ -66,7 +66,7 @@ class FlashLlama(FlashCausalLM):
 self.load_weights(model, filenames, quantize, device, dtype)
 super(FlashCausalLM, self).__init__(
-model=model,
+model=model.to(device),
 tokenizer=tokenizer,
 requires_padding=False,
 dtype=dtype,
@@ -191,7 +191,7 @@ class FlashLlamaSharded(FlashLlama):
 )
 torch.distributed.barrier(group=self.process_group)
 super(FlashCausalLM, self).__init__(
-model=model,
+model=model.to(device),
 tokenizer=tokenizer,
 requires_padding=False,
 dtype=dtype,

View File

@@ -75,7 +75,7 @@ class FlashNeoXSharded(FlashNeoX):
 )
 torch.distributed.barrier(group=self.process_group)
 super(FlashCausalLM, self).__init__(
-model=model,
+model=model.to(device),
 tokenizer=tokenizer,
 requires_padding=False,
 dtype=dtype,

View File

@@ -69,12 +69,11 @@ class FlashSantacoder(FlashCausalLM):
 )
 super(FlashCausalLM, self).__init__(
-model=model,
+model=model.to(device),
 tokenizer=tokenizer,
 requires_padding=False,
 dtype=dtype,
 device=device,
-decode_buffer=1,
 )
 @staticmethod
@@ -215,14 +214,13 @@ class FlashSantacoderSharded(FlashSantacoder):
 )
 torch.distributed.barrier(group=self.process_group)
 super(FlashCausalLM, self).__init__(
-model=model,
+model=model.to(device),
 tokenizer=tokenizer,
 requires_padding=False,
 dtype=dtype,
 device=device,
 rank=rank,
 world_size=world_size,
-decode_buffer=1,
 )
 @staticmethod

View File

@@ -18,20 +18,15 @@ class Model(ABC):
 requires_padding: bool,
 dtype: torch.dtype,
 device: torch.device,
-decode_buffer: int = 4,
 rank: int = 0,
 world_size: int = 1,
 ):
-if decode_buffer < 1:
-raise ValueError("decode_buffer must be >= 1")
-self.model = model.eval().to(device)
+self.model = model.eval()
 self.tokenizer = tokenizer
 self.all_special_ids = set(tokenizer.all_special_ids)
 self.requires_padding = requires_padding
 self.dtype = dtype
 self.device = device
-self.decode_buffer = decode_buffer
 self.rank = rank
 self.world_size = world_size
 self.check_initialized()
@@ -61,12 +56,6 @@ class Model(ABC):
 ) -> Tuple[str, int, int]:
 """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
-# Compatibility layer for old None values.
-if prefix_offset is None:
-prefix_offset = 0
-if read_offset is None:
-read_offset = 0
 # The prefix text is necessary only to defeat cleanup algorithms in the decode
 # which decide to add a space or not depending on the surrounding ids.
 prefix_text = self.tokenizer.decode(
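
Dropping `decode_buffer` and the `None`-compatibility shim is possible because batches now always initialise the offsets to concrete integers (`offsets.append(0)`, `token_offsets.append(input_len)` in the hunks above). For context, a simplified sketch of the offset-based incremental decoding that `decode_token` performs; names and details are approximations, not a verbatim copy of the method:

```python
# Sketch only: decode a prefix and the prefix-plus-new-tokens with the same left
# context, then emit just the suffix, so the tokenizer's space-cleanup heuristics
# behave identically on both calls.
def decode_token(tokenizer, all_input_ids, prefix_offset: int = 0, read_offset: int = 0):
    prefix_text = tokenizer.decode(all_input_ids[prefix_offset:read_offset])
    new_text = tokenizer.decode(all_input_ids[prefix_offset:])
    if len(new_text) > len(prefix_text) and "\ufffd" not in new_text:
        # A complete chunk of text was produced: emit it and advance the offsets.
        return new_text[len(prefix_text):], read_offset, len(all_input_ids)
    # The trailing token decodes to an incomplete byte sequence: emit nothing yet.
    return "", prefix_offset, read_offset
```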

View File

@@ -52,7 +52,7 @@ class SantaCoder(CausalLM):
 torch_dtype=dtype,
 load_in_8bit=quantize == "bitsandbytes",
 trust_remote_code=True, # required
-)
+).to(device)
 super(CausalLM, self).__init__(
 model=model,
@@ -60,7 +60,6 @@ class SantaCoder(CausalLM):
 requires_padding=True,
 dtype=dtype,
 device=device,
-decode_buffer=1,
 )
 def decode(self, generated_ids: List[int]) -> str:

View File

@@ -91,8 +91,6 @@ class Seq2SeqLMBatch(Batch):
 inputs.append(r.inputs)
 requests_idx_mapping[r.id] = i
 decoder_input_lengths.append(1)
-# offsets.append(None)
-# token_offsets.append(None)
 next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
 stopping_criteria = StoppingCriteria.from_pb(
 r.stopping_parameters, tokenizer
@@ -123,7 +121,7 @@ class Seq2SeqLMBatch(Batch):
 .repeat(len(pb.requests))
 .view(-1, 1)
 )
-for i, r in enumerate(pb.requests):
+for _ in pb.requests:
 offsets.append(0)
 token_offsets.append(1)
 all_decoder_input_ids = decoder_input_ids.view(-1).split(1)
@@ -505,7 +503,6 @@ class Seq2SeqLM(Model):
 model_id: str,
 revision: Optional[str] = None,
 quantize: Optional[str] = None,
-decode_buffer: int = 4,
 ):
 if torch.cuda.is_available():
 device = torch.device("cuda")
@@ -535,7 +532,6 @@ class Seq2SeqLM(Model):
 requires_padding=True,
 dtype=dtype,
 device=device,
-decode_buffer=decode_buffer,
 )
 @property