Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-22 23:42:06 +00:00)

Commit c345c734a7: Merge branch 'habana-main' into 2.3.0
Dockerfile
@@ -41,7 +41,7 @@ COPY launcher launcher
 RUN cargo build --profile release-opt
 
 # Text Generation Inference base image
-FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as base
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest as base
 
 ENV ATTENTION=default
 ENV PREFIX_CACHING=0
@@ -75,7 +75,7 @@ RUN cd server && \
 make gen-server && \
 pip install -r requirements.txt && \
 bash ./dill-0.3.8-patch.sh && \
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 && \
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 && \
 BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \
 pip install . --no-cache-dir
 
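For reference, the image defined by this Dockerfile can be built locally; a minimal sketch, assuming the command is run from the repository root (the local tag name is an arbitrary placeholder, not part of this commit):

```bash
# Build the Gaudi image from the Dockerfile changed above.
# "tgi-gaudi:local" is a placeholder tag chosen for illustration.
docker build -t tgi-gaudi:local .
```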
README.md (108 changed lines)
@@ -20,6 +20,7 @@ limitations under the License.
 
 - [Text Generation Inference on Habana Gaudi](#text-generation-inference-on-habana-gaudi)
 - [Table of contents](#table-of-contents)
+- [Tested Models and Configurations](#tested-models-and-configurations)
 - [Running TGI on Gaudi](#running-tgi-on-gaudi)
 - [TGI-Gaudi Benchmark](#tgi-gaudi-benchmark)
 - [Static Batching Benchmark](#static-batching-benchmark)
@@ -32,24 +33,46 @@ limitations under the License.
 - [Llama3.1-70B 8 cards](#llama31-70b-8-cards)
 - [Llava-v1.6-Mistral-7B on 1 card](#llava-v16-mistral-7b-on-1-card)
 - [Running TGI with FP8 Precision](#running-tgi-with-fp8-precision)
-- [Llama2-7B on 1 Card](#llama2-7b-on-1-card-1)
-- [Llama2-70B on 8 Cards](#llama2-70b-on-8-cards-1)
-- [Llama3.1-8B on 1 Card](#llama31-8b-on-1-card-1)
-- [Llama3.1-70B on 8 cards](#llama31-70b-on-8-cards)
-- [Llava-v1.6-Mistral-7B on 1 Card](#llava-v16-mistral-7b-on-1-card-1)
-- [Llava-v1.6-Mistral-7B on 8 Cards](#llava-v16-mistral-7b-on-8-cards)
+- [TGI-Gaudi Benchmark](#tgi-gaudi-benchmark)
 - [Adjusting TGI Parameters](#adjusting-tgi-parameters)
 - [Environment Variables](#environment-variables)
 - [Profiler](#profiler)
 - [License](#license)
 
+
+## Tested Models and Configurations
+
+The following table contains models and configurations we have validated on Gaudi2.
+
+
+| Model | BF16 | | FP8 | |
+| ---------------------- | ------------ | ----------- | ------------ | ----------- |
+| | Single Card | Multi-Card | Single Card | Multi-Card |
+| Llama2-7B | ✔ | ✔ | ✔ | ✔ |
+| Llama2-70B | | ✔ | | ✔ |
+| Llama3-8B | ✔ | ✔ | ✔ | ✔ |
+| Llama3-70B | | ✔ | | ✔ |
+| Llama3.1-8B | ✔ | ✔ | ✔ | ✔ |
+| Llama3.1-70B | | ✔ | | ✔ |
+| CodeLlama-13B | ✔ | ✔ | ✔ | ✔ |
+| Mixtral-8x7B | ✔ | ✔ | ✔ | ✔ |
+| Mistral-7B | ✔ | ✔ | ✔ | ✔ |
+| Falcon-180B | | ✔ | | ✔ |
+| Qwen2-72B | | ✔ | | ✔ |
+| Starcoder2-3b | ✔ | ✔ | ✔ | |
+| Starcoder2-15b | ✔ | ✔ | ✔ | |
+| Starcoder | ✔ | ✔ | ✔ | ✔ |
+| Gemma-7b | ✔ | ✔ | ✔ | ✔ |
+| Llava-v1.6-Mistral-7B | ✔ | ✔ | ✔ | ✔ |
+
+
 ## Running TGI on Gaudi
 
 To use [🤗 text-generation-inference](https://github.com/huggingface/text-generation-inference) on Habana Gaudi/Gaudi2/Gaudi3, follow these steps:
 
 1. Pull the official Docker image with:
 ```bash
-docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
+docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
 ```
 > [!NOTE]
 > Alternatively, you can build the Docker image using the `Dockerfile` located in this folder with:
@@ -70,7 +93,7 @@ To use [🤗 text-generation-inference](https://github.com/huggingface/text-gene
 -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN=$hf_token \
 -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true \
 -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id $model --max-input-tokens 1024 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model --max-input-tokens 1024 \
 --max-total-tokens 2048
 ```
 
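Once a server launched with one of the commands above is ready, it can be exercised with a plain HTTP request; a minimal sketch, assuming the container was started with `-p 8080:80` as in the quoted examples (the prompt and parameters are illustrative only):

```bash
# Send a test generation request to the running TGI-Gaudi server.
# Port 8080 follows the -p 8080:80 mapping used in the docker run examples.
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 32}}'
```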
@@ -84,7 +107,7 @@ To use [🤗 text-generation-inference](https://github.com/huggingface/text-gene
 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
 -e HF_TOKEN=$hf_token -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true \
 -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice \
---ipc=host ghcr.io/huggingface/tgi-gaudi:2.3.1 --model-id $model --sharded true \
+--ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model --sharded true \
 --num-shard 8 --max-input-tokens 1024 --max-total-tokens 2048
 ```
 3. Wait for the TGI-Gaudi server to come online. You will see something like so:
@@ -98,36 +121,6 @@ To use [🤗 text-generation-inference](https://github.com/huggingface/text-gene
 ```
 4. Please note that the model warmup can take several minutes, especially for FP8 inference. To minimize this time in consecutive runs, please refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy).
 
-### TGI-Gaudi Benchmark
-
-#### Static Batching Benchmark
-To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
-
-To run it on the same machine, you can do the following:
-* `docker exec -it <docker name> bash` , pick the docker started from step 2 using docker ps
-* `text-generation-benchmark -t <model-id>` , pass the model-id from docker run command
-* after the completion of tests, hit ctrl+c to see the performance data summary.
-
-#### Continuous Batching Benchmark
-To run continuous batching benchmark, please refer to [README in examples folder](https://github.com/huggingface/tgi-gaudi/blob/habana-main/examples/README.md).
-
-### Tested Models and Configurations
-
-The following table contains models and configurations we have validated on Gaudi2.
-
-| Model | BF16 | FP8 | Single Card | Multi-Cards |
-|-----------------------|------|-----|-------------|-------------|
-| Llama2-7B | ✔ | ✔ | ✔ | ✔ |
-| Llama2-70B | ✔ | ✔ | | ✔ |
-| Llama3-8B | ✔ | ✔ | ✔ | ✔ |
-| Llama3-70B | ✔ | ✔ | | ✔ |
-| Llama3.1-8B | ✔ | ✔ | ✔ | ✔ |
-| Llama3.1-70B | ✔ | ✔ | | ✔ |
-| CodeLlama-13B | ✔ | ✔ | ✔ | |
-| Mixtral-8x7B | ✔ | ✔ | ✔ | ✔ |
-| Mistral-7B | ✔ | ✔ | ✔ | ✔ |
-| Llava-v1.6-Mistral-7B | ✔ | ✔ | ✔ | ✔ |
-
 
 ## Running TGI with BF16 Precision
 
@@ -157,7 +150,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-length 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-total-tokens 65536 \
@@ -189,7 +182,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-length 1024 --max-total-tokens 2048 \
@@ -221,7 +214,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-length 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-total-tokens 65536 \
@@ -253,7 +246,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-length 1024 --max-total-tokens 2048 \
@@ -285,7 +278,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=1 \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-total-tokens 32768
@@ -336,7 +329,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-length 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-total-tokens 65536 \
@@ -371,7 +364,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-length 1024 --max-total-tokens 2048 \
@@ -407,7 +400,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-length 1024 --max-total-tokens 2048 \
 --max-batch-prefill-tokens 2048 --max-batch-total-tokens 65536 \
@@ -442,7 +435,7 @@ docker run -p 8080:80 \
 -e FLASH_ATTENTION_RECOMPUTE=true \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-length 1024 --max-total-tokens 2048 \
@@ -475,7 +468,7 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=1 \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-total-tokens 32768
@@ -506,13 +499,28 @@ docker run -p 8080:80 \
 -e BATCH_BUCKET_SIZE=1 \
 --cap-add=sys_nice \
 --ipc=host \
-ghcr.io/huggingface/tgi-gaudi:2.3.1 \
+ghcr.io/huggingface/tgi-gaudi:2.0.6 \
 --model-id $model \
 --sharded true --num-shard 8 \
 --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
 --max-total-tokens 8192 --max-batch-total-tokens 32768
 ```
 
+## TGI-Gaudi Benchmark
+
+### Static Batching Benchmark
+To run static batching benchmark, please refer to [TGI's benchmark tool](https://github.com/huggingface/text-generation-inference/tree/main/benchmark).
+
+To run it on the same machine, you can do the following:
+* `docker exec -it <docker name> bash` , pick the docker started from step 2 using docker ps
+* `text-generation-benchmark -t <model-id>` , pass the model-id from docker run command
+* after the completion of tests, hit ctrl+c to see the performance data summary.
+> Note: This benchmark runs the model with bs=[1, 2, 4, 8, 16, 32], sequence_length=10 and decode_length=8 by default. if you want to run other configs, please check text-generation-benchmark -h and change the parameters.
+
+### Continuous Batching Benchmark
+To run continuous batching benchmark, please refer to [README in examples folder](https://github.com/huggingface/tgi-gaudi/blob/habana-main/examples/README.md).
+
+
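The static batching benchmark steps added above can be chained into a short session; a minimal sketch, where `<docker name>` and `<model-id>` are the same placeholders used in the README and should be replaced with the values from your own `docker run`:

```bash
# Locate the container started in step 2, then run the benchmark tool inside it.
docker ps
docker exec -it <docker name> \
    text-generation-benchmark -t <model-id>
# Press Ctrl+C once the runs finish to print the performance summary.
```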
 ## Adjusting TGI Parameters
 
 Maximum sequence length is controlled by two arguments:
@@ -31,13 +31,18 @@ def get_args():
     parser.add_argument(
         "--max_concurrent_requests", type=int, default=256, help="Max number of concurrent requests"
     )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for datasets"
+    )
 
     return parser.parse_args()
 
 
 def read_dataset(
     max_input_length: int,
     total_sample_count: int,
-    model_id: str
+    model_id: str,
+    seed: int,
 ) -> List[str]:
     """
     Loads public dataset from HF: https://huggingface.co/datasets/DIBT/10k_prompts_ranked
@@ -51,7 +56,8 @@ def read_dataset(
     )
     if len(dataset) > total_sample_count:
         dataset = dataset.select(range(total_sample_count))
-    dataset = dataset.shuffle(seed=42)
+    dataset = dataset.shuffle(seed=seed)
     return [sample["prompt"] for sample in dataset]
 
 
@@ -71,7 +77,7 @@ def is_tgi_available(
 def main():
     args = get_args()
     dataset = read_dataset(
-        args.max_input_length, args.total_sample_count, args.model_id
+        args.max_input_length, args.total_sample_count, args.model_id, args.seed
     )
 
     if not is_tgi_available(args.server_address):
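With the `--seed` argument added above, the example client can shuffle its prompt dataset reproducibly; a minimal sketch of an invocation, where the script path is a hypothetical placeholder (the file name is not shown in this diff) and only `--seed` and `--max_concurrent_requests` come from the code above:

```bash
# Hypothetical invocation of the example benchmark client with a fixed shuffle seed.
# "examples/run_benchmark.py" is a placeholder path, not a name from this commit.
python examples/run_benchmark.py --max_concurrent_requests 128 --seed 123
```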
server/poetry.lock (generated, 1688 changed lines)
File diff suppressed because it is too large.
server/pyproject.toml
@@ -22,8 +22,8 @@ opentelemetry-instrumentation-grpc = "^0.36b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 peft = "^0.10"
-optimum-habana = "1.13.2"
-transformers = "4.43.4"
+optimum-habana = "1.14.1"
+transformers = "4.45.2"
 numpy = "1.26.4"
 accelerate = "0.33.0"
 outlines= { version = "^0.0.36", optional = true }
server/requirements.txt
@@ -1,40 +1,40 @@
 accelerate==0.33.0 ; python_version >= "3.9" and python_version < "3.13"
-aiohappyeyeballs==2.4.0 ; python_version >= "3.9" and python_version < "3.13"
-aiohttp==3.10.5 ; python_version >= "3.9" and python_version < "3.13"
+aiohappyeyeballs==2.4.3 ; python_version >= "3.9" and python_version < "3.13"
+aiohttp==3.10.10 ; python_version >= "3.9" and python_version < "3.13"
 aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
 async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.11"
 attrs==24.2.0 ; python_version >= "3.9" and python_version < "3.13"
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
-certifi==2024.7.4 ; python_version >= "3.9" and python_version < "3.13"
-charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
 coloredlogs==15.0.1 ; python_version >= "3.9" and python_version < "3.13"
-datasets==2.21.0 ; python_version >= "3.9" and python_version < "3.13"
+datasets==3.0.1 ; python_version >= "3.9" and python_version < "3.13"
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 diffusers==0.29.2 ; python_version >= "3.9" and python_version < "3.13"
-dill==0.3.8 ; python_version >= "3.9" and python_version < "3.13"
-filelock==3.15.4 ; python_version >= "3.9" and python_version < "3.13"
+dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13"
 frozenlist==1.4.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec[http]==2024.6.1 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.63.2 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.48.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.48.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.66.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.24.6 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.26.1 ; python_version >= "3.9" and python_version < "3.13"
 humanfriendly==10.0 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.8 ; python_version >= "3.9" and python_version < "3.13"
-importlib-metadata==8.4.0 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.10 ; python_version >= "3.9" and python_version < "3.13"
+importlib-metadata==8.5.0 ; python_version >= "3.9" and python_version < "3.13"
 jinja2==3.1.4 ; python_version >= "3.9" and python_version < "3.13"
 joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
-markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.13"
+markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
-multidict==6.0.5 ; python_version >= "3.9" and python_version < "3.13"
-multiprocess==0.70.16 ; python_version >= "3.9" and python_version < "3.13"
+multidict==6.1.0 ; python_version >= "3.9" and python_version < "3.13"
+multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
 networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -46,43 +46,45 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi
 opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.13.2 ; python_version >= "3.9" and python_version < "3.13"
-optimum==1.21.4 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.14.1 ; python_version >= "3.9" and python_version < "3.13"
+optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.1 ; python_version >= "3.9" and python_version < "3.13"
-pandas==2.2.2 ; python_version >= "3.9" and python_version < "3.13"
+pandas==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
 peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.4.0 ; python_version >= "3.9" and python_version < "3.13"
+pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13"
 prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
+propcache==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==3.20.3 ; python_version >= "3.9" and python_version < "3.13"
-psutil==6.0.0 ; python_version >= "3.9" and python_version < "3.13"
+psutil==6.1.0 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyarrow==17.0.0 ; python_version >= "3.9" and python_version < "3.13"
-pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.9" and python_version < "3.13"
+pyreadline3==3.5.4 ; sys_platform == "win32" and python_version >= "3.9" and python_version < "3.13"
 python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_version < "3.13"
-pytz==2024.1 ; python_version >= "3.9" and python_version < "3.13"
+pytz==2024.2 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.7.24 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13"
-safetensors==0.4.4 ; python_version >= "3.9" and python_version < "3.13"
-scikit-learn==1.5.1 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13"
+scikit-learn==1.5.2 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 sentence-transformers[train]==3.0.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==73.0.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
 six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.12.1 ; python_version >= "3.9" and python_version < "3.13"
 threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
+torch==2.4.0a0+git74cd574 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.43.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers[sentencepiece]==4.43.4 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers[sentencepiece]==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
 triton==3.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13" and python_version >= "3.9"
 typer==0.7.0 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
-tzdata==2024.1 ; python_version >= "3.9" and python_version < "3.13"
-urllib3==2.2.2 ; python_version >= "3.9" and python_version < "3.13"
+tzdata==2024.2 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13"
-yarl==1.9.4 ; python_version >= "3.9" and python_version < "3.13"
-zipp==3.20.0 ; python_version >= "3.9" and python_version < "3.13"
+yarl==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
+zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13"