Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-08 19:04:52 +00:00)

Commit b950dd87c3 (parent e80f6e8e78)

update docker command

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
@@ -19,11 +19,7 @@ docker run -p 8080:80 \
  --ipc=host \
  -v $volume:/data \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 2048 --max-batch-size 32 \
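The hunk above keeps the launch command but drops the bucketing/padding environment variables and bumps the image tag. Once a container started from the updated command is up, a quick smoke test against TGI's standard `/generate` endpoint looks roughly like the sketch below; it assumes the `-p 8080:80` mapping from the command above, and the prompt is arbitrary:

```bash
# Smoke-test the server started above (assumes -p 8080:80 on the host).
curl http://127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 32}}'
```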
@@ -43,60 +39,7 @@ docker run -p 8080:80 \
  --ipc=host \
  -v $volume:/data \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 4096 --max-batch-size 256 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
- ```
-
- ### Llama2-7B on 1 Card (BF16)
-
- ```bash
- model=meta-llama/Llama-2-7b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 2048 --max-batch-size 32 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
- ```
-
- ### Llama2-70B on 8 cards (BF16)
-
- ```bash
- model=meta-llama/Llama-2-70b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --sharded true --num-shard 8 \
  --max-input-tokens 1024 --max-total-tokens 2048 \
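For the sharded `--num-shard 8` launch kept at the end of this hunk, it helps to wait for readiness before sending traffic, since sharded model loading can take several minutes. A minimal sketch using TGI's `/health` and `/info` routes, again assuming the `-p 8080:80` mapping:

```bash
# Poll until the router reports healthy, then print model and runtime info.
until curl -sf http://127.0.0.1:8080/health > /dev/null; do
    sleep 5
done
curl -s http://127.0.0.1:8080/info
```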
@@ -115,9 +58,7 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
  --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
  --max-total-tokens 8192 --max-batch-size 4
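The limits kept in this hunk are internally consistent: `--max-batch-prefill-tokens 16384` equals `--max-input-tokens 4096` times `--max-batch-size 4`, so a full batch of maximum-length prompts fits in a single prefill. A tiny sanity-check sketch of that arithmetic (the shell variables are illustrative, not launcher flags):

```bash
# Illustrative sizing check; these variables are not read by TGI itself.
max_input_tokens=4096
max_batch_size=4
echo $(( max_input_tokens * max_batch_size ))   # 16384, matching --max-batch-prefill-tokens
```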
@@ -125,12 +66,12 @@ docker run -p 8080:80 \

  ## FP8 Precision

- Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision.
+ You could also set kv cache dtype to FP8 when launching the server, fp8_e4m3fn is supported in Gaudi

- ## Llama3.1-8B on 1 Card (FP8)
+ ## Llama3-8B on 1 Card (FP8)

  ```bash
- model=meta-llama/Meta-Llama-3.1-8B-Instruct
+ model=RedHatAI/Meta-Llama-3-8B-Instruct-FP8-KV
  hf_token=YOUR_ACCESS_TOKEN
  volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

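The model id in this hunk switches to a pre-quantized FP8 checkpoint. Because every command mounts `$volume` at `/data` to cache weights, the checkpoint can optionally be fetched ahead of time. A sketch, assuming the `huggingface_hub` CLI is available on the host and that the container resolves its model cache under `/data` (which is why these commands mount the volume there):

```bash
# Optional: pre-fetch the FP8 checkpoint into the shared cache volume.
pip install -U "huggingface_hub[cli]"
HF_TOKEN=$hf_token huggingface-cli download \
    RedHatAI/Meta-Llama-3-8B-Instruct-FP8-KV \
    --cache-dir $volume
```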
@@ -139,25 +80,19 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
+ --kv-cache-dtype fp8_e4m3fn \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 2048 --max-batch-size 32 \
  --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
  ```

- ## Llama3.1-70B on 8 cards (FP8)
+ ## Llama3-70B on 8 cards (FP8)

  ```bash
- model=meta-llama/Meta-Llama-3.1-70B-Instruct
+ model=RedHatAI/Meta-Llama-3-70B-Instruct-FP8
  hf_token=YOUR_ACCESS_TOKEN
  volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

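With the FP8 server from this hunk running, requests can also go through TGI's OpenAI-compatible Messages API instead of `/generate`. A sketch, assuming the same `-p 8080:80` mapping and a chat-tuned model as above; the `model` field is not used for routing by TGI, so a placeholder such as `tgi` is fine:

```bash
# OpenAI-compatible chat request against the Messages API.
curl http://127.0.0.1:8080/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
          "model": "tgi",
          "messages": [{"role": "user", "content": "Say hello from Gaudi."}],
          "max_tokens": 32,
          "stream": false
        }'
```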
@@ -166,118 +101,12 @@ docker run -p 8080:80 \
  --cap-add=sys_nice \
  --ipc=host \
  -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
  -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+ ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
  --model-id $model \
+ --kv-cache-dtype fp8_e4m3fn \
  --sharded true --num-shard 8 \
  --max-input-tokens 1024 --max-total-tokens 2048 \
  --max-batch-prefill-tokens 4096 --max-batch-size 256 \
  --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
  ```
-
- ## Llama2-7B on 1 Card (FP8)
-
- ```bash
- model=meta-llama/Llama-2-7b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e PREFILL_BATCH_BUCKET_SIZE=2 \
- -e BATCH_BUCKET_SIZE=32 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 2048 --max-batch-size 32 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
- ```
-
- ## Llama2-70B on 8 Cards (FP8)
-
- ```bash
- model=meta-llama/Llama-2-70b-chat-hf
- hf_token=YOUR_ACCESS_TOKEN
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e HF_TOKEN=$hf_token \
- -e MAX_TOTAL_TOKENS=2048 \
- -e BATCH_BUCKET_SIZE=256 \
- -e PREFILL_BATCH_BUCKET_SIZE=4 \
- -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 1024 --max-total-tokens 2048 \
- --max-batch-prefill-tokens 4096 --max-batch-size 256 \
- --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
- ```
-
- ## Llava-v1.6-Mistral-7B on 1 Card (FP8)
-
- ```bash
- model=llava-hf/llava-v1.6-mistral-7b-hf
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
- --max-total-tokens 8192 --max-batch-size 4
- ```
-
- ## Llava-v1.6-Mistral-7B on 8 Cards (FP8)
-
- ```bash
- model=llava-hf/llava-v1.6-mistral-7b-hf
- volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-
- docker run -p 8080:80 \
- --runtime=habana \
- --cap-add=sys_nice \
- --ipc=host \
- -v $volume:/data \
- -v $PWD/quantization_config:/usr/src/quantization_config \
- -v $PWD/hqt_output:/usr/src/hqt_output \
- -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
- -e PREFILL_BATCH_BUCKET_SIZE=1 \
- -e BATCH_BUCKET_SIZE=1 \
- ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
- --model-id $model \
- --sharded true --num-shard 8 \
- --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
- --max-total-tokens 8192 --max-batch-size 4
- ```
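After this hunk, every remaining command references the `3.3.4-gaudi` image, so pulling it once up front avoids the download on the first `docker run`:

```bash
# Pre-pull the image tag used by the updated commands.
docker pull ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi
```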