commit b950dd87c3 (parent e80f6e8e78)
Author: Wang, Yi A <yi.a.wang@intel.com>
Date:   2025-07-01 23:47:54 -07:00

update docker command

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

@@ -19,11 +19,7 @@ docker run -p 8080:80 \
     --ipc=host \
     -v $volume:/data \
     -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e PREFILL_BATCH_BUCKET_SIZE=2 \
-    -e BATCH_BUCKET_SIZE=32 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+    ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
     --model-id $model \
     --max-input-tokens 1024 --max-total-tokens 2048 \
     --max-batch-prefill-tokens 2048 --max-batch-size 32 \
@@ -43,60 +39,7 @@ docker run -p 8080:80 \
     --ipc=host \
     -v $volume:/data \
     -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e BATCH_BUCKET_SIZE=256 \
-    -e PREFILL_BATCH_BUCKET_SIZE=4 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --sharded true --num-shard 8 \
-    --max-input-tokens 1024 --max-total-tokens 2048 \
-    --max-batch-prefill-tokens 4096 --max-batch-size 256 \
-    --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
-```
-### Llama2-7B on 1 Card (BF16)
-```bash
-model=meta-llama/Llama-2-7b-chat-hf
-hf_token=YOUR_ACCESS_TOKEN
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e PREFILL_BATCH_BUCKET_SIZE=2 \
-    -e BATCH_BUCKET_SIZE=32 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --max-input-tokens 1024 --max-total-tokens 2048 \
-    --max-batch-prefill-tokens 2048 --max-batch-size 32 \
-    --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
-```
-### Llama2-70B on 8 cards (BF16)
-```bash
-model=meta-llama/Llama-2-70b-chat-hf
-hf_token=YOUR_ACCESS_TOKEN
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e BATCH_BUCKET_SIZE=256 \
-    -e PREFILL_BATCH_BUCKET_SIZE=4 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+    ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
     --model-id $model \
     --sharded true --num-shard 8 \
     --max-input-tokens 1024 --max-total-tokens 2048 \
@@ -115,9 +58,7 @@ docker run -p 8080:80 \
     --cap-add=sys_nice \
     --ipc=host \
     -v $volume:/data \
-    -e PREFILL_BATCH_BUCKET_SIZE=1 \
-    -e BATCH_BUCKET_SIZE=1 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+    ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
     --model-id $model \
     --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
     --max-total-tokens 8192 --max-batch-size 4
@@ -125,12 +66,12 @@ docker run -p 8080:80 \
 ## FP8 Precision
 Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision.
+You can also set the KV cache dtype to FP8 when launching the server; `fp8_e4m3fn` is supported on Gaudi.
-## Llama3.1-8B on 1 Card (FP8)
+## Llama3-8B on 1 Card (FP8)
 ```bash
-model=meta-llama/Meta-Llama-3.1-8B-Instruct
+model=RedHatAI/Meta-Llama-3-8B-Instruct-FP8-KV
 hf_token=YOUR_ACCESS_TOKEN
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
@@ -139,25 +80,19 @@ docker run -p 8080:80 \
     --cap-add=sys_nice \
     --ipc=host \
     -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
     -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e PREFILL_BATCH_BUCKET_SIZE=2 \
-    -e BATCH_BUCKET_SIZE=32 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+    ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
     --model-id $model \
+    --kv-cache-dtype fp8_e4m3fn \
     --max-input-tokens 1024 --max-total-tokens 2048 \
     --max-batch-prefill-tokens 2048 --max-batch-size 32 \
     --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
 ```
-## Llama3.1-70B on 8 cards (FP8)
+## Llama3-70B on 8 cards (FP8)
 ```bash
-model=meta-llama/Meta-Llama-3.1-70B-Instruct
+model=RedHatAI/Meta-Llama-3-70B-Instruct-FP8
 hf_token=YOUR_ACCESS_TOKEN
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
@@ -166,118 +101,12 @@ docker run -p 8080:80 \
     --cap-add=sys_nice \
     --ipc=host \
     -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
     -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e BATCH_BUCKET_SIZE=256 \
-    -e PREFILL_BATCH_BUCKET_SIZE=4 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+    ghcr.io/huggingface/text-generation-inference:3.3.4-gaudi \
     --model-id $model \
+    --kv-cache-dtype fp8_e4m3fn \
     --sharded true --num-shard 8 \
     --max-input-tokens 1024 --max-total-tokens 2048 \
     --max-batch-prefill-tokens 4096 --max-batch-size 256 \
     --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
 ```
-## Llama2-7B on 1 Card (FP8)
-```bash
-model=meta-llama/Llama-2-7b-chat-hf
-hf_token=YOUR_ACCESS_TOKEN
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-    -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e PREFILL_BATCH_BUCKET_SIZE=2 \
-    -e BATCH_BUCKET_SIZE=32 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --max-input-tokens 1024 --max-total-tokens 2048 \
-    --max-batch-prefill-tokens 2048 --max-batch-size 32 \
-    --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
-```
-## Llama2-70B on 8 Cards (FP8)
-```bash
-model=meta-llama/Llama-2-70b-chat-hf
-hf_token=YOUR_ACCESS_TOKEN
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-    -e HF_TOKEN=$hf_token \
-    -e MAX_TOTAL_TOKENS=2048 \
-    -e BATCH_BUCKET_SIZE=256 \
-    -e PREFILL_BATCH_BUCKET_SIZE=4 \
-    -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --sharded true --num-shard 8 \
-    --max-input-tokens 1024 --max-total-tokens 2048 \
-    --max-batch-prefill-tokens 4096 --max-batch-size 256 \
-    --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
-```
-## Llava-v1.6-Mistral-7B on 1 Card (FP8)
-```bash
-model=llava-hf/llava-v1.6-mistral-7b-hf
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-    -e PREFILL_BATCH_BUCKET_SIZE=1 \
-    -e BATCH_BUCKET_SIZE=1 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
-    --max-total-tokens 8192 --max-batch-size 4
-```
-## Llava-v1.6-Mistral-7B on 8 Cards (FP8)
-```bash
-model=llava-hf/llava-v1.6-mistral-7b-hf
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run -p 8080:80 \
-    --runtime=habana \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -v $volume:/data \
-    -v $PWD/quantization_config:/usr/src/quantization_config \
-    -v $PWD/hqt_output:/usr/src/hqt_output \
-    -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
-    -e PREFILL_BATCH_BUCKET_SIZE=1 \
-    -e BATCH_BUCKET_SIZE=1 \
-    ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
-    --model-id $model \
-    --sharded true --num-shard 8 \
-    --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
-    --max-total-tokens 8192 --max-batch-size 4
-```
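
As a quick sanity check after running any of the updated `docker run` commands above, the server can be queried over TGI's standard `/generate` endpoint. This is a minimal sketch, assuming the container is up and the default `-p 8080:80` port mapping from these examples is used:

```bash
# Send a test generation request to the TGI server started above
# (assumes host port 8080 from the -p 8080:80 mapping in the commands).
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \
    -H 'Content-Type: application/json'
```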