Upgrade SynapseAI version to 1.17.0 (#208)
Signed-off-by: yuanwu <yuan.wu@intel.com>
Co-authored-by: Thanaji Rao Thakkalapelli <tthakkalapelli@habana.ai>
Co-authored-by: regisss <15324346+regisss@users.noreply.github.com>
parent 369e499a66
commit a8cead1f92
Dockerfile

@@ -32,7 +32,7 @@ COPY launcher launcher
 RUN cargo build --release

 # Text Generation Inference base image
-FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as base
+FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as base

 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@@ -61,7 +61,7 @@ RUN cd server && \
     make gen-server && \
     pip install -r requirements.txt && \
     bash ./dill-0.3.8-patch.sh && \
-    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0 && \
+    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 && \
     pip install . --no-cache-dir

 # Install benchmarker
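The two hunks above are the whole of the upgrade in the Dockerfile: the Gaudi base image and the HabanaAI DeepSpeed fork both move from 1.16.0 to 1.17.0. A minimal sketch for verifying the bumped packages locally, assuming a standard build from the repository root (the image tag and grep pattern below are illustrative choices, not part of the commit):

```bash
# Build the upgraded image; the tag is arbitrary.
docker build -t tgi-gaudi:synapse-1.17 .

# List the bumped Python packages baked into the image.
# --entrypoint bypasses the launcher entrypoint if the image defines one.
docker run --rm --entrypoint pip tgi-gaudi:synapse-1.17 list | \
    grep -iE 'optimum-habana|transformers|accelerate|deepspeed'
```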
README.md | 45

@@ -18,12 +18,19 @@ limitations under the License.

 ## Table of contents

-- [Running TGI on Gaudi](#running-tgi-on-gaudi)
-- [Adjusting TGI parameters](#adjusting-tgi-parameters)
-- [Running TGI with FP8 precision](#running-tgi-with-fp8-precision)
-- [Currently supported configurations](#currently-supported-configurations)
-- [Environment variables](#environment-variables)
-- [Profiler](#profiler)
+- [Text Generation Inference on Habana Gaudi](#text-generation-inference-on-habana-gaudi)
+  - [Table of contents](#table-of-contents)
+  - [Running TGI on Gaudi](#running-tgi-on-gaudi)
+  - [Adjusting TGI parameters](#adjusting-tgi-parameters)
+  - [Running TGI with FP8 precision](#running-tgi-with-fp8-precision)
+  - [Currently supported configurations](#currently-supported-configurations)
+    - [LLama 7b BF16 on 1 Gaudi2 card](#llama-7b-bf16-on-1-gaudi2-card)
+    - [LLama 7b FP8 on 1 Gaudi2 card](#llama-7b-fp8-on-1-gaudi2-card)
+    - [LLama 70b BF16 on 8 Gaudi2 card](#llama-70b-bf16-on-8-gaudi2-card)
+    - [LLama 70b FP8 on 8 Gaudi2 card](#llama-70b-fp8-on-8-gaudi2-card)
+    - [LLava-next 7B BF16 on 1 Gaudi2 card](#llava-next-7b-bf16-on-1-gaudi2-card)
+  - [Environment variables](#environment-variables)
+  - [Profiler](#profiler)

 ## Running TGI on Gaudi

@@ -242,6 +249,32 @@ docker run -p 8080:80 \
    --num-shard 8
 ```

+### LLava-next 7B BF16 on 1 Gaudi2 card
+
+An image usually accounts for 2000 input tokens; for example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image, otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token count; this is the minimum value of `max-input-tokens`, and you can override the environment variable `BASE_IMAGE_TOKENS` to change it. The warmup will generate graphs with input lengths from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For LLava-next 7B, `max-batch-prefill-tokens` is set to 16384, which is calculated from the relation `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+hf_token=YOUR_ACCESS_TOKEN # HF access token
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   -v $volume:/data \
+   -e HABANA_VISIBLE_DEVICES=all \
+   -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+   -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+   -e HUGGING_FACE_HUB_TOKEN=$hf_token \
+   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   --cap-add=sys_nice \
+   --ipc=host \
+   ghcr.io/huggingface/tgi-gaudi:2.0.1 \
+   --model-id $model \
+   --max-input-tokens 4096 \
+   --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192
+```
+
 Please note that the model warmup can take several minutes, especially for FP8 configs. To minimize this time in consecutive runs, please refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy).

 Other sequence lengths can be used with proportionally decreased/increased batch size (the higher sequence length, the lower batch size).
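The `prefill_batch_size` relation quoted in the new README section is easy to sanity-check before launching the container. A small illustrative sketch using the values from the example command (the shell variable names are ours, not TGI options):

```bash
base_image_tokens=2048          # default BASE_IMAGE_TOKENS
max_input_tokens=4096           # --max-input-tokens
max_batch_prefill_tokens=16384  # --max-batch-prefill-tokens

# prefill_batch_size = max-batch-prefill-tokens / max-input-tokens
prefill_batch_size=$(( max_batch_prefill_tokens / max_input_tokens ))
echo "prefill batch size: ${prefill_batch_size}"  # prints 4

# max-input-tokens must not be smaller than BASE_IMAGE_TOKENS,
# otherwise the image tokens may be truncated.
if (( max_input_tokens < base_image_tokens )); then
    echo "warning: max-input-tokens < BASE_IMAGE_TOKENS" >&2
fi
```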
server/poetry.lock (generated) | 1776
File diff suppressed because it is too large.

server/pyproject.toml

@@ -14,7 +14,7 @@ grpcio = "^1.51.1"
 grpcio-status = "*"
 grpcio-reflection = "*"
 grpc-interceptor = "^0.15.0"
-typer = "^0.6.1"
+typer = "^0.7.0"
 loguru = "^0.6.0"
 opentelemetry-api = "^1.15.0"
 opentelemetry-exporter-otlp = "^1.15.0"
@@ -22,10 +22,10 @@ opentelemetry-instrumentation-grpc = "^0.36b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 peft = "^0.10"
-optimum-habana = "1.12.0"
-transformers = "4.40.2"
+optimum-habana = "1.13.1"
+transformers = "4.43.4"
 numpy = "1.26.4"
-accelerate = "0.27.2"
+accelerate = "0.33.0"
 outlines= { version = "^0.0.36", optional = true }
 prometheus-client = "^0.20.0"
 py-cpuinfo = "^9.0.0"
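The `typer`, `optimum-habana`, `transformers`, and `accelerate` pins above move in lockstep with `server/poetry.lock` and `server/requirements.txt` below. A hedged sketch of the usual Poetry workflow for regenerating both pinned files after editing `pyproject.toml` (the exact commands this repo uses are not shown in the commit):

```bash
cd server
poetry lock  # refresh poetry.lock from pyproject.toml
poetry export -f requirements.txt --without-hashes -o requirements.txt
```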
server/requirements.txt

@@ -1,37 +1,37 @@
-accelerate==0.27.2 ; python_version >= "3.9" and python_version < "3.13"
-aiohttp==3.9.5 ; python_version >= "3.9" and python_version < "3.13"
+accelerate==0.33.0 ; python_version >= "3.9" and python_version < "3.13"
+aiohappyeyeballs==2.4.0 ; python_version >= "3.9" and python_version < "3.13"
+aiohttp==3.10.5 ; python_version >= "3.9" and python_version < "3.13"
 aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
 async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.11"
-attrs==23.2.0 ; python_version >= "3.9" and python_version < "3.13"
+attrs==24.2.0 ; python_version >= "3.9" and python_version < "3.13"
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
-certifi==2024.6.2 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2024.7.4 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
 coloredlogs==15.0.1 ; python_version >= "3.9" and python_version < "3.13"
-datasets==2.19.2 ; python_version >= "3.9" and python_version < "3.13"
+datasets==2.21.0 ; python_version >= "3.9" and python_version < "3.13"
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
-diffusers==0.26.3 ; python_version >= "3.9" and python_version < "3.13"
+diffusers==0.29.2 ; python_version >= "3.9" and python_version < "3.13"
 dill==0.3.8 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.15.4 ; python_version >= "3.9" and python_version < "3.13"
 frozenlist==1.4.1 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
-fsspec[http]==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.63.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec[http]==2024.6.1 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.63.2 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.48.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.48.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.64.1 ; python_version >= "3.9" and python_version < "3.13"
-hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.22.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.66.0 ; python_version >= "3.9" and python_version < "3.13"
+hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.24.6 ; python_version >= "3.9" and python_version < "3.13"
 humanfriendly==10.0 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
-importlib-metadata==7.2.1 ; python_version >= "3.9" and python_version < "3.13"
-intel-openmp==2021.4.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows"
+idna==3.8 ; python_version >= "3.9" and python_version < "3.13"
+importlib-metadata==8.4.0 ; python_version >= "3.9" and python_version < "3.13"
 jinja2==3.1.4 ; python_version >= "3.9" and python_version < "3.13"
 joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.13"
-mkl==2021.4.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
 multidict==6.0.5 ; python_version >= "3.9" and python_version < "3.13"
 multiprocess==0.70.16 ; python_version >= "3.9" and python_version < "3.13"
@@ -46,40 +46,43 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.12.0 ; python_version >= "3.9" and python_version < "3.13"
-optimum==1.20.0 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+optimum==1.21.4 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.1 ; python_version >= "3.9" and python_version < "3.13"
 pandas==2.2.2 ; python_version >= "3.9" and python_version < "3.13"
 peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.3.0 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.4.0 ; python_version >= "3.9" and python_version < "3.13"
 prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==3.20.3 ; python_version >= "3.9" and python_version < "3.13"
 psutil==6.0.0 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyarrow-hotfix==0.6 ; python_version >= "3.9" and python_version < "3.13"
-pyarrow==16.1.0 ; python_version >= "3.9" and python_version < "3.13"
+pyarrow==17.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.9" and python_version < "3.13"
 python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_version < "3.13"
 pytz==2024.1 ; python_version >= "3.9" and python_version < "3.13"
-pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.7.24 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13"
-safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.4 ; python_version >= "3.9" and python_version < "3.13"
+scikit-learn==1.5.1 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+sentence-transformers[train]==3.0.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==70.1.0 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==73.0.1 ; python_version >= "3.9" and python_version < "3.13"
 six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
-sympy==1.12.1 ; python_version >= "3.9" and python_version < "3.13"
-tbb==2021.13.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows"
+sympy==1.13.2 ; python_version >= "3.9" and python_version < "3.13"
 threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13"
-transformers[sentencepiece]==4.40.2 ; python_version >= "3.9" and python_version < "3.13"
-typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.43.4 ; python_version >= "3.9" and python_version < "3.13"
+transformers[sentencepiece]==4.43.4 ; python_version >= "3.9" and python_version < "3.13"
+triton==3.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13" and python_version >= "3.9"
+typer==0.7.0 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
 tzdata==2024.1 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.2 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
-xxhash==3.4.1 ; python_version >= "3.9" and python_version < "3.13"
+xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13"
 yarl==1.9.4 ; python_version >= "3.9" and python_version < "3.13"
-zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
+zipp==3.20.0 ; python_version >= "3.9" and python_version < "3.13"
server/text_generation_server/models/vlm_causal_lm.py

@@ -765,7 +765,7 @@ class VlmCausalLM(Model):
             "past_key_values": past_key_values,
             "token_idx": token_idx,
             "pixel_values": pixel_values,
-            "image_sizes": image_sizes
+            "image_sizes": image_sizes,
         }

         hpu_kwargs = {}
@@ -1110,7 +1110,6 @@ class VlmCausalLM(Model):
                 f"You need to decrease `--max-batch-prefill-tokens`"
             )

-        self.model.clear_inputs()
         global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST
         max_input_length = batches[0].input_ids.shape[1]
         max_prefill_batch_size = batches[0].input_ids.shape[0]
@@ -1163,7 +1162,6 @@ class VlmCausalLM(Model):
                 f"Memory stats: {mem_stats} "
             )

-        self.model.clear_inputs()
         max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS)
         batch_size = max_prefill_batch_size * 2
         # Decode warmup with bigger batch_size
@@ -1212,5 +1210,4 @@ class VlmCausalLM(Model):
                 f"Memory stats: {mem_stats}"
             )

-        self.model.clear_inputs()
         return MAX_BATCH_TOTAL_TOKENS