Merge branch 'main' into flash_decoding

This commit is contained in:
Wang, Yi 2025-01-09 21:27:43 +08:00 committed by GitHub
commit 068520749c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 10 additions and 9 deletions

View File

@ -118,8 +118,8 @@ ENV CCL_ZE_IPC_EXCHANGE=sockets
#ENV TORCH_LLM_ALLREDUCE=1 #ENV TORCH_LLM_ALLREDUCE=1
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 #ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout c3e14caf792ad04824dd921e2fc3f16fca0d462e RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 033af6f63745ac748cccdadee5c6140c7971edf6
RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
# Install benchmarker # Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

View File

@ -1,7 +1,7 @@
<div align="center"> <div align="center">
<a href="https://www.youtube.com/watch?v=jlMAX2Oaht0"> <a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
<img width=560 width=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png"> <img width=560 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
</a> </a>
# Text Generation Inference # Text Generation Inference
@ -141,8 +141,8 @@ You have the option to utilize the `HF_TOKEN` environment variable for configuri
For example, if you want to serve the gated Llama V2 model variants: For example, if you want to serve the gated Llama V2 model variants:
1. Go to https://huggingface.co/settings/tokens 1. Go to https://huggingface.co/settings/tokens
2. Copy your cli READ token 2. Copy your CLI READ token
3. Export `HF_TOKEN=<your cli READ token>` 3. Export `HF_TOKEN=<your CLI READ token>`
or with Docker: or with Docker:
@ -157,7 +157,7 @@ docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/da
### A note on Shared Memory (shm) ### A note on Shared Memory (shm)
[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
`PyTorch` to do distributed training/inference. `text-generation-inference` make `PyTorch` to do distributed training/inference. `text-generation-inference` makes
use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models. use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
@ -196,7 +196,7 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T
You can also opt to install `text-generation-inference` locally. You can also opt to install `text-generation-inference` locally.
First clone the repository and change directoy into it: First clone the repository and change directory into it:
```shell ```shell
git clone https://github.com/huggingface/text-generation-inference git clone https://github.com/huggingface/text-generation-inference
@ -213,7 +213,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
conda create -n text-generation-inference python=3.11 conda create -n text-generation-inference python=3.11
conda activate text-generation-inference conda activate text-generation-inference
#using pyton venv #using python venv
python3 -m venv .venv python3 -m venv .venv
source .venv/bin/activate source .venv/bin/activate
``` ```

View File

@ -205,6 +205,7 @@ pub async fn kserve_model_infer(
let generate_request = GenerateRequest { let generate_request = GenerateRequest {
inputs: str_input.to_string(), inputs: str_input.to_string(),
parameters: payload.parameters.clone(), parameters: payload.parameters.clone(),
add_special_tokens: true,
}; };
let infer = infer.clone(); let infer = infer.clone();
let compute_type = compute_type.clone(); let compute_type = compute_type.clone();
@ -212,7 +213,7 @@ pub async fn kserve_model_infer(
async move { async move {
generate_internal(infer, compute_type, Json(generate_request), span) generate_internal(infer, compute_type, Json(generate_request), span)
.await .await
.map(|(_, Json(generation))| { .map(|(_, _, Json(generation))| {
let generation_as_bytes = generation.generated_text.as_bytes().to_vec(); let generation_as_bytes = generation.generated_text.as_bytes().to_vec();
OutputChunk { OutputChunk {
name: output.name.clone(), name: output.name.clone(),