From 23bc38b10d06f8cc271d086c26270976faf67cc2 Mon Sep 17 00:00:00 2001
From: drbh
Date: Thu, 19 Dec 2024 16:55:17 -0500
Subject: [PATCH 1/3] fix: include add_special_tokens in kserve request (#2859)

merging as this patch is already used, and fully limit to the kserve feature
---
 router/src/kserve.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/router/src/kserve.rs b/router/src/kserve.rs
index c53fa481..ea85eb8c 100644
--- a/router/src/kserve.rs
+++ b/router/src/kserve.rs
@@ -205,6 +205,7 @@ pub async fn kserve_model_infer(
             let generate_request = GenerateRequest {
                 inputs: str_input.to_string(),
                 parameters: payload.parameters.clone(),
+                add_special_tokens: true,
             };
             let infer = infer.clone();
             let compute_type = compute_type.clone();
@@ -212,7 +213,7 @@
             async move {
                 generate_internal(infer, compute_type, Json(generate_request), span)
                     .await
-                    .map(|(_, Json(generation))| {
+                    .map(|(_, _, Json(generation))| {
                         let generation_as_bytes = generation.generated_text.as_bytes().to_vec();
                         OutputChunk {
                             name: output.name.clone(),

From d37a43e58189e5556b9ed73e067db4a8d03191ef Mon Sep 17 00:00:00 2001
From: Ruida Zeng <31152346+ruidazeng@users.noreply.github.com>
Date: Thu, 9 Jan 2025 03:09:23 -0600
Subject: [PATCH 2/3] chore: fixed some typos and attribute issues in README (#2891)

* chore: fixed html repeated attribute in README

* chore: fix minor grammar/capitalization

* chore: fixed spelling mistakes in README
---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 6d3a9b12..31966ddb 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 <div align="center">
 
 <a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
-  <img width=560 width=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
+  <img width=560 height=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
 </a>
 
 # Text Generation Inference
@@ -141,8 +141,8 @@ You have the option to utilize the `HF_TOKEN` environment variable for configuri
 For example, if you want to serve the gated Llama V2 model variants:
 
 1. Go to https://huggingface.co/settings/tokens
-2. Copy your cli READ token
-3. Export `HF_TOKEN=<your cli READ token>`
+2. Copy your CLI READ token
+3. Export `HF_TOKEN=<your CLI READ token>`
 
 or with Docker:
 
@@ -157,7 +157,7 @@ docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/da
 ### A note on Shared Memory (shm)
 
 [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
-`PyTorch` to do distributed training/inference. `text-generation-inference` make
+`PyTorch` to do distributed training/inference. `text-generation-inference` makes
 use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
 
 In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
@@ -196,7 +196,7 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T
 
 You can also opt to install `text-generation-inference` locally.
 
-First clone the repository and change directoy into it:
+First clone the repository and change directory into it:
 
 ```shell
 git clone https://github.com/huggingface/text-generation-inference
@@ -213,7 +213,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 conda create -n text-generation-inference python=3.11
 conda activate text-generation-inference
 
-#using pyton venv
+#using python venv
 python3 -m venv .venv
 source .venv/bin/activate
 ```

From afb6c728d8df41d3552395b64d5743133d56db93 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Thu, 9 Jan 2025 17:11:03 +0800
Subject: [PATCH 3/3] update ipex xpu to fix issue in ARC770 (#2884)

* update ipex xpu to fix issue in ARC770

Signed-off-by: Wang, Yi A

* add ats support

Signed-off-by: Wang, Yi A

---------

Signed-off-by: Wang, Yi A
---
 Dockerfile_intel | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Dockerfile_intel b/Dockerfile_intel
index 720d7bee..3b5e4a13 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -100,7 +100,6 @@ WORKDIR /usr/src
 RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir
 RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir
 RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir
-RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp311-cp311-linux_x86_64.whl --no-cache-dir
 RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir
 
 RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
@@ -119,6 +118,9 @@ ENV CCL_ZE_IPC_EXCHANGE=sockets
 #ENV TORCH_LLM_ALLREDUCE=1
 #ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
 
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 033af6f63745ac748cccdadee5c6140c7971edf6
+RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
+
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router
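
The Dockerfile_intel change above drops the prebuilt `intel_extension_for_pytorch` wheel and instead builds IPEX from source at a pinned commit, with ahead-of-time kernels for `pvc` and `ats-m150` selected through `USE_AOT_DEVLIST`. As a quick sanity check of an image built from this Dockerfile, something along the following lines can confirm that the XPU backend is visible and that kernels actually run on an ARC770 / ATS-M class GPU. This is an illustrative sketch, not part of the patch series; the script name is made up, and it assumes the image's Python 3.11 environment exposes the standard `torch.xpu` API once `intel_extension_for_pytorch` imports cleanly.

```python
# sanity_check_xpu.py - illustrative check, not part of the patch series.
# Assumes a container built from Dockerfile_intel, where torch and the
# source-built intel_extension_for_pytorch are installed for Python 3.11.
import torch
import intel_extension_for_pytorch as ipex  # registers the XPU backend with torch

print(f"torch {torch.__version__}, ipex {ipex.__version__}")

if not torch.xpu.is_available():
    raise SystemExit("No XPU device visible - check driver / Level Zero setup")

for i in range(torch.xpu.device_count()):
    # Should report e.g. an Arc A770 or ATS-M150 when the AOT build matches the GPU
    print(f"xpu:{i} -> {torch.xpu.get_device_name(i)}")

# Tiny matmul on the XPU to confirm that compiled kernels actually execute
x = torch.randn(64, 64, device="xpu")
y = (x @ x).sum()
print("matmul ok:", y.item())
```

Inside the container this would typically be run with the Intel GPU device nodes passed through (for example `docker run --device /dev/dri ... python sanity_check_xpu.py`); without a visible device the check exits early.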