Add missing 2.3.0 modifications that were dropped during conflict resolution

Signed-off-by: yuanwu <yuan.wu@intel.com>
Author: yuanwu
Date: 2024-09-25 07:49:49 +00:00
Parent: 514a5a737d
Commit: 14fdc4ae5e

26 changed files with 2946 additions and 1094 deletions

.github/workflows/autodocs.yaml (new file)

@@ -0,0 +1,45 @@
name: Automatic Documentation for Launcher
on:
pull_request:
jobs:
update_docs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Set up Rust
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
- name: Install Protocol Buffers compiler
run: |
sudo apt-get update
sudo apt-get install -y protobuf-compiler libprotobuf-dev
- name: Install Launcher
id: install-launcher
run: cargo install --path launcher/
- name: Install router
id: install-router
run: cargo install --path backends/v3/
- uses: actions/setup-node@v4
with:
node-version: 22
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Check that documentation is up-to-date
run: |
npm install -g @redocly/cli
python update_doc.py --check

.github/workflows/build.yaml (new file)

@@ -0,0 +1,191 @@
name: Build and push docker image to internal registry
on:
workflow_call:
inputs:
hardware:
type: string
description: Hardware
# options:
# - cuda
# - rocm
# - intel
required: true
release-tests:
description: "Run release integration tests"
required: true
default: false
type: boolean
jobs:
build-and-push:
outputs:
docker_image: ${{ steps.final.outputs.docker_image }}
docker_devices: ${{ steps.final.outputs.docker_devices }}
runs_on: ${{ steps.final.outputs.runs_on }}
label: ${{ steps.final.outputs.label }}
concurrency:
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-highmemory-32-plus-priv
permissions:
contents: write
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Construct hardware variables
shell: bash
run: |
case ${{ inputs.hardware }} in
cuda)
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="aws-g6-12xl-plus-priv-cache"
export platform=""
;;
rocm)
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
export docker_devices="/dev/kfd,/dev/dri"
# TODO Re-enable when they pass.
# export runs_on="amd-gpu-tgi"
export runs_on="ubuntu-latest"
export platform=""
;;
intel-xpu)
export dockerfile="Dockerfile_intel"
export label_extension="-intel-xpu"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform="xpu"
;;
intel-cpu)
export dockerfile="Dockerfile_intel"
export label_extension="-intel-cpu"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform="cpu"
;;
esac
echo $dockerfile
echo "Dockerfile=${dockerfile}"
echo $label_extension
echo $docker_devices
echo $runs_on
echo $platform
echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
echo "LABEL=${label_extension}" >> $GITHUB_ENV
echo "PLATFORM=${platform}" >> $GITHUB_ENV
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
- name: Initialize Docker Buildx
uses: docker/setup-buildx-action@v3
with:
install: true
buildkitd-config: /tmp/buildkitd.toml
- name: Login to internal Container Registry
uses: docker/login-action@v3
with:
username: ${{ secrets.REGISTRY_USERNAME }}
password: ${{ secrets.REGISTRY_PASSWORD }}
registry: registry.internal.huggingface.tech
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Azure Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
# If pull request
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name == 'pull_request' }}
id: meta-pr
uses: docker/metadata-action@v5
with:
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
id: meta
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=auto
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
type=semver,pattern={{version}}${{ env.LABEL }}
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: ${{ env.DOCKERFILE }}
push: true
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
PLATFORM=${{ env.PLATFORM }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
- name: Final
id: final
run: |
echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
integration_tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: build-and-push
runs-on:
group: ${{ needs.build-and-push.outputs.runs_on }}
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install
run: |
make install-integration-tests
- name: Run tests
run: |
export DOCKER_VOLUME=/mnt/cache
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
export HF_TOKEN=${{ secrets.HF_TOKEN }}
echo $DOCKER_IMAGE
pytest -s -vv integration-tests ${PYTEST_FLAGS}

@@ -0,0 +1,20 @@
name: Build documentation
on:
push:
paths:
- "docs/source/**"
branches:
- main
- doc-builder*
- v*-release
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
with:
commit_sha: ${{ github.sha }}
package: text-generation-inference
additional_args: --not_python_module
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

@@ -0,0 +1,19 @@
name: Build PR Documentation
on:
pull_request:
paths:
- "docs/source/**"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: text-generation-inference
additional_args: --not_python_module

.github/workflows/client-tests.yaml (new file)

@@ -0,0 +1,26 @@
name: Python Client Tests
on:
pull_request:
paths:
- ".github/workflows/client-tests.yaml"
- "clients/python/**"
jobs:
run_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.9
- name: Install
run: |
cd clients/python && pip install .
- name: Run tests
run: |
pip install pytest pytest-asyncio
export HF_TOKEN=${{ secrets.HF_TOKEN }}
make python-client-tests

.github/workflows/load_test.yaml (new file)

@@ -0,0 +1,43 @@
name: Nightly load test
on:
schedule:
- cron: '0 0 * * 1-5'
pull_request:
paths:
- ".github/workflows/load_test.yaml"
branches:
- 'main'
jobs:
load-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-g5-12xlarge
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install k6
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
- name: Start starcoder
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
- name: Run k6
run: |
./k6 run load_tests/starcoder_load.js
- name: Stop starcoder
if: ${{ always() }}
run: |
docker stop tgi-starcoder || true

@@ -1,56 +0,0 @@
name: Build and push docker image to Github registry
on:
workflow_dispatch:
inputs:
tag:
description: 'Tag for the Docker image:'
required: true
jobs:
build-and-push:
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: true
runs-on: ubuntu-latest
permissions:
contents: write
packages: write
# This is used to complete the identity challenge
# with sigstore/fulcio when running outside of PRs.
id-token: write
security-events: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Initialize Docker Buildx
uses: docker/setup-buildx-action@v3
with:
install: true
config-inline: |
[registry."docker.io"]
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=true
images: ghcr.io/huggingface/tgi-gaudi
tags: |
type=raw,value=${{ github.event.inputs.tag }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
platforms: 'linux/amd64'
tags: ${{ steps.meta.outputs.tags }}

.github/workflows/stale.yaml (new file)

@@ -0,0 +1,14 @@
name: 'Close stale issues and PRs'
on:
schedule:
- cron: '30 1 * * *'
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v8
with:
stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
days-before-stale: 30
days-before-close: 5

.github/workflows/tests.yaml (new file)

@@ -0,0 +1,60 @@
name: Server Tests
on:
pull_request:
paths:
- ".github/workflows/tests.yaml"
- "server/**"
- "proto/**"
- "router/**"
- "launcher/**"
- "Cargo.lock"
- "rust-toolchain.toml"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
run_tests:
runs-on:
group: aws-highmemory-32-plus-priv
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v4
id: python
with:
python-version: 3.11
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
# Released on: 02 May, 2024
# https://releases.rs/docs/1.78.0/
toolchain: 1.80.0
override: true
components: rustfmt, clippy
- name: Install Protoc
uses: arduino/setup-protoc@v1
- name: Clean unused files
run: |
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
- name: Install
run: |
sudo apt update
sudo apt install python3.11-dev -y
make install-cpu
- name: Run server tests
run: |
pip install pytest
export HF_TOKEN=${{ secrets.HF_TOKEN }}
pytest -s -vv server/tests
- name: Pre-commit checks
run: |
pip install pre-commit
pre-commit install
pre-commit run --all-files
- name: Run Rust tests
run: |
cargo test

@@ -0,0 +1,16 @@
name: Upload PR Documentation
on:
workflow_run:
workflows: ["Build PR Documentation"]
types:
- completed
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: text-generation-inference
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

Cargo.lock (generated)
@ -589,14 +589,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
[[package]] [[package]]
name = "cc" name = "cast"
version = "1.0.99" version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "castaway"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5"
dependencies = [
"rustversion",
]
[[package]]
name = "cc"
version = "1.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b62ac837cdb5cb22e10a256099b4fc502b1dfe560cb282963a974d7abd80e476"
dependencies = [ dependencies = [
"jobserver", "jobserver",
"libc", "libc",
"once_cell", "shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
] ]
[[package]] [[package]]
@ -621,6 +645,34 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "clap"
version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [
"bitflags 1.3.2",
"textwrap",
"unicode-width",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.17" version = "4.5.17"
@ -645,21 +697,40 @@ dependencies = [
[[package]] [[package]]
name = "clap_derive" name = "clap_derive"
version = "4.5.5" version = "4.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6" checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
dependencies = [ dependencies = [
"heck 0.5.0", "heck 0.5.0",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.66", "syn 2.0.77",
] ]
[[package]] [[package]]
name = "clap_lex" name = "clap_lex"
version = "0.7.1" version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "cmake"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
dependencies = [
"cc",
]
[[package]]
name = "codespan-reporting"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
dependencies = [
"termcolor",
"unicode-width",
]
[[package]] [[package]]
name = "color_quant" name = "color_quant"
@ -1027,15 +1098,10 @@ dependencies = [
] ]
[[package]] [[package]]
name = "displaydoc" name = "dunce"
version = "0.2.4" version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
[[package]] [[package]]
name = "easy-cast" name = "easy-cast"
@ -1349,18 +1415,24 @@ dependencies = [
[[package]] [[package]]
name = "gimli" name = "gimli"
version = "0.29.0" version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
[[package]]
name = "glob"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]] [[package]]
name = "grpc-metadata" name = "grpc-metadata"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"opentelemetry", "opentelemetry 0.20.0",
"tonic 0.10.2", "tonic 0.10.2",
"tracing", "tracing",
"tracing-opentelemetry", "tracing-opentelemetry 0.21.0",
] ]
[[package]] [[package]]
@ -1559,9 +1631,9 @@ dependencies = [
[[package]] [[package]]
name = "httparse" name = "httparse"
version = "1.9.3" version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0e7a4dd27b9476dc40cb050d3632d3bba3a70ddbff012285f7f8559a1e7e545" checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
[[package]] [[package]]
name = "httpdate" name = "httpdate"
@ -1579,7 +1651,7 @@ dependencies = [
"futures-channel", "futures-channel",
"futures-core", "futures-core",
"futures-util", "futures-util",
"h2", "h2 0.3.26",
"http 0.2.12", "http 0.2.12",
"http-body 0.4.6", "http-body 0.4.6",
"httparse", "httparse",
@ -1652,128 +1724,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [ dependencies = [
"bytes", "bytes",
"hyper", "hyper 0.14.30",
"native-tls", "native-tls",
"tokio", "tokio",
"tokio-native-tls", "tokio-native-tls",
] ]
[[package]] [[package]]
name = "icu_collections" name = "hyper-util"
version = "1.5.0" version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba"
dependencies = [ dependencies = [
"displaydoc", "bytes",
"yoke", "futures-channel",
"zerofrom", "futures-util",
"zerovec", "http 1.1.0",
] "http-body 1.0.1",
"hyper 1.4.1",
[[package]] "pin-project-lite",
name = "icu_locid" "socket2",
version = "1.5.0" "tokio",
source = "registry+https://github.com/rust-lang/crates.io-index" "tower",
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" "tower-service",
dependencies = [ "tracing",
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_properties"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.76",
] ]
[[package]] [[package]]
@ -1784,14 +1758,12 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]] [[package]]
name = "idna" name = "idna"
version = "1.0.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed" checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
dependencies = [ dependencies = [
"icu_normalizer", "unicode-bidi",
"icu_properties", "unicode-normalization",
"smallvec",
"utf8_iter",
] ]
[[package]] [[package]]
@ -1879,11 +1851,21 @@ version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17" checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17"
dependencies = [ dependencies = [
"opentelemetry", "opentelemetry 0.20.0",
"opentelemetry-otlp", "opentelemetry-otlp",
"thiserror", "thiserror",
"tracing", "tracing",
"tracing-opentelemetry", "tracing-opentelemetry 0.21.0",
]
[[package]]
name = "instability"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c"
dependencies = [
"quote",
"syn 2.0.77",
] ]
[[package]] [[package]]
@ -1903,14 +1885,14 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.66", "syn 2.0.77",
] ]
[[package]] [[package]]
name = "ipnet" name = "ipnet"
version = "2.9.0" version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4"
[[package]] [[package]]
name = "is_terminal_polyfill" name = "is_terminal_polyfill"
@ -2099,12 +2081,6 @@ version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "litemap"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.12" version = "0.4.12"
@ -3134,8 +3110,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [ dependencies = [
"bytes", "bytes",
"heck 0.4.1", "heck 0.5.0",
"itertools 0.10.5", "itertools 0.12.1",
"log", "log",
"multimap", "multimap",
"once_cell", "once_cell",
@ -3168,10 +3144,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"itertools 0.10.5", "itertools 0.12.1",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.66", "syn 2.0.77",
] ]
[[package]] [[package]]
@ -3427,11 +3403,11 @@ dependencies = [
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.5.2" version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853"
dependencies = [ dependencies = [
"bitflags 2.5.0", "bitflags 2.6.0",
] ]
[[package]] [[package]]
@ -3581,23 +3557,22 @@ dependencies = [
[[package]] [[package]]
name = "rust-embed-impl" name = "rust-embed-impl"
version = "6.8.1" version = "8.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49b94b81e5b2c284684141a2fb9e2a31be90638caf040bf9afbc5a0416afe1ac" checksum = "6125dbc8867951125eec87294137f4e9c2c96566e61bf72c45095a7c77761478"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"rust-embed-utils", "rust-embed-utils",
"shellexpand", "syn 2.0.77",
"syn 2.0.66",
"walkdir", "walkdir",
] ]
[[package]] [[package]]
name = "rust-embed-utils" name = "rust-embed-utils"
version = "7.8.1" version = "8.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d38ff6bf570dc3bb7100fce9f7b60c33fa71d80e88da3f2580df4ff2bdded74" checksum = "2e5347777e9aacb56039b0e1f28785929a8a3b709e87482e7442c72e7c12529d"
dependencies = [ dependencies = [
"sha2", "sha2",
"walkdir", "walkdir",
@ -4024,10 +3999,10 @@ dependencies = [
] ]
[[package]] [[package]]
name = "stable_deref_trait" name = "static_assertions"
version = "1.2.0" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]] [[package]]
name = "strsim" name = "strsim"
@ -4092,15 +4067,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]] [[package]]
name = "synstructure" name = "sync_wrapper"
version = "0.13.1" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
[[package]] [[package]]
name = "sysinfo" name = "sysinfo"
@ -4194,12 +4164,44 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "termcolor"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
dependencies = [
"winapi-util",
]
[[package]]
name = "text-generation-backends-trtllm"
version = "2.2.1-dev0"
dependencies = [
"async-stream",
"async-trait",
"clap 4.5.17",
"cmake",
"cxx",
"cxx-build",
"log",
"parking_lot",
"pkg-config",
"text-generation-router",
"thiserror",
"tokenizers 0.19.1",
"tokio",
"tokio-stream",
"tracing",
"tracing-opentelemetry 0.24.0",
"tracing-subscriber",
]
[[package]] [[package]]
name = "text-generation-benchmark" name = "text-generation-benchmark"
version = "2.0.4" version = "2.2.1-dev0"
dependencies = [ dependencies = [
"average", "average",
"clap", "clap 4.5.17",
"crossterm", "crossterm",
"float-ord", "float-ord",
"hf-hub", "hf-hub",
@ -4217,13 +4219,14 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-client" name = "text-generation-client"
version = "2.0.4" version = "2.2.1-dev0"
dependencies = [ dependencies = [
"async-trait",
"base64 0.22.1",
"futures", "futures",
"grpc-metadata", "grpc-metadata",
"prost 0.12.6", "prost 0.12.6",
"prost-build", "prost-build",
"rand",
"thiserror", "thiserror",
"tokio", "tokio",
"tonic 0.10.2", "tonic 0.10.2",
@ -4234,13 +4237,13 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-launcher" name = "text-generation-launcher"
version = "2.0.4" version = "2.2.1-dev0"
dependencies = [ dependencies = [
"clap", "clap 4.5.17",
"ctrlc", "ctrlc",
"float_eq", "float_eq",
"hf-hub", "hf-hub",
"nix", "nix 0.28.0",
"once_cell", "once_cell",
"reqwest", "reqwest",
"serde", "serde",
@ -4253,13 +4256,14 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-router" name = "text-generation-router"
version = "2.0.4" version = "2.2.1-dev0"
dependencies = [ dependencies = [
"async-stream", "async-stream",
"axum", "async-trait",
"axum 0.7.5",
"axum-tracing-opentelemetry", "axum-tracing-opentelemetry",
"base64 0.22.1", "base64 0.22.1",
"clap", "clap 4.5.17",
"csv", "csv",
"futures", "futures",
"futures-util", "futures-util",
@ -4434,15 +4438,30 @@ dependencies = [
] ]
[[package]] [[package]]
name = "tinystr" name = "tinytemplate"
version = "0.7.6" version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [ dependencies = [
"displaydoc", "serde",
"zerovec", "serde_json",
] ]
[[package]]
name = "tinyvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]] [[package]]
name = "tokenizers" name = "tokenizers"
version = "0.19.1" version = "0.19.1"
@ -4675,13 +4694,13 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e"
dependencies = [ dependencies = [
"async-stream", "async-stream",
"async-trait", "async-trait",
"axum", "axum 0.6.20",
"base64 0.21.7", "base64 0.21.7",
"bytes", "bytes",
"h2", "h2 0.3.26",
"http", "http 0.2.12",
"http-body", "http-body 0.4.6",
"hyper", "hyper 0.14.30",
"hyper-timeout", "hyper-timeout",
"percent-encoding", "percent-encoding",
"pin-project", "pin-project",
@ -4927,10 +4946,25 @@ dependencies = [
] ]
[[package]] [[package]]
name = "unicode-ident" name = "unicode-bidi"
version = "1.0.12" version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-normalization"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
dependencies = [
"tinyvec",
]
[[package]] [[package]]
name = "unicode-normalization-alignments" name = "unicode-normalization-alignments"
@ -5010,9 +5044,9 @@ dependencies = [
[[package]] [[package]]
name = "url" name = "url"
version = "2.5.1" version = "2.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56" checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
dependencies = [ dependencies = [
"form_urlencoded", "form_urlencoded",
"idna", "idna",
@ -5025,18 +5059,6 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.2" version = "0.2.2"
@ -5057,9 +5079,9 @@ dependencies = [
[[package]] [[package]]
name = "utoipa-gen" name = "utoipa-gen"
version = "3.5.0" version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05d96dcd6fc96f3df9b3280ef480770af1b7c5d14bc55192baa9b067976d920c" checksum = "7bf0e16c02bc4bf5322ab65f10ab1149bdbcaa782cba66dc7057370a3f8190be"
dependencies = [ dependencies = [
"proc-macro-error", "proc-macro-error",
"proc-macro2", "proc-macro2",
@ -5070,11 +5092,11 @@ dependencies = [
[[package]] [[package]]
name = "utoipa-swagger-ui" name = "utoipa-swagger-ui"
version = "3.1.5" version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84614caa239fb25b2bb373a52859ffd94605ceb256eeb1d63436325cf81e3653" checksum = "0b39868d43c011961e04b41623e050aedf2cc93652562ff7935ce0f819aaf2da"
dependencies = [ dependencies = [
"axum", "axum 0.7.5",
"mime_guess", "mime_guess",
"regex", "regex",
"rust-embed", "rust-embed",
@ -5291,9 +5313,9 @@ dependencies = [
[[package]] [[package]]
name = "webpki-roots" name = "webpki-roots"
version = "0.26.2" version = "0.26.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c452ad30530b54a4d8e71952716a212b08efd0f3562baa66c29a618b07da7c3" checksum = "0bd24728e5af82c6c4ec1b66ac4844bdf8156257fccda846ec58b42cd0cdbe6a"
dependencies = [ dependencies = [
"rustls-pki-types", "rustls-pki-types",
] ]
@ -5599,70 +5621,14 @@ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
[[package]]
name = "yoke"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6"
dependencies = [
"byteorder",
"zerocopy-derive 0.6.6",
]
[[package]] [[package]]
name = "zerocopy" name = "zerocopy"
version = "0.7.35" version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [ dependencies = [
"zerocopy-derive 0.7.35", "byteorder",
] "zerocopy-derive",
[[package]]
name = "zerocopy-derive"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.77",
] ]
[[package]] [[package]]
@ -5673,28 +5639,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 2.0.76", "syn 2.0.77",
]
[[package]]
name = "zerofrom"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
"synstructure",
] ]
[[package]] [[package]]
@ -5703,28 +5648,6 @@ version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
[[package]]
name = "zerovec"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
[[package]] [[package]]
name = "zip" name = "zip"
version = "0.6.6" version = "0.6.6"


@@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
-version = "2.0.4"
+version = "2.3.1-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"


@@ -1,8 +1,10 @@
# Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
WORKDIR /usr/src

-FROM chef as planner
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
@@ -25,16 +27,19 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
-COPY Cargo.lock Cargo.lock
-RUN cargo chef cook --release --recipe-path recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
+COPY backends backends
COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt

# Text Generation Inference base image
FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as base
@@ -70,14 +75,26 @@ RUN cd server && \
    pip install . --no-cache-dir

# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base
-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]


@@ -1,9 +1,8 @@
install-server:
	cd server && make install

-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server

install-router:
	cargo install --path backends/v3/
@@ -14,7 +13,10 @@ install-launcher:
install-benchmark:
	cargo install --path benchmark/

-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+install-cpu: install-server-cpu install-router install-launcher

server-dev:
	cd server && make run-dev
@@ -46,8 +48,8 @@ python-tests: python-server-tests python-client-tests
run-falcon-7b-instruct:
	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --port 8080

-run-falcon-7b-instruct-quantize:
-	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --quantize bitsandbytes --port 8080
-
clean:
	rm -rf target aml
+
+debug_image_build:
+	docker build --no-cache --progress=plain -t debug_tgi .

docs/openapi.json (2186 lines changed; diff suppressed because it is too large)

@@ -1,10 +0,0 @@
diff a/docs/openapi.json b/docs/openapi.json (rejected hunks)
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
- "version": "2.2.1-dev0"
+ "version": "2.3.1-dev0"
},
"paths": {
"/": {


@@ -1,5 +1,3 @@
-/// Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
-
use clap::{Parser, ValueEnum};
use hf_hub::{
    api::sync::{Api, ApiBuilder},
@@ -726,12 +724,13 @@ fn shard_manager(
    if let Some(dtype) = dtype {
        shard_args.push("--dtype".to_string());
-        shard_args.push(dtype.to_string());
+        shard_args.push(dtype.to_string())
    }

    // Model optional revision
    if let Some(revision) = revision {
        shard_args.push("--revision".to_string());
-        shard_args.push(revision);
+        shard_args.push(revision)
    }

    let rope = match (rope_scaling, rope_factor) {
@@ -1567,16 +1566,6 @@ fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::R
}

fn main() -> Result<(), LauncherError> {
-    match Command::new("ldconfig").spawn() {
-        Ok(_) => {}
-        Err(err) => {
-            tracing::warn!(
-                "Unable to refresh ldconfig cache. Skipping (useless in most cases). Details {:?}",
-                err
-            )
-        }
-    }
-
    // Pattern match configuration
    let args: Args = Args::parse();


@@ -224,7 +224,7 @@ message DecodeResponse {
message WarmupRequest {
    /// Batch to warmup on
-    repeated Batch batches = 1;
+    Batch batch = 1;
    uint32 max_input_length = 2;
    uint32 max_prefill_tokens = 3;
    uint32 max_total_tokens = 4;

@@ -1,647 +0,0 @@
// This file is @generated by prost-build.
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthResponse {}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoResponse {
#[prost(bool, tag = "1")]
pub requires_padding: bool,
#[prost(string, tag = "2")]
pub dtype: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
pub device_type: ::prost::alloc::string::String,
#[prost(uint32, optional, tag = "4")]
pub window_size: ::core::option::Option<u32>,
#[prost(uint32, tag = "5")]
pub speculate: u32,
}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryResponse {
/// / Other shards urls
#[prost(string, repeated, tag = "1")]
pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheRequest {
/// / Optional batch id
#[prost(uint64, optional, tag = "1")]
pub id: ::core::option::Option<u64>,
}
/// / Empty response
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheResponse {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct NextTokenChooserParameters {
/// / exponential scaling output probability distribution
#[prost(float, tag = "1")]
pub temperature: f32,
/// / restricting to the k highest probability elements
#[prost(uint32, tag = "2")]
pub top_k: u32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "3")]
pub top_p: f32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "4")]
pub typical_p: f32,
/// / apply sampling on the logits
#[prost(bool, tag = "5")]
pub do_sample: bool,
/// / random seed for sampling
#[prost(uint64, tag = "6")]
pub seed: u64,
/// / repetition penalty
#[prost(float, tag = "7")]
pub repetition_penalty: f32,
/// / frequency penalty
#[prost(float, tag = "9")]
pub frequency_penalty: f32,
/// / token watermarking using "A Watermark for Large Language Models"
#[prost(bool, tag = "8")]
pub watermark: bool,
/// / grammar (applied if not empty)
#[prost(string, tag = "10")]
pub grammar: ::prost::alloc::string::String,
/// / grammar type
#[prost(enumeration = "GrammarType", tag = "11")]
pub grammar_type: i32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StoppingCriteriaParameters {
/// / Maximum number of generated tokens
#[prost(uint32, tag = "1")]
pub max_new_tokens: u32,
/// / Optional stopping sequences
#[prost(string, repeated, tag = "2")]
pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / Ignore end of sequence token
/// / used for benchmarking
#[prost(bool, tag = "3")]
pub ignore_eos_token: bool,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Request {
/// / Request ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / The generation context
#[prost(string, tag = "2")]
pub inputs: ::prost::alloc::string::String,
/// / Context truncation
#[prost(uint32, tag = "3")]
pub truncate: u32,
/// / Next Token Chooser Parameters
#[prost(message, optional, tag = "4")]
pub parameters: ::core::option::Option<NextTokenChooserParameters>,
/// / Stopping Criteria Parameters
#[prost(message, optional, tag = "5")]
pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
/// / Return prefill logprobs
#[prost(bool, tag = "6")]
pub prefill_logprobs: bool,
/// / Return most likely n tokens
#[prost(uint32, tag = "7")]
pub top_n_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Batch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests
#[prost(message, repeated, tag = "2")]
pub requests: ::prost::alloc::vec::Vec<Request>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct CachedBatch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests ids
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GeneratedText {
/// / Output
#[prost(string, tag = "1")]
pub text: ::prost::alloc::string::String,
/// / Number of generated tokens
#[prost(uint32, tag = "2")]
pub generated_tokens: u32,
/// / Finish reason
#[prost(enumeration = "FinishReason", tag = "3")]
pub finish_reason: i32,
/// / Seed
#[prost(uint64, optional, tag = "4")]
pub seed: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Tokens {
/// / Token IDs
#[prost(uint32, repeated, tag = "1")]
pub ids: ::prost::alloc::vec::Vec<u32>,
/// / Logprobs
#[prost(float, repeated, tag = "2")]
pub logprobs: ::prost::alloc::vec::Vec<f32>,
/// / tokens
#[prost(string, repeated, tag = "3")]
pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / special
#[prost(bool, repeated, tag = "4")]
pub is_special: ::prost::alloc::vec::Vec<bool>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Generation {
/// / Request ID
#[prost(uint64, tag = "1")]
pub request_id: u64,
/// / Prefill tokens (optional)
#[prost(message, optional, tag = "2")]
pub prefill_tokens: ::core::option::Option<Tokens>,
#[prost(message, optional, tag = "3")]
pub tokens: ::core::option::Option<Tokens>,
/// / Complete generated text
#[prost(message, optional, tag = "4")]
pub generated_text: ::core::option::Option<GeneratedText>,
/// / Top tokens
#[prost(message, repeated, tag = "5")]
pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchRequest {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub batch_id: u64,
/// / Requests to keep
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchResponse {
/// / Filtered Batch (cached)
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillRequest {
/// / Batch
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillResponse {
/// / Generation
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeRequest {
/// / Cached batches
#[prost(message, repeated, tag = "1")]
pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeResponse {
/// / Decodes
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
/// / Concatenate elapsed time in nanoseconds
#[prost(uint64, optional, tag = "6")]
pub concat_ns: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupRequest {
/// / Batch to warmup on
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
#[prost(uint32, tag = "2")]
pub max_input_length: u32,
#[prost(uint32, tag = "3")]
pub max_prefill_tokens: u32,
#[prost(uint32, tag = "4")]
pub max_total_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupResponse {
/// / Maximum number of tokens supported by the model
#[prost(uint32, optional, tag = "1")]
pub max_supported_total_tokens: ::core::option::Option<u32>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum GrammarType {
None = 0,
Json = 1,
Regex = 2,
}
impl GrammarType {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
GrammarType::None => "GRAMMAR_TYPE_NONE",
GrammarType::Json => "GRAMMAR_TYPE_JSON",
GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"GRAMMAR_TYPE_NONE" => Some(Self::None),
"GRAMMAR_TYPE_JSON" => Some(Self::Json),
"GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
_ => None,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum FinishReason {
Length = 0,
EosToken = 1,
StopSequence = 2,
}
impl FinishReason {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
FinishReason::Length => "FINISH_REASON_LENGTH",
FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"FINISH_REASON_LENGTH" => Some(Self::Length),
"FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
"FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
_ => None,
}
}
}
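// --- Illustrative note (not part of the prost output) -----------------------
// Both enums above expose the stable ProtoBuf identifiers via `as_str_name`
// and accept them back through `from_str_name`; a minimal round-trip sketch:
//
//     assert_eq!(GrammarType::Json.as_str_name(), "GRAMMAR_TYPE_JSON");
//     assert_eq!(
//         FinishReason::from_str_name("FINISH_REASON_EOS_TOKEN"),
//         Some(FinishReason::EosToken)
//     );
//     assert_eq!(FinishReason::from_str_name("bogus"), None);
//     assert_eq!(FinishReason::StopSequence as i32, 2); // prost enums are i32-backed
// -----------------------------------------------------------------------------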
/// Generated client implementations.
pub mod text_generation_service_client {
#![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
use tonic::codegen::*;
use tonic::codegen::http::Uri;
#[derive(Debug, Clone)]
pub struct TextGenerationServiceClient<T> {
inner: tonic::client::Grpc<T>,
}
impl TextGenerationServiceClient<tonic::transport::Channel> {
/// Attempt to create a new client by connecting to a given endpoint.
pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
where
D: TryInto<tonic::transport::Endpoint>,
D::Error: Into<StdError>,
{
let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
Ok(Self::new(conn))
}
}
impl<T> TextGenerationServiceClient<T>
where
T: tonic::client::GrpcService<tonic::body::BoxBody>,
T::Error: Into<StdError>,
T::ResponseBody: Body<Data = Bytes> + Send + 'static,
<T::ResponseBody as Body>::Error: Into<StdError> + Send,
{
pub fn new(inner: T) -> Self {
let inner = tonic::client::Grpc::new(inner);
Self { inner }
}
pub fn with_origin(inner: T, origin: Uri) -> Self {
let inner = tonic::client::Grpc::with_origin(inner, origin);
Self { inner }
}
pub fn with_interceptor<F>(
inner: T,
interceptor: F,
) -> TextGenerationServiceClient<InterceptedService<T, F>>
where
F: tonic::service::Interceptor,
T::ResponseBody: Default,
T: tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
Response = http::Response<
<T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
>,
>,
<T as tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
>>::Error: Into<StdError> + Send + Sync,
{
TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
}
/// Compress requests with the given encoding.
///
/// This requires the server to support it otherwise it might respond with an
/// error.
#[must_use]
pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.send_compressed(encoding);
self
}
/// Enable decompressing responses.
#[must_use]
pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.accept_compressed(encoding);
self
}
/// Limits the maximum size of a decoded message.
///
/// Default: `4MB`
#[must_use]
pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_decoding_message_size(limit);
self
}
/// Limits the maximum size of an encoded message.
///
/// Default: `usize::MAX`
#[must_use]
pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
/// / Model Info
pub async fn info(
&mut self,
request: impl tonic::IntoRequest<super::InfoRequest>,
) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Info",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info"));
self.inner.unary(req, path, codec).await
}
/// / Service discovery
pub async fn service_discovery(
&mut self,
request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
) -> std::result::Result<
tonic::Response<super::ServiceDiscoveryResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ServiceDiscovery",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new(
"generate.v2.TextGenerationService",
"ServiceDiscovery",
),
);
self.inner.unary(req, path, codec).await
}
/// / Empties batch cache
pub async fn clear_cache(
&mut self,
request: impl tonic::IntoRequest<super::ClearCacheRequest>,
) -> std::result::Result<
tonic::Response<super::ClearCacheResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ClearCache",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "ClearCache"),
);
self.inner.unary(req, path, codec).await
}
/// / Remove requests from a cached batch
pub async fn filter_batch(
&mut self,
request: impl tonic::IntoRequest<super::FilterBatchRequest>,
) -> std::result::Result<
tonic::Response<super::FilterBatchResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/FilterBatch",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "FilterBatch"),
);
self.inner.unary(req, path, codec).await
}
/// / Warmup the model and compute max cache size
pub async fn warmup(
&mut self,
request: impl tonic::IntoRequest<super::WarmupRequest>,
) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Warmup",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Warmup"));
self.inner.unary(req, path, codec).await
}
/// / Prefill batch and decode first token
pub async fn prefill(
&mut self,
request: impl tonic::IntoRequest<super::PrefillRequest>,
) -> std::result::Result<
tonic::Response<super::PrefillResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Prefill",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Prefill"));
self.inner.unary(req, path, codec).await
}
/// / Decode token for a list of prefilled batches
pub async fn decode(
&mut self,
request: impl tonic::IntoRequest<super::DecodeRequest>,
) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Decode",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Decode"));
self.inner.unary(req, path, codec).await
}
/// / Health check
pub async fn health(
&mut self,
request: impl tonic::IntoRequest<super::HealthRequest>,
) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Health",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Health"));
self.inner.unary(req, path, codec).await
}
}
}
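For orientation, a minimal sketch of driving the generated client follows. The module path, the TCP endpoint and the use of tokio are assumptions for illustration only; the TGI router wires this client up differently, typically over unix sockets.

use generate::v2::text_generation_service_client::TextGenerationServiceClient;
use generate::v2::{HealthRequest, InfoRequest};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `connect` accepts anything convertible into a tonic Endpoint, e.g. a URI string.
    let mut client = TextGenerationServiceClient::connect("http://127.0.0.1:9000")
        .await?
        // Raise the default 4MB response limit before issuing large Prefill calls.
        .max_decoding_message_size(64 * 1024 * 1024);
    // Health and Info are plain unary calls; `default()` builds an empty request message.
    client.health(HealthRequest::default()).await?;
    let info = client.info(InfoRequest::default()).await?.into_inner();
    println!("{info:?}");
    Ok(())
}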

View File

@ -1,6 +0,0 @@
// This file is @generated by prost-build.
pub mod generate {
pub mod v2 {
include!("generate.v2.rs");
}
}
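The deleted shim above is the include wrapper that exposed the prost output as `generate::v2`. For reference, a build script along the following lines regenerates both `generate.v2.rs` and such a wrapper; this is a sketch only, and the proto paths, output directory and tonic-build version are assumptions rather than the repository's actual build.rs.

// build.rs (hypothetical sketch)
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Re-run code generation whenever the proto definition changes.
    println!("cargo:rerun-if-changed=../proto/generate.proto");
    tonic_build::configure()
        .build_client(true) // emit the TextGenerationServiceClient shown earlier
        .build_server(false)
        .out_dir("src/pb") // write generate.v2.rs into the source tree
        .include_file("mod.rs") // also write the `pub mod generate { pub mod v2 { ... } }` wrapper
        .compile(&["../proto/generate.proto"], &["../proto"])?;
    Ok(())
}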

View File

@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "2.0.4"
+version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]

View File

@ -9,11 +9,15 @@ from loguru import logger
 from text_generation_server.layers.exl2 import Exl2Weight
 from text_generation_server.layers.gptq import GPTQWeight
+from text_generation_server.utils.log import log_master
 try:
-    from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+    from exllamav2.ext import exllamav2_ext
+    make_q_matrix = exllamav2_ext.make_q_matrix
+    gemm_half_q_half = exllamav2_ext.gemm_half_q_half
 except ImportError:
-    logger.error("exllamav2_kernels not installed.")
+    log_master(logger.warning, "exllamav2_kernels not installed.")
     raise
 # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
@ -69,6 +73,10 @@ def ext_make_q_matrix(
     """
     Create Q matrix
     """
+    # max_dq_size = 512*(1024**2)
+    # max_dq_rows = max_dq_size // out_features[0]
+    max_dq_rows = 0
     # EXL2
     if isinstance(w, Exl2Weight):
         extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
@ -82,10 +90,12 @@
             w.q_scale_max,
             w.q_groups,
             extra.q_group_map,
-            none_tensor,
-            none_tensor,
-            none_tensor,
+            none_tensor,  # zeros
+            none_tensor,  # scales
+            none_tensor,  # g_idx
+            none_tensor,  # bias
             temp_dq,
+            max_dq_rows,
         )
     # GPTQ
     elif isinstance(w, GPTQWeight):
@ -105,29 +115,33 @@
                 w.qweight,
                 extra.q_perm,
                 extra.q_invperm,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
+                none_tensor,  # q_scale
+                none_tensor,  # q_scale_max
+                none_tensor,  # q_groups
+                none_tensor,  # q_group_map
                 w.qzeros,
                 w.scales,
                 w.g_idx.cpu(),
+                none_tensor,  # bias
                 temp_dq,
+                max_dq_rows,
             )
         # GPTQ without g_idx
         else:
             return make_q_matrix(
                 w.qweight,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
+                none_tensor,  # q_perm
+                none_tensor,  # q_invperm
+                none_tensor,  # q_scale
+                none_tensor,  # q_scale_max
+                none_tensor,  # q_groups
+                none_tensor,  # q_group_map
                 w.qzeros,
                 w.scales,
-                none_tensor,
+                none_tensor,  # g_idx
+                none_tensor,  # bias
                 temp_dq,
+                max_dq_rows,
             )
     else:
         RuntimeError("Cannot create handle")

View File

@ -206,6 +206,7 @@ def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
     output = torch.empty(
         (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
     )
+
     def grid(META):
         return (
             triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])

5
tgi-entrypoint.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/bash
ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
text-generation-launcher $@