Add some missing modification of 2.3.0 because of conflict

Signed-off-by: yuanwu <yuan.wu@intel.com>
This commit is contained in:
yuanwu 2024-09-25 07:49:49 +00:00
parent 514a5a737d
commit 14fdc4ae5e
26 changed files with 2946 additions and 1094 deletions

View File

View File

45
.github/workflows/autodocs.yaml vendored Normal file
View File

@ -0,0 +1,45 @@
name: Automatic Documentation for Launcher
on:
pull_request:
jobs:
update_docs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Set up Rust
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
- name: Install Protocol Buffers compiler
run: |
sudo apt-get update
sudo apt-get install -y protobuf-compiler libprotobuf-dev
- name: Install Launcher
id: install-launcher
run: cargo install --path launcher/
- name: Install router
id: install-router
run: cargo install --path backends/v3/
- uses: actions/setup-node@v4
with:
node-version: 22
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Check that documentation is up-to-date
run: |
npm install -g @redocly/cli
python update_doc.py --check

191
.github/workflows/build.yaml vendored Normal file
View File

@ -0,0 +1,191 @@
name: Build and push docker image to internal registry
on:
workflow_call:
inputs:
hardware:
type: string
description: Hardware
# options:
# - cuda
# - rocm
# - intel
required: true
release-tests:
description: "Run release integration tests"
required: true
default: false
type: boolean
jobs:
build-and-push:
outputs:
docker_image: ${{ steps.final.outputs.docker_image }}
docker_devices: ${{ steps.final.outputs.docker_devices }}
runs_on: ${{ steps.final.outputs.runs_on }}
label: ${{ steps.final.outputs.label }}
concurrency:
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-highmemory-32-plus-priv
permissions:
contents: write
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Construct harware variables
shell: bash
run: |
case ${{ inputs.hardware }} in
cuda)
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="aws-g6-12xl-plus-priv-cache"
export platform=""
;;
rocm)
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
export docker_devices="/dev/kfd,/dev/dri"
# TODO Re-enable when they pass.
# export runs_on="amd-gpu-tgi"
export runs_on="ubuntu-latest"
export platform=""
;;
intel-xpu)
export dockerfile="Dockerfile_intel"
export label_extension="-intel-xpu"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform="xpu"
;;
intel-cpu)
export dockerfile="Dockerfile_intel"
export label_extension="-intel-cpu"
export docker_devices=""
export runs_on="ubuntu-latest"
export platform="cpu"
;;
esac
echo $dockerfile
echo "Dockerfile=${dockerfile}"
echo $label_extension
echo $docker_devices
echo $runs_on
echo $platform
echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
echo "LABEL=${label_extension}" >> $GITHUB_ENV
echo "PLATFORM=${platform}" >> $GITHUB_ENV
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
- name: Initialize Docker Buildx
uses: docker/setup-buildx-action@v3
with:
install: true
buildkitd-config: /tmp/buildkitd.toml
- name: Login to internal Container Registry
uses: docker/login-action@v3
with:
username: ${{ secrets.REGISTRY_USERNAME }}
password: ${{ secrets.REGISTRY_PASSWORD }}
registry: registry.internal.huggingface.tech
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Azure Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
# If pull request
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name == 'pull_request' }}
id: meta-pr
uses: docker/metadata-action@v5
with:
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
id: meta
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=auto
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
type=semver,pattern={{version}}${{ env.LABEL }}
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: ${{ env.DOCKERFILE }}
push: true
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
PLATFORM=${{ env.PLATFORM }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
- name: Final
id: final
run: |
echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
integration_tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: build-and-push
runs-on:
group: ${{ needs.build-and-push.outputs.runs_on }}
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install
run: |
make install-integration-tests
- name: Run tests
run: |
export DOCKER_VOLUME=/mnt/cache
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
export HF_TOKEN=${{ secrets.HF_TOKEN }}
echo $DOCKER_IMAGE
pytest -s -vv integration-tests ${PYTEST_FLAGS}

View File

@ -0,0 +1,20 @@
name: Build documentation
on:
push:
paths:
- "docs/source/**"
branches:
- main
- doc-builder*
- v*-release
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
with:
commit_sha: ${{ github.sha }}
package: text-generation-inference
additional_args: --not_python_module
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@ -0,0 +1,19 @@
name: Build PR Documentation
on:
pull_request:
paths:
- "docs/source/**"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: text-generation-inference
additional_args: --not_python_module

26
.github/workflows/client-tests.yaml vendored Normal file
View File

@ -0,0 +1,26 @@
name: Python Client Tests
on:
pull_request:
paths:
- ".github/workflows/client-tests.yaml"
- "clients/python/**"
jobs:
run_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.9
- name: Install
run: |
cd clients/python && pip install .
- name: Run tests
run: |
pip install pytest pytest-asyncio
export HF_TOKEN=${{ secrets.HF_TOKEN }}
make python-client-tests

43
.github/workflows/load_test.yaml vendored Normal file
View File

@ -0,0 +1,43 @@
name: Nightly load test
on:
schedule:
- cron: '0 0 * * 1-5'
pull_request:
paths:
- ".github/workflows/load_test.yaml"
branches:
- 'main'
jobs:
load-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-g5-12xlarge
env:
DOCKER_VOLUME: /cache
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install k6
run: |
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
- name: Start starcoder
run: |
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
sleep 10
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
- name: Run k6
run: |
./k6 run load_tests/starcoder_load.js
- name: Stop starcoder
if: ${{ always() }}
run: |
docker stop tgi-starcoder || true

View File

@ -1,56 +0,0 @@
name: Build and push docker image to Github registry
on:
workflow_dispatch:
inputs:
tag:
description: 'Tag for the Docker image:'
required: true
jobs:
build-and-push:
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: true
runs-on: ubuntu-latest
permissions:
contents: write
packages: write
# This is used to complete the identity challenge
# with sigstore/fulcio when running outside of PRs.
id-token: write
security-events: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Initialize Docker Buildx
uses: docker/setup-buildx-action@v3
with:
install: true
config-inline: |
[registry."docker.io"]
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v4.3.0
with:
flavor: |
latest=true
images: ghcr.io/huggingface/tgi-gaudi
tags: |
type=raw,value=${{ github.event.inputs.tag }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
with:
context: .
file: Dockerfile
push: true
platforms: 'linux/amd64'
tags: ${{ steps.meta.outputs.tags }}

14
.github/workflows/stale.yaml vendored Normal file
View File

@ -0,0 +1,14 @@
name: 'Close stale issues and PRs'
on:
schedule:
- cron: '30 1 * * *'
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v8
with:
stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
days-before-stale: 30
days-before-close: 5

60
.github/workflows/tests.yaml vendored Normal file
View File

@ -0,0 +1,60 @@
name: Server Tests
on:
pull_request:
paths:
- ".github/workflows/tests.yaml"
- "server/**"
- "proto/**"
- "router/**"
- "launcher/**"
- "Cargo.lock"
- "rust-toolchain.toml"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
run_tests:
runs-on:
group: aws-highmemory-32-plus-priv
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v4
id: python
with:
python-version: 3.11
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
# Released on: 02 May, 2024
# https://releases.rs/docs/1.78.0/
toolchain: 1.80.0
override: true
components: rustfmt, clippy
- name: Install Protoc
uses: arduino/setup-protoc@v1
- name: Clean unused files
run: |
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
- name: Install
run: |
sudo apt update
sudo apt install python3.11-dev -y
make install-cpu
- name: Run server tests
run: |
pip install pytest
export HF_TOKEN=${{ secrets.HF_TOKEN }}
pytest -s -vv server/tests
- name: Pre-commit checks
run: |
pip install pre-commit
pre-commit install
pre-commit run --all-files
- name: Run Rust tests
run: |
cargo test

View File

@ -0,0 +1,16 @@
name: Upload PR Documentation
on:
workflow_run:
workflows: ["Build PR Documentation"]
types:
- completed
jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: text-generation-inference
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

573
Cargo.lock generated
View File

@ -589,14 +589,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
[[package]]
name = "cc"
version = "1.0.99"
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "castaway"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5"
dependencies = [
"rustversion",
]
[[package]]
name = "cc"
version = "1.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b62ac837cdb5cb22e10a256099b4fc502b1dfe560cb282963a974d7abd80e476"
dependencies = [
"jobserver",
"libc",
"once_cell",
"shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
@ -621,6 +645,34 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "clap"
version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [
"bitflags 1.3.2",
"textwrap",
"unicode-width",
]
[[package]]
name = "clap"
version = "4.5.17"
@ -645,21 +697,40 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.5.5"
version = "4.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.66",
"syn 2.0.77",
]
[[package]]
name = "clap_lex"
version = "0.7.1"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "cmake"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
dependencies = [
"cc",
]
[[package]]
name = "codespan-reporting"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
dependencies = [
"termcolor",
"unicode-width",
]
[[package]]
name = "color_quant"
@ -1027,15 +1098,10 @@ dependencies = [
]
[[package]]
name = "displaydoc"
version = "0.2.4"
name = "dunce"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
[[package]]
name = "easy-cast"
@ -1349,18 +1415,24 @@ dependencies = [
[[package]]
name = "gimli"
version = "0.29.0"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
[[package]]
name = "glob"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "grpc-metadata"
version = "0.1.0"
dependencies = [
"opentelemetry",
"opentelemetry 0.20.0",
"tonic 0.10.2",
"tracing",
"tracing-opentelemetry",
"tracing-opentelemetry 0.21.0",
]
[[package]]
@ -1559,9 +1631,9 @@ dependencies = [
[[package]]
name = "httparse"
version = "1.9.3"
version = "1.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0e7a4dd27b9476dc40cb050d3632d3bba3a70ddbff012285f7f8559a1e7e545"
checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
[[package]]
name = "httpdate"
@ -1579,7 +1651,7 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"httparse",
@ -1652,128 +1724,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper",
"hyper 0.14.30",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "icu_collections"
version = "1.5.0"
name = "hyper-util"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locid"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_properties"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.76",
"bytes",
"futures-channel",
"futures-util",
"http 1.1.0",
"http-body 1.0.1",
"hyper 1.4.1",
"pin-project-lite",
"socket2",
"tokio",
"tower",
"tower-service",
"tracing",
]
[[package]]
@ -1784,14 +1758,12 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "1.0.0"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed"
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
dependencies = [
"icu_normalizer",
"icu_properties",
"smallvec",
"utf8_iter",
"unicode-bidi",
"unicode-normalization",
]
[[package]]
@ -1879,11 +1851,21 @@ version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17"
dependencies = [
"opentelemetry",
"opentelemetry 0.20.0",
"opentelemetry-otlp",
"thiserror",
"tracing",
"tracing-opentelemetry",
"tracing-opentelemetry 0.21.0",
]
[[package]]
name = "instability"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c"
dependencies = [
"quote",
"syn 2.0.77",
]
[[package]]
@ -1903,14 +1885,14 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
"syn 2.0.77",
]
[[package]]
name = "ipnet"
version = "2.9.0"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4"
[[package]]
name = "is_terminal_polyfill"
@ -2099,12 +2081,6 @@ version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "litemap"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
[[package]]
name = "lock_api"
version = "0.4.12"
@ -3134,8 +3110,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck 0.4.1",
"itertools 0.10.5",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
"once_cell",
@ -3168,10 +3144,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
dependencies = [
"anyhow",
"itertools 0.10.5",
"itertools 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.66",
"syn 2.0.77",
]
[[package]]
@ -3427,11 +3403,11 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.5.2"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd"
checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853"
dependencies = [
"bitflags 2.5.0",
"bitflags 2.6.0",
]
[[package]]
@ -3581,23 +3557,22 @@ dependencies = [
[[package]]
name = "rust-embed-impl"
version = "6.8.1"
version = "8.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49b94b81e5b2c284684141a2fb9e2a31be90638caf040bf9afbc5a0416afe1ac"
checksum = "6125dbc8867951125eec87294137f4e9c2c96566e61bf72c45095a7c77761478"
dependencies = [
"proc-macro2",
"quote",
"rust-embed-utils",
"shellexpand",
"syn 2.0.66",
"syn 2.0.77",
"walkdir",
]
[[package]]
name = "rust-embed-utils"
version = "7.8.1"
version = "8.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d38ff6bf570dc3bb7100fce9f7b60c33fa71d80e88da3f2580df4ff2bdded74"
checksum = "2e5347777e9aacb56039b0e1f28785929a8a3b709e87482e7442c72e7c12529d"
dependencies = [
"sha2",
"walkdir",
@ -4024,10 +3999,10 @@ dependencies = [
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
@ -4092,15 +4067,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]]
name = "synstructure"
version = "0.13.1"
name = "sync_wrapper"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
[[package]]
name = "sysinfo"
@ -4194,12 +4164,44 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "termcolor"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
dependencies = [
"winapi-util",
]
[[package]]
name = "text-generation-backends-trtllm"
version = "2.2.1-dev0"
dependencies = [
"async-stream",
"async-trait",
"clap 4.5.17",
"cmake",
"cxx",
"cxx-build",
"log",
"parking_lot",
"pkg-config",
"text-generation-router",
"thiserror",
"tokenizers 0.19.1",
"tokio",
"tokio-stream",
"tracing",
"tracing-opentelemetry 0.24.0",
"tracing-subscriber",
]
[[package]]
name = "text-generation-benchmark"
version = "2.0.4"
version = "2.2.1-dev0"
dependencies = [
"average",
"clap",
"clap 4.5.17",
"crossterm",
"float-ord",
"hf-hub",
@ -4217,13 +4219,14 @@ dependencies = [
[[package]]
name = "text-generation-client"
version = "2.0.4"
version = "2.2.1-dev0"
dependencies = [
"async-trait",
"base64 0.22.1",
"futures",
"grpc-metadata",
"prost 0.12.6",
"prost-build",
"rand",
"thiserror",
"tokio",
"tonic 0.10.2",
@ -4234,13 +4237,13 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
version = "2.0.4"
version = "2.2.1-dev0"
dependencies = [
"clap",
"clap 4.5.17",
"ctrlc",
"float_eq",
"hf-hub",
"nix",
"nix 0.28.0",
"once_cell",
"reqwest",
"serde",
@ -4253,13 +4256,14 @@ dependencies = [
[[package]]
name = "text-generation-router"
version = "2.0.4"
version = "2.2.1-dev0"
dependencies = [
"async-stream",
"axum",
"async-trait",
"axum 0.7.5",
"axum-tracing-opentelemetry",
"base64 0.22.1",
"clap",
"clap 4.5.17",
"csv",
"futures",
"futures-util",
@ -4434,15 +4438,30 @@ dependencies = [
]
[[package]]
name = "tinystr"
version = "0.7.6"
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"displaydoc",
"zerovec",
"serde",
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokenizers"
version = "0.19.1"
@ -4675,13 +4694,13 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e"
dependencies = [
"async-stream",
"async-trait",
"axum",
"axum 0.6.20",
"base64 0.21.7",
"bytes",
"h2",
"http",
"http-body",
"hyper",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.30",
"hyper-timeout",
"percent-encoding",
"pin-project",
@ -4927,10 +4946,25 @@ dependencies = [
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
name = "unicode-bidi"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "unicode-normalization"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-normalization-alignments"
@ -5010,9 +5044,9 @@ dependencies = [
[[package]]
name = "url"
version = "2.5.1"
version = "2.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"
checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
dependencies = [
"form_urlencoded",
"idna",
@ -5025,18 +5059,6 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
@ -5057,9 +5079,9 @@ dependencies = [
[[package]]
name = "utoipa-gen"
version = "3.5.0"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05d96dcd6fc96f3df9b3280ef480770af1b7c5d14bc55192baa9b067976d920c"
checksum = "7bf0e16c02bc4bf5322ab65f10ab1149bdbcaa782cba66dc7057370a3f8190be"
dependencies = [
"proc-macro-error",
"proc-macro2",
@ -5070,11 +5092,11 @@ dependencies = [
[[package]]
name = "utoipa-swagger-ui"
version = "3.1.5"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84614caa239fb25b2bb373a52859ffd94605ceb256eeb1d63436325cf81e3653"
checksum = "0b39868d43c011961e04b41623e050aedf2cc93652562ff7935ce0f819aaf2da"
dependencies = [
"axum",
"axum 0.7.5",
"mime_guess",
"regex",
"rust-embed",
@ -5291,9 +5313,9 @@ dependencies = [
[[package]]
name = "webpki-roots"
version = "0.26.2"
version = "0.26.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c452ad30530b54a4d8e71952716a212b08efd0f3562baa66c29a618b07da7c3"
checksum = "0bd24728e5af82c6c4ec1b66ac4844bdf8156257fccda846ec58b42cd0cdbe6a"
dependencies = [
"rustls-pki-types",
]
@ -5599,70 +5621,14 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
[[package]]
name = "yoke"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6"
dependencies = [
"byteorder",
"zerocopy-derive 0.6.6",
]
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"zerocopy-derive 0.7.35",
]
[[package]]
name = "zerocopy-derive"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.77",
"byteorder",
"zerocopy-derive",
]
[[package]]
@ -5673,28 +5639,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.76",
]
[[package]]
name = "zerofrom"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
"synstructure",
"syn 2.0.77",
]
[[package]]
@ -5703,28 +5648,6 @@ version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
[[package]]
name = "zerovec"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.66",
]
[[package]]
name = "zip"
version = "0.6.6"

View File

@ -20,7 +20,7 @@ default-members = [
resolver = "2"
[workspace.package]
version = "2.0.4"
version = "2.3.1-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

View File

@ -1,8 +1,10 @@
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
WORKDIR /usr/src
FROM chef as planner
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
@ -25,16 +27,19 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
rm -f $PROTOC_ZIP
COPY --from=planner /usr/src/recipe.json recipe.json
COPY Cargo.lock Cargo.lock
RUN cargo chef cook --release --recipe-path recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json
ARG GIT_SHA
ARG DOCKER_LABEL
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --release
RUN cargo build --profile release-opt
# Text Generation Inference base image
FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as base
@ -70,14 +75,26 @@ RUN cd server && \
pip install . --no-cache-dir
# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# AWS Sagemaker compatible image
FROM base AS sagemaker
COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
# Final image
FROM base
ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]

View File

@ -1,9 +1,8 @@
install-server:
cd server && make install
install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
install-server-cpu:
cd server && make install-server
install-router:
cargo install --path backends/v3/
@ -14,7 +13,10 @@ install-launcher:
install-benchmark:
cargo install --path benchmark/
install: install-server install-router install-launcher install-custom-kernels
install: install-server install-router install-launcher
install-cpu: install-server-cpu install-router install-launcher
server-dev:
cd server && make run-dev
@ -46,8 +48,8 @@ python-tests: python-server-tests python-client-tests
run-falcon-7b-instruct:
text-generation-launcher --model-id tiiuae/falcon-7b-instruct --port 8080
run-falcon-7b-instruct-quantize:
text-generation-launcher --model-id tiiuae/falcon-7b-instruct --quantize bitsandbytes --port 8080
clean:
rm -rf target aml
debug_image_build:
docker build --no-cache --progress=plain -t debug_tgi .

2186
docs/openapi.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +0,0 @@
diff a/docs/openapi.json b/docs/openapi.json (rejected hunks)
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
- "version": "2.2.1-dev0"
+ "version": "2.3.1-dev0"
},
"paths": {
"/": {

View File

@ -1,5 +1,3 @@
/// Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
use clap::{Parser, ValueEnum};
use hf_hub::{
api::sync::{Api, ApiBuilder},
@ -726,12 +724,13 @@ fn shard_manager(
if let Some(dtype) = dtype {
shard_args.push("--dtype".to_string());
shard_args.push(dtype.to_string());
shard_args.push(dtype.to_string())
}
// Model optional revision
if let Some(revision) = revision {
shard_args.push("--revision".to_string());
shard_args.push(revision);
shard_args.push(revision)
}
let rope = match (rope_scaling, rope_factor) {
@ -1567,16 +1566,6 @@ fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::R
}
fn main() -> Result<(), LauncherError> {
match Command::new("ldconfig").spawn() {
Ok(_) => {}
Err(err) => {
tracing::warn!(
"Unable to refresh ldconfig cache. Skipping (useless in most cases). Details {:?}",
err
)
}
}
// Pattern match configuration
let args: Args = Args::parse();

View File

@ -224,7 +224,7 @@ message DecodeResponse {
message WarmupRequest {
/// Batch to warmup on
repeated Batch batches = 1;
Batch batch = 1;
uint32 max_input_length = 2;
uint32 max_prefill_tokens = 3;
uint32 max_total_tokens = 4;

View File

@ -1,647 +0,0 @@
// This file is @generated by prost-build.
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthResponse {}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct InfoResponse {
#[prost(bool, tag = "1")]
pub requires_padding: bool,
#[prost(string, tag = "2")]
pub dtype: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
pub device_type: ::prost::alloc::string::String,
#[prost(uint32, optional, tag = "4")]
pub window_size: ::core::option::Option<u32>,
#[prost(uint32, tag = "5")]
pub speculate: u32,
}
/// / Empty request
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryRequest {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ServiceDiscoveryResponse {
/// / Other shards urls
#[prost(string, repeated, tag = "1")]
pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheRequest {
/// / Optional batch id
#[prost(uint64, optional, tag = "1")]
pub id: ::core::option::Option<u64>,
}
/// / Empty response
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ClearCacheResponse {}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct NextTokenChooserParameters {
/// / exponential scaling output probability distribution
#[prost(float, tag = "1")]
pub temperature: f32,
/// / restricting to the k highest probability elements
#[prost(uint32, tag = "2")]
pub top_k: u32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "3")]
pub top_p: f32,
/// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
#[prost(float, tag = "4")]
pub typical_p: f32,
/// / apply sampling on the logits
#[prost(bool, tag = "5")]
pub do_sample: bool,
/// / random seed for sampling
#[prost(uint64, tag = "6")]
pub seed: u64,
/// / repetition penalty
#[prost(float, tag = "7")]
pub repetition_penalty: f32,
/// / frequency penalty
#[prost(float, tag = "9")]
pub frequency_penalty: f32,
/// / token watermarking using "A Watermark for Large Language Models"
#[prost(bool, tag = "8")]
pub watermark: bool,
/// / grammar (applied if not empty)
#[prost(string, tag = "10")]
pub grammar: ::prost::alloc::string::String,
/// / grammar type
#[prost(enumeration = "GrammarType", tag = "11")]
pub grammar_type: i32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StoppingCriteriaParameters {
/// / Maximum number of generated tokens
#[prost(uint32, tag = "1")]
pub max_new_tokens: u32,
/// / Optional stopping sequences
#[prost(string, repeated, tag = "2")]
pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / Ignore end of sequence token
/// / used for benchmarking
#[prost(bool, tag = "3")]
pub ignore_eos_token: bool,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Request {
/// / Request ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / The generation context
#[prost(string, tag = "2")]
pub inputs: ::prost::alloc::string::String,
/// / Context truncation
#[prost(uint32, tag = "3")]
pub truncate: u32,
/// / Next Token Chooser Parameters
#[prost(message, optional, tag = "4")]
pub parameters: ::core::option::Option<NextTokenChooserParameters>,
/// / Stopping Criteria Parameters
#[prost(message, optional, tag = "5")]
pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
/// / Return prefill logprobs
#[prost(bool, tag = "6")]
pub prefill_logprobs: bool,
/// / Return most likely n tokens
#[prost(uint32, tag = "7")]
pub top_n_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Batch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests
#[prost(message, repeated, tag = "2")]
pub requests: ::prost::alloc::vec::Vec<Request>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct CachedBatch {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub id: u64,
/// / Individual requests ids
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
/// / Batch size (==len(requests))
#[prost(uint32, tag = "3")]
pub size: u32,
/// / Maximum number of tokens this batch will grow to
#[prost(uint32, tag = "4")]
pub max_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct GeneratedText {
/// / Output
#[prost(string, tag = "1")]
pub text: ::prost::alloc::string::String,
/// / Number of generated tokens
#[prost(uint32, tag = "2")]
pub generated_tokens: u32,
/// / Finish reason
#[prost(enumeration = "FinishReason", tag = "3")]
pub finish_reason: i32,
/// / Seed
#[prost(uint64, optional, tag = "4")]
pub seed: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Tokens {
/// / Token IDs
#[prost(uint32, repeated, tag = "1")]
pub ids: ::prost::alloc::vec::Vec<u32>,
/// / Logprobs
#[prost(float, repeated, tag = "2")]
pub logprobs: ::prost::alloc::vec::Vec<f32>,
/// / tokens
#[prost(string, repeated, tag = "3")]
pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
/// / special
#[prost(bool, repeated, tag = "4")]
pub is_special: ::prost::alloc::vec::Vec<bool>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Generation {
/// / Request ID
#[prost(uint64, tag = "1")]
pub request_id: u64,
/// / Prefill tokens (optional)
#[prost(message, optional, tag = "2")]
pub prefill_tokens: ::core::option::Option<Tokens>,
#[prost(message, optional, tag = "3")]
pub tokens: ::core::option::Option<Tokens>,
/// / Complete generated text
#[prost(message, optional, tag = "4")]
pub generated_text: ::core::option::Option<GeneratedText>,
/// / Top tokens
#[prost(message, repeated, tag = "5")]
pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchRequest {
/// / Batch ID
#[prost(uint64, tag = "1")]
pub batch_id: u64,
/// / Requests to keep
#[prost(uint64, repeated, tag = "2")]
pub request_ids: ::prost::alloc::vec::Vec<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct FilterBatchResponse {
/// / Filtered Batch (cached)
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillRequest {
/// / Batch
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PrefillResponse {
/// / Generation
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeRequest {
/// / Cached batches
#[prost(message, repeated, tag = "1")]
pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct DecodeResponse {
/// / Decodes
#[prost(message, repeated, tag = "1")]
pub generations: ::prost::alloc::vec::Vec<Generation>,
/// / Next batch (cached)
#[prost(message, optional, tag = "2")]
pub batch: ::core::option::Option<CachedBatch>,
/// / Forward elapsed time in nanoseconds
#[prost(uint64, tag = "3")]
pub forward_ns: u64,
/// / Decode elapsed time in nanoseconds
#[prost(uint64, tag = "4")]
pub decode_ns: u64,
/// / Total elapsed time in nanoseconds
#[prost(uint64, tag = "5")]
pub total_ns: u64,
/// / Concatenate elapsed time in nanoseconds
#[prost(uint64, optional, tag = "6")]
pub concat_ns: ::core::option::Option<u64>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupRequest {
/// / Batch to warmup on
#[prost(message, optional, tag = "1")]
pub batch: ::core::option::Option<Batch>,
#[prost(uint32, tag = "2")]
pub max_input_length: u32,
#[prost(uint32, tag = "3")]
pub max_prefill_tokens: u32,
#[prost(uint32, tag = "4")]
pub max_total_tokens: u32,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WarmupResponse {
/// / Maximum number of tokens supported by the model
#[prost(uint32, optional, tag = "1")]
pub max_supported_total_tokens: ::core::option::Option<u32>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum GrammarType {
None = 0,
Json = 1,
Regex = 2,
}
impl GrammarType {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
GrammarType::None => "GRAMMAR_TYPE_NONE",
GrammarType::Json => "GRAMMAR_TYPE_JSON",
GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"GRAMMAR_TYPE_NONE" => Some(Self::None),
"GRAMMAR_TYPE_JSON" => Some(Self::Json),
"GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
_ => None,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum FinishReason {
Length = 0,
EosToken = 1,
StopSequence = 2,
}
impl FinishReason {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
FinishReason::Length => "FINISH_REASON_LENGTH",
FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"FINISH_REASON_LENGTH" => Some(Self::Length),
"FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
"FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
_ => None,
}
}
}
/// Generated client implementations.
pub mod text_generation_service_client {
#![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
use tonic::codegen::*;
use tonic::codegen::http::Uri;
#[derive(Debug, Clone)]
pub struct TextGenerationServiceClient<T> {
inner: tonic::client::Grpc<T>,
}
impl TextGenerationServiceClient<tonic::transport::Channel> {
/// Attempt to create a new client by connecting to a given endpoint.
pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
where
D: TryInto<tonic::transport::Endpoint>,
D::Error: Into<StdError>,
{
let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
Ok(Self::new(conn))
}
}
impl<T> TextGenerationServiceClient<T>
where
T: tonic::client::GrpcService<tonic::body::BoxBody>,
T::Error: Into<StdError>,
T::ResponseBody: Body<Data = Bytes> + Send + 'static,
<T::ResponseBody as Body>::Error: Into<StdError> + Send,
{
pub fn new(inner: T) -> Self {
let inner = tonic::client::Grpc::new(inner);
Self { inner }
}
pub fn with_origin(inner: T, origin: Uri) -> Self {
let inner = tonic::client::Grpc::with_origin(inner, origin);
Self { inner }
}
pub fn with_interceptor<F>(
inner: T,
interceptor: F,
) -> TextGenerationServiceClient<InterceptedService<T, F>>
where
F: tonic::service::Interceptor,
T::ResponseBody: Default,
T: tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
Response = http::Response<
<T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
>,
>,
<T as tonic::codegen::Service<
http::Request<tonic::body::BoxBody>,
>>::Error: Into<StdError> + Send + Sync,
{
TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
}
/// Compress requests with the given encoding.
///
/// This requires the server to support it otherwise it might respond with an
/// error.
#[must_use]
pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.send_compressed(encoding);
self
}
/// Enable decompressing responses.
#[must_use]
pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
self.inner = self.inner.accept_compressed(encoding);
self
}
/// Limits the maximum size of a decoded message.
///
/// Default: `4MB`
#[must_use]
pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_decoding_message_size(limit);
self
}
/// Limits the maximum size of an encoded message.
///
/// Default: `usize::MAX`
#[must_use]
pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
/// / Model Info
pub async fn info(
&mut self,
request: impl tonic::IntoRequest<super::InfoRequest>,
) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Info",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info"));
self.inner.unary(req, path, codec).await
}
/// / Service discovery
pub async fn service_discovery(
&mut self,
request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
) -> std::result::Result<
tonic::Response<super::ServiceDiscoveryResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ServiceDiscovery",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new(
"generate.v2.TextGenerationService",
"ServiceDiscovery",
),
);
self.inner.unary(req, path, codec).await
}
/// / Empties batch cache
pub async fn clear_cache(
&mut self,
request: impl tonic::IntoRequest<super::ClearCacheRequest>,
) -> std::result::Result<
tonic::Response<super::ClearCacheResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/ClearCache",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "ClearCache"),
);
self.inner.unary(req, path, codec).await
}
/// / Remove requests from a cached batch
pub async fn filter_batch(
&mut self,
request: impl tonic::IntoRequest<super::FilterBatchRequest>,
) -> std::result::Result<
tonic::Response<super::FilterBatchResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/FilterBatch",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(
GrpcMethod::new("generate.v2.TextGenerationService", "FilterBatch"),
);
self.inner.unary(req, path, codec).await
}
/// / Warmup the model and compute max cache size
pub async fn warmup(
&mut self,
request: impl tonic::IntoRequest<super::WarmupRequest>,
) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Warmup",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Warmup"));
self.inner.unary(req, path, codec).await
}
/// / Prefill batch and decode first token
pub async fn prefill(
&mut self,
request: impl tonic::IntoRequest<super::PrefillRequest>,
) -> std::result::Result<
tonic::Response<super::PrefillResponse>,
tonic::Status,
> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Prefill",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Prefill"));
self.inner.unary(req, path, codec).await
}
/// / Decode token for a list of prefilled batches
pub async fn decode(
&mut self,
request: impl tonic::IntoRequest<super::DecodeRequest>,
) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Decode",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Decode"));
self.inner.unary(req, path, codec).await
}
/// / Health check
pub async fn health(
&mut self,
request: impl tonic::IntoRequest<super::HealthRequest>,
) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::new(
tonic::Code::Unknown,
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/generate.v2.TextGenerationService/Health",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("generate.v2.TextGenerationService", "Health"));
self.inner.unary(req, path, codec).await
}
}
}

View File

@ -1,6 +0,0 @@
// This file is @generated by prost-build.
pub mod generate {
pub mod v2 {
include!("generate.v2.rs");
}
}

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation-server"
version = "2.0.4"
version = "2.0.5-dev0"
description = "Text Generation Inference Python gRPC Server"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]

View File

@ -9,11 +9,15 @@ from loguru import logger
from text_generation_server.layers.exl2 import Exl2Weight
from text_generation_server.layers.gptq import GPTQWeight
from text_generation_server.utils.log import log_master
try:
from exllamav2_kernels import make_q_matrix, gemm_half_q_half
from exllamav2.ext import exllamav2_ext
make_q_matrix = exllamav2_ext.make_q_matrix
gemm_half_q_half = exllamav2_ext.gemm_half_q_half
except ImportError:
logger.error("exllamav2_kernels not installed.")
log_master(logger.warning, "exllamav2_kernels not installed.")
raise
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
@ -69,6 +73,10 @@ def ext_make_q_matrix(
"""
Create Q matrix
"""
# max_dq_size = 512*(1024**2)
# max_dq_rows = max_dq_size // out_features[0]
max_dq_rows = 0
# EXL2
if isinstance(w, Exl2Weight):
extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
@ -82,10 +90,12 @@ def ext_make_q_matrix(
w.q_scale_max,
w.q_groups,
extra.q_group_map,
none_tensor,
none_tensor,
none_tensor,
none_tensor, # zeros
none_tensor, # scales
none_tensor, # g_idx
none_tensor, # bias
temp_dq,
max_dq_rows,
)
# GPTQ
elif isinstance(w, GPTQWeight):
@ -105,29 +115,33 @@ def ext_make_q_matrix(
w.qweight,
extra.q_perm,
extra.q_invperm,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor, # q_scale
none_tensor, # q_scale_max
none_tensor, # q_groups
none_tensor, # q_group_map
w.qzeros,
w.scales,
w.g_idx.cpu(),
none_tensor, # bias
temp_dq,
max_dq_rows,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w.qweight,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor, # q_perm
none_tensor, # q_invperm
none_tensor, # q_scale
none_tensor, # q_scale_max
none_tensor, # q_groups
none_tensor, # q_group_map
w.qzeros,
w.scales,
none_tensor,
none_tensor, # g_idx
none_tensor, # bias
temp_dq,
max_dq_rows,
)
else:
RuntimeError("Cannot create handle")

View File

@ -206,6 +206,7 @@ def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
output = torch.empty(
(input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
)
def grid(META):
return (
triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])

5
tgi-entrypoint.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/bash
ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
text-generation-launcher $@