Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-04-22 15:32:08 +00:00

Merge pull request #187 from yuanwu2017/v2.0.4

This commit is contained in: commit c09f5bc930

.gitignore (vendored) — 1 change
@@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh
 server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp

 data/
+load_tests/*.json
Cargo.lock (generated) — 13 changes
@@ -78,9 +78,9 @@ dependencies = [

 [[package]]
 name = "anstyle-query"
-version = "1.1.0"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
+checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5"
 dependencies = [
  "windows-sys 0.52.0",
 ]
@@ -3552,7 +3552,7 @@ dependencies = [

 [[package]]
 name = "text-generation-benchmark"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
  "average",
  "clap",
@@ -3573,7 +3573,7 @@ dependencies = [

 [[package]]
 name = "text-generation-client"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -3590,7 +3590,7 @@ dependencies = [

 [[package]]
 name = "text-generation-launcher"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3601,6 +3601,7 @@ dependencies = [
  "reqwest",
  "serde",
  "serde_json",
+ "thiserror",
  "tracing",
  "tracing-subscriber",
  "vergen",
@@ -3608,7 +3609,7 @@ dependencies = [

 [[package]]
 name = "text-generation-router"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
  "async-stream",
  "axum",
@@ -9,7 +9,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "2.0.2"
+version = "2.0.4"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
@@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src

 FROM chef as planner
@@ -50,6 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        ca-certificates \
        make \
        curl \
+       git \
        && rm -rf /var/lib/apt/lists/*

 # Install server
@@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src

 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -36,7 +36,7 @@ COPY launcher launcher
 RUN cargo build --release

 # Text Generation Inference base image for RoCm
-FROM rocm/dev-ubuntu-22.04:5.7 as base
+FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update as base

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
@@ -50,13 +50,24 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     # Needed to build VLLM & flash.
     rocthrust-dev \
     hipsparse-dev \
-    hipblas-dev && \
+    hipblas-dev \
+    hipblaslt-dev \
+    rocblas-dev \
+    hiprand-dev \
+    rocrand-dev \
+    miopen-hip-dev \
+    hipfft-dev \
+    hipcub-dev \
+    hipsolver-dev \
+    rccl-dev \
+    cmake \
+    python3-dev && \
     rm -rf /var/lib/apt/lists/*

 # Keep in sync with `server/pyproject.toml
 ARG MAMBA_VERSION=23.1.0-1
-ARG PYTORCH_VERSION='2.2.0.dev0'
-ARG ROCM_VERSION='5.7'
+ARG PYTORCH_VERSION='2.3.0'
+ARG ROCM_VERSION='6.0.2'
 ARG PYTHON_VERSION='3.10.10'
 # Automatically set by buildx
 ARG TARGETPLATFORM
@@ -75,12 +86,44 @@ RUN chmod +x ~/mambaforge.sh && \
     mamba init && \
     rm ~/mambaforge.sh

-# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6.
-RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/
+# Install flash-attention, torch dependencies
+RUN pip install numpy einops ninja --no-cache-dir
+
+RUN conda install intel::mkl-static intel::mkl-include
+RUN pip uninstall -y triton && \
+    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
+    cd triton/python && \
+    pip install .
+
+RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+
+ARG _GLIBCXX_USE_CXX11_ABI="1"
+ARG CMAKE_PREFIX_PATH="/opt/conda"
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ARG BUILD_CAFFE2="0" \
+    BUILD_CAFFE2_OPS="0" \
+    USE_CUDA="0" \
+    USE_ROCM="1" \
+    BUILD_TEST="0" \
+    USE_FBGEMM="0" \
+    USE_NNPACK="0" \
+    USE_QNNPACK="0" \
+    USE_XNNPACK="0" \
+    USE_FLASH_ATTENTION="1" \
+    USE_MEM_EFF_ATTENTION="0"
+
+RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
+
+# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
+ENV HIP_FORCE_DEV_KERNARG=1
+
+# On MI250 and MI300, performances for flash with Triton FA are slightly better than CK.
+# However, Triton requires a tunning for each prompt length, which is prohibitive.
+ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0

 FROM base AS kernel-builder

-# Build vllm kernels
+# # Build vllm kernels
 FROM kernel-builder AS vllm-builder
 WORKDIR /usr/src

@@ -102,21 +145,21 @@ RUN make build-flash-attention-v2-rocm
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
-RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build
+RUN python setup.py build

 # Build exllama kernels
 FROM kernel-builder as exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .

-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
+RUN python setup.py build

 # Build exllama v2 kernels
 FROM kernel-builder as exllamav2-kernels-builder
 WORKDIR /usr/src
 COPY server/exllamav2_kernels/ .

-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
+RUN python setup.py build

 FROM base as base-copy

@@ -140,9 +183,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310
 # Copy build artifacts from exllamav2 kernels builder
 COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

-# Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
-
 # Install server
 COPY proto proto
 COPY server server
@@ -160,7 +200,8 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

 # AWS Sagemaker compatible image
-FROM base-copy as sagemaker
+FROM base as sagemaker

 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@@ -169,5 +210,8 @@ ENTRYPOINT ["./entrypoint.sh"]
 # Final image
 FROM base-copy

-ENTRYPOINT ["text-generation-launcher"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
 CMD ["--json-output"]
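As a quick sanity check for this ROCm image, the sketch below (an illustrative assumption on my part, not part of the PR) verifies from inside the container that the from-source PyTorch build really targets ROCm; the `tgi-rocm:dev` tag and the device flags are placeholders to adapt to your setup.

```python
# check_rocm.py — run inside the built image, e.g.:
#   docker run --rm --device /dev/kfd --device /dev/dri tgi-rocm:dev python3 check_rocm.py
import torch

# On ROCm builds torch.version.hip is a version string; on CUDA builds it is None.
print("HIP version:", torch.version.hip)

# ROCm devices are exposed through the CUDA API surface.
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
```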
@@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src

 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -36,18 +36,19 @@ RUN cargo build --release


 # Text Generation Inference base image for Intel
-FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base
+FROM intel/intel-extension-for-pytorch:2.1.30-xpu as base

 USER root
 # libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
 RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
     dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb

+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
     | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi

 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@@ -56,9 +57,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \


 WORKDIR /usr/src
-# Build pytorch and ipex
-RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
-RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
+RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl

 # Install server
 COPY proto proto
@@ -72,25 +72,11 @@ RUN cd server && \
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
 ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
-ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh
-ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
-ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man
-ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
-ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
 ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
-ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so
-ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar
 ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
-ENV MKLROOT=/opt/intel/oneapi/mkl/latest
-ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N
 ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
 ENV CCL_ZE_IPC_EXCHANGE=sockets

-
-RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
-RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install
-
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router
assets/tgi_grafana.json — new file, 3999 lines
(File diff suppressed because it is too large.)
@@ -11,7 +11,7 @@ pub(crate) enum Event {
     /// Key press.
     Key(event::KeyEvent),
     /// Terminal resize.
-    Resize(u16, u16),
+    Resize,
 }

 pub(crate) async fn terminal_event_task(
@@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
         if event::poll(Duration::from_secs(0)).expect("no events available") {
             match event::read().expect("unable to read event") {
                 event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
-                event::Event::Resize(w, h) => {
-                    event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
+                event::Event::Resize(_w, _h) => {
+                    event_sender.send(Event::Resize).await.unwrap_or(())
                 }
                 _ => (),
             }
@@ -14,5 +14,10 @@

 __version__ = "0.6.0"

+DEPRECATION_WARNING = (
+    "`text_generation` clients are deprecated and will be removed in the near future. "
+    "Please use the `InferenceClient` from the `huggingface_hub` package instead."
+)
+
 from text_generation.client import Client, AsyncClient
 from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
@@ -1,16 +1,21 @@
 import json
 import requests
+import warnings

 from aiohttp import ClientSession, ClientTimeout
 from pydantic import ValidationError
 from typing import Dict, Optional, List, AsyncIterator, Iterator, Union

+from text_generation import DEPRECATION_WARNING
 from text_generation.types import (
     StreamResponse,
     Response,
     Request,
     Parameters,
     Grammar,
+    CompletionRequest,
+    Completion,
+    CompletionComplete,
     ChatRequest,
     ChatCompletionChunk,
     ChatComplete,
@@ -19,6 +24,9 @@ from text_generation.types import (
 )
 from text_generation.errors import parse_error

+# emit deprecation warnings
+warnings.simplefilter("always", DeprecationWarning)
+

 class Client:
     """Client to make calls to a text-generation-inference instance
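Because `warnings.simplefilter("always", DeprecationWarning)` now runs at import time, every `Client`/`AsyncClient` construction emits the deprecation notice. A small sketch, using only the standard library, of how a downstream application might silence just this warning while migrating to `huggingface_hub.InferenceClient` (illustrative, not part of the PR):

```python
import warnings

from text_generation import Client

# Ignore only the text_generation deprecation notice; other
# DeprecationWarnings remain visible.
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        message=".*text_generation.*deprecated.*",
        category=DeprecationWarning,
    )
    client = Client("http://127.0.0.1:3000")
```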
@@ -59,11 +67,100 @@ class Client:
             timeout (`int`):
                 Timeout in seconds
         """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
         self.base_url = base_url
         self.headers = headers
         self.cookies = cookies
         self.timeout = timeout

+    def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ):
+        """
+        Given a prompt, generate a response synchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                Stream the response
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            resp = requests.post(
+                f"{self.base_url}/v1/completions",
+                json=request.dict(),
+                headers=self.headers,
+                cookies=self.cookies,
+                timeout=self.timeout,
+            )
+            payload = resp.json()
+            if resp.status_code != 200:
+                raise parse_error(resp.status_code, payload)
+            return Completion(**payload)
+        else:
+            return self._completion_stream_response(request)
+
+    def _completion_stream_response(self, request):
+        resp = requests.post(
+            f"{self.base_url}/v1/completions",
+            json=request.dict(),
+            headers=self.headers,
+            cookies=self.cookies,
+            timeout=self.timeout,
+            stream=True,
+        )
+        # iterate and print stream
+        for byte_payload in resp.iter_lines():
+            if byte_payload == b"\n":
+                continue
+            payload = byte_payload.decode("utf-8")
+            if payload.startswith("data:"):
+                json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                try:
+                    response = CompletionComplete(**json_payload)
+                    yield response
+                except ValidationError:
+                    raise parse_error(resp.status, json_payload)
+
     def chat(
         self,
         messages: List[Message],
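For orientation, a minimal usage sketch of the new synchronous `/v1/completions` support added above, assuming a TGI instance is already serving at the given URL (illustrative, not part of the PR):

```python
from text_generation import Client

client = Client("http://127.0.0.1:3000")  # emits the deprecation notice added in this PR

# Non-streaming: the JSON payload is validated into a Completion model.
completion = client.completion("What is Deep Learning?", max_tokens=32, seed=42)
print(completion)

# Streaming: each SSE "data:" chunk is yielded as a CompletionComplete,
# which exposes `text`, `index`, `logprobs` and `finish_reason`.
for chunk in client.completion("What is Deep Learning?", max_tokens=32, stream=True):
    print(chunk.text, end="", flush=True)
```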
@@ -82,6 +179,7 @@ class Client:
         tools: Optional[List[Tool]] = None,
         tool_prompt: Optional[str] = None,
         tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
     ):
         """
         Given a list of messages, generate a response asynchronously
@@ -124,6 +222,8 @@ class Client:
                 A prompt to be appended before the tools
             tool_choice (`str`):
                 The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated

         """
         request = ChatRequest(
@@ -144,6 +244,7 @@ class Client:
             tools=tools,
             tool_prompt=tool_prompt,
             tool_choice=tool_choice,
+            stop=stop,
         )
         if not stream:
             resp = requests.post(
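A short sketch of the new `stop` parameter threaded through `chat()` above, assuming the existing `Message` model with `role`/`content` fields and a local server (illustrative only):

```python
from text_generation import Client
from text_generation.types import Message

client = Client("http://127.0.0.1:3000")

# Generation halts as soon as one of the stop sequences is produced.
response = client.chat(
    messages=[Message(role="user", content="Count from 1 to 20, comma separated.")],
    stop=["10,"],
)
print(response)
```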
@@ -449,11 +550,99 @@ class AsyncClient:
             timeout (`int`):
                 Timeout in seconds
         """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
         self.base_url = base_url
         self.headers = headers
         self.cookies = cookies
         self.timeout = ClientTimeout(timeout)

+    async def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[Completion, AsyncIterator[CompletionComplete]]:
+        """
+        Given a prompt, generate a response asynchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                Stream the response
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            return await self._completion_single_response(request)
+        else:
+            return self._completion_stream_response(request)
+
+    async def _completion_single_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/completions", json=request.dict()
+            ) as resp:
+                payload = await resp.json()
+                if resp.status != 200:
+                    raise parse_error(resp.status, payload)
+                return Completion(**payload)
+
+    async def _completion_stream_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/completions", json=request.dict()
+            ) as resp:
+                async for byte_payload in resp.content:
+                    if byte_payload == b"\n":
+                        continue
+                    payload = byte_payload.decode("utf-8")
+                    if payload.startswith("data:"):
+                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                        try:
+                            response = CompletionComplete(**json_payload)
+                            yield response
+                        except ValidationError:
+                            raise parse_error(resp.status, json_payload)
+
     async def chat(
         self,
         messages: List[Message],
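And a brief sketch of the asynchronous counterpart, under the same assumptions as the synchronous example:

```python
import asyncio

from text_generation import AsyncClient


async def main() -> None:
    client = AsyncClient("http://127.0.0.1:3000")

    # Non-streaming path awaits a single Completion.
    completion = await client.completion("What is Deep Learning?", max_tokens=32)
    print(completion)

    # Streaming path returns an async iterator of CompletionComplete chunks.
    async for chunk in await client.completion(
        "What is Deep Learning?", max_tokens=32, stream=True
    ):
        print(chunk.text, end="", flush=True)


asyncio.run(main())
```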
@@ -472,6 +661,7 @@ class AsyncClient:
         tools: Optional[List[Tool]] = None,
         tool_prompt: Optional[str] = None,
         tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
     ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
         """
         Given a list of messages, generate a response asynchronously
@@ -514,6 +704,8 @@ class AsyncClient:
                 A prompt to be appended before the tools
             tool_choice (`str`):
                 The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated

         """
         request = ChatRequest(
@@ -534,6 +726,7 @@ class AsyncClient:
             tools=tools,
             tool_prompt=tool_prompt,
             tool_choice=tool_choice,
+            stop=stop,
         )
         if not stream:
             return await self._chat_single_response(request)
@@ -46,30 +46,6 @@ class Tool(BaseModel):
     function: dict


-class ChatCompletionComplete(BaseModel):
-    # Index of the chat completion
-    index: int
-    # Message associated with the chat completion
-    message: Message
-    # Log probabilities for the chat completion
-    logprobs: Optional[Any]
-    # Reason for completion
-    finish_reason: str
-    # Usage details of the chat completion
-    usage: Optional[Any] = None
-
-
-class CompletionComplete(BaseModel):
-    # Index of the chat completion
-    index: int
-    # Message associated with the chat completion
-    text: str
-    # Log probabilities for the chat completion
-    logprobs: Optional[Any]
-    # Reason for completion
-    finish_reason: str
-
-
 class Function(BaseModel):
     name: Optional[str]
     arguments: str
@@ -95,24 +71,41 @@ class Choice(BaseModel):
     finish_reason: Optional[str] = None


-class ChatCompletionChunk(BaseModel):
-    id: str
-    object: str
-    created: int
+class CompletionRequest(BaseModel):
+    # Model identifier
     model: str
-    system_fingerprint: str
-    choices: List[Choice]
+    # Prompt
+    prompt: str
+    # The parameter for repetition penalty. 1.0 means no penalty.
+    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
+    # Maximum number of tokens to generate
+    max_tokens: Optional[int] = None
+    # Flag to indicate streaming response
+    stream: bool = False
+    # Random sampling seed
+    seed: Optional[int] = None
+    # Sampling temperature
+    temperature: Optional[float] = None
+    # Top-p value for nucleus sampling
+    top_p: Optional[float] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None


-class ChatComplete(BaseModel):
-    # Chat completion details
-    id: str
-    object: str
-    created: int
-    model: str
-    system_fingerprint: str
-    choices: List[ChatCompletionComplete]
-    usage: Any
+class CompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    text: str
+    # Log probabilities for the chat completion
+    logprobs: Optional[Any]
+    # Reason for completion
+    finish_reason: str


 class Completion(BaseModel):
@@ -163,6 +156,41 @@ class ChatRequest(BaseModel):
     tool_prompt: Optional[str] = None
     # Choice of tool to be used
     tool_choice: Optional[str] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None
+
+
+class ChatCompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    message: Message
+    # Log probabilities for the chat completion
+    logprobs: Optional[Any]
+    # Reason for completion
+    finish_reason: str
+    # Usage details of the chat completion
+    usage: Optional[Any] = None
+
+
+class ChatComplete(BaseModel):
+    # Chat completion details
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[ChatCompletionComplete]
+    usage: Any
+
+
+class ChatCompletionChunk(BaseModel):
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[Choice]


 class Parameters(BaseModel):
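For reference, a tiny sketch of how the new `CompletionRequest` model is built and serialised, mirroring what the client does before posting to `/v1/completions` (the concrete values are arbitrary; illustrative only):

```python
from text_generation.types import CompletionRequest

request = CompletionRequest(
    model="tgi",
    prompt="What is Deep Learning?",
    max_tokens=32,
    temperature=0.7,
    stop=["\n\n"],
)

# The client serialises the request with .dict() and posts it as JSON
# to the /v1/completions endpoint.
print(request.dict())
```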
@@ -1121,6 +1121,15 @@
           "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
           "example": 0.95,
           "nullable": true
+        },
+        "stop": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Up to 4 sequences where the API will stop generating further tokens.",
+          "example": "null",
+          "nullable": true
         }
       }
     },
@@ -3,8 +3,16 @@
     title: Text Generation Inference
   - local: quicktour
     title: Quick Tour
+  - local: installation_nvidia
+    title: Using TGI with Nvidia GPUs
+  - local: installation_amd
+    title: Using TGI with AMD GPUs
+  - local: installation_gaudi
+    title: Using TGI with Intel Gaudi
+  - local: installation_inferentia
+    title: Using TGI with AWS Inferentia
   - local: installation
-    title: Installation
+    title: Installation from source
   - local: supported_models
     title: Supported Models and Hardware
   - local: messages_api
@@ -20,7 +28,7 @@
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
   - local: basic_tutorials/launcher
     title: All TGI CLI options
   - local: basic_tutorials/non_core_models
     title: Non-core Model Serving
   - local: basic_tutorials/safety
@@ -29,6 +37,10 @@
     title: Using Guidance, JSON, tools
   - local: basic_tutorials/visual_language_models
     title: Visual Language Models
+  - local: basic_tutorials/monitoring
+    title: Monitoring TGI with Prometheus and Grafana
+  - local: basic_tutorials/train_medusa
+    title: Train Medusa
   title: Tutorials
 - sections:
   - local: conceptual/streaming
@@ -19,6 +19,6 @@ docker run --gpus all \
     --shm-size 1g \
    -e HUGGING_FACE_HUB_TOKEN=$token \
    -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \
     --model-id $model
 ```
docs/source/basic_tutorials/monitoring.md — new file, 75 lines:

# Monitoring TGI server with Prometheus and Grafana dashboard

A TGI server deployment can easily be monitored through a Grafana dashboard that consumes data collected by Prometheus. Examples of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc.

In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.



## Setup on the server machine

First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by the Prometheus monitoring server.

In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.

On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow the [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:

```
wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
cd prometheus
```

Prometheus needs to be configured to listen on TGI's port. To do so, in the Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
```
static_configs:
  - targets: ["0.0.0.0:80"]
```
to use the correct IP address and port.

We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the correct IP and port are configured.

Once Prometheus is configured, the Prometheus server can be launched on the same machine where TGI is launched:
```
./prometheus --config.file="prometheus.yml"
```

In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (by default 9090) to the local computer. To do so, we can for example:
* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
* Use ngrok port tunneling

For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.

For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
```bash
ngrok http http://0.0.0.0:9090
```

As a sanity check, one can make sure that the Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine.

## Setup on the monitoring machine

Monitoring is typically done on a different machine than the server. We use a Grafana dashboard to monitor TGI's server usage.

Two options are available:
* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
* Self-host a Grafana dashboard.

In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:

```bash
wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
cd grafana-11.0.0
./bin/grafana-server
```

Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.

Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, which exposes the Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).

Once the Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.

Community-contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).

Load your dashboard configuration, and your TGI dashboard should be ready to go!
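Once Prometheus is scraping TGI, one quick way to confirm data is flowing is to query the Prometheus HTTP API directly. A minimal sketch, assuming the port forwarding described in the new tutorial above exposes Prometheus on `localhost:9090`; `up` is Prometheus's built-in target-health metric (illustrative, not part of the PR):

```python
import requests

# Query Prometheus's HTTP API for the scrape-target health metric.
resp = requests.get(
    "http://localhost:9090/api/v1/query",
    params={"query": "up"},
    timeout=10,
)
resp.raise_for_status()

for result in resp.json()["data"]["result"]:
    target = result["metric"].get("instance", "unknown")
    value = result["value"][1]  # "1" means the target was scraped successfully
    print(f"{target}: up={value}")
```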
docs/source/basic_tutorials/train_medusa.md — new file, 208 lines:

# Train Medusa

This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation.md) for more information on how Medusa works and speculation in general.

## What are the benefits of training a Medusa model?

Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.

One of the most important things is to have a good dataset (with data similar to what will be used in production), because Medusa has a much higher hit-rate when the generation is in-domain.

If you train Medusa on a dataset that is very different from the one you will use in production, the model will not be able to predict the future tokens accurately, and consequently the speedup will be minimal or non-existent.

## Self-distillation (Generating data for training)

There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.

Essentially, you prompt the model with an input similar to what you will use in production, and the model will generate the output.

We'll use this output to help train the Medusa heads to predict the `n+1`, `n+2`, `n+3`, etc. tokens in the sequence.

## Training

The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa), and we'll follow a very similar process to train the model as described in the original repository.

### Getting Started

There are two methods for training the model:

- `torchrun`, a wrapper around `torch.distributed.launch`
- a forked version of `axolotl` that supports Medusa

In this tutorial we'll use `torchrun`, as it is the most straightforward approach, but similar steps can be followed using `axolotl` if you prefer.

### Training with `torchrun`

```bash
mkdir medusa-training
cd medusa-training

pyenv install 3.10
pyenv local 3.10

uv venv -p 3.10
source .venv/bin/activate
```

Now let's clone the original `Medusa` repository and install the library.

```bash
git clone https://github.com/FasterDecoding/Medusa.git
cd Medusa
pip install -e .
```

Next we'll need some data to train on; we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.

```bash
apt install git-lfs
git lfs install
git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
```

Currently our directory structure looks like this:

```bash
.
├── assets
├── CITATION.cff
├── create_data.py
├── data_generation
├── deepspeed.json
├── last_run_prepared
├── LICENSE
├── llm_judge
├── medusa
├── medusa_llm.egg-info
├── mistral.json
├── notebooks
├── pyproject.toml
├── README.md
├── ROADMAP.md
├── scripts
├── ShareGPT_Vicuna_unfiltered
│   ├── README.md
│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
├── simple_gradio_interface.py
├── tiny-llama.json
└── vicuna_7b_qlora_stage1
```

## Start Training

Now let's generate the data and start training the model. This process will take a while since we are generating data from the model.

First make sure you have an instance of TGI running with the model you want to use for self-distillation.

```bash
model=HuggingFaceH4/zephyr-7b-beta
volume=/home/ubuntu/.cache/huggingface/hub/

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
```

Now we can generate the data using the `create_data.py` script.

```bash
python create_data.py \
    --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
    --output-filename zephyr_self_distill.json
```

At this point our terminal should look like this:

<div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
        width="550"
    />
</div>

> Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process; you should have a much larger dataset for training.

Now we can finally get to the fun part and start training the model!

Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file.

> NOTE: If you just self-distilled, you may still have the model running; make sure to stop it before starting the training in order to allow all of the resources to be used for training.

```bash
WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
    --model_name_or_path HuggingFaceH4/zephyr-7b-beta \
    --data_path zephyr_self_distill.json \
    --bf16 True \
    --output_dir zephyr_out \
    --num_train_epochs 5 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "no" \
    --learning_rate 1e-3 \
    --weight_decay 0.0 \
    --warmup_ratio 0.1 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --lazy_preprocess True \
    --medusa_num_heads 3 \
    --medusa_num_layers 1 \
    --deepspeed deepspeed.json
```

<div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
        width="550"
    />
</div>

If successful, you should see output similar to the one below:

```bash
wandb: Run history:
wandb:                         train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb:                   train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb:                 train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
wandb:                          train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
wandb:                  train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb:                  train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
wandb:                  train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
wandb:                  train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
wandb:                  train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb:                  train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
wandb:                    train/total_flos ▁
wandb:                    train/train_loss ▁
wandb:                 train/train_runtime ▁
wandb:      train/train_samples_per_second ▁
wandb:        train/train_steps_per_second ▁
wandb:
wandb: Run summary:
wandb:                         train/epoch 2.0
wandb:                   train/global_step 16
wandb:                 train/learning_rate 0.0
wandb:                          train/loss 14.8906
wandb:                  train/medusa0_loss 4.25
wandb:                  train/medusa0_top1 0.28809
wandb:                  train/medusa1_loss 4.8125
wandb:                  train/medusa1_top1 0.22727
wandb:                  train/medusa2_loss 5.5
wandb:                  train/medusa2_top1 0.17293
wandb:                    train/total_flos 0.0
wandb:                    train/train_loss 23.98242
wandb:                 train/train_runtime 396.9266
wandb:      train/train_samples_per_second 2.519
wandb:        train/train_steps_per_second 0.04
```

Last, and most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.

```bash
python -m medusa.hf_utils \
    --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
    --repo drbh/zephyr_medusa_demo
```

Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
@ -2,7 +2,7 @@

Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.

These features are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!

_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
@ -74,6 +74,45 @@ curl localhost:3000/generate \

```

### Hugging Face Hub Python Library

The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

schema = {
    "properties": {
        "location": {"title": "Location", "type": "string"},
        "activity": {"title": "Activity", "type": "string"},
        "animals_seen": {
            "maximum": 5,
            "minimum": 1,
            "title": "Animals Seen",
            "type": "integer",
        },
        "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
    },
    "required": ["location", "activity", "animals_seen", "animals"],
    "title": "Animals",
    "type": "object",
}

user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
    f"convert to JSON: '{user_input}'. please use the following schema: {schema}",
    max_new_tokens=100,
    seed=42,
    grammar={"type": "json", "value": schema},
)

print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```

A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.

> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
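
To see the effect of grammar caching described in the note above, you can time two identical requests. This is a minimal sketch that assumes the same local server as the examples here; the actual timings will depend on the grammar and the model.

```python
import time

from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

# Any JSON schema works here; a small one keeps the example short.
schema = {
    "properties": {"location": {"type": "string"}},
    "required": ["location"],
    "type": "object",
}

for attempt in range(2):
    start = time.perf_counter()
    client.text_generation(
        "convert to JSON: I went to the park",
        max_new_tokens=50,
        grammar={"type": "json", "value": schema},
    )
    elapsed = time.perf_counter() - start
    # The first call pays the grammar compilation cost; the second reuses the cached grammar.
    print(f"request {attempt + 1}: {elapsed:.2f}s")
```
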
@ -83,134 +122,55 @@ A grammar can be defined using Pydantic models, JSON schemas, or regular express

Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.

```python
import requests
from pydantic import BaseModel, conint
from typing import List


class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]

prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park"

data = {
    "inputs": prompt,
    "parameters": {
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": Animals.schema()
        }
    }
}

headers = {
    "Content-Type": "application/json",
}

response = requests.post(
    'http://127.0.0.1:3000/generate',
    headers=headers,
    json=data
)
print(response.json())
# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'}
```

### JSON Schema Integration

If Pydantic's not your style, go raw with direct JSON Schema integration. This is similar to the first example but with programmatic control.

```python
import requests

json_schema = {
    "properties": {
        "location": {
            "type": "string"
        },
        "activity": {
            "type": "string"
        },
        "animals_seen": {
            "type": "integer",
            "minimum": 1,
            "maximum": 5
        },
        "animals": {
            "type": "array",
            "items": {
                "type": "string"
            }
        }
    },
    "required": ["location", "activity", "animals_seen", "animals"]
}

data = {
    "inputs": "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park",
    "parameters": {
        "max_new_tokens": 200,
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": json_schema
        }
    }
}

headers = {
    "Content-Type": "application/json",
}

response = requests.post(
    'http://127.0.0.1:3000/generate',
    headers=headers,
    json=data
)
print(response.json())
# {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n , "animals_seen": 3,\n "location":"park"}'}
```

### Using the client

TGI provides a client library that makes it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter.

```python
from text_generation import AsyncClient
from text_generation.types import GrammarType

# NOTE: tools defined above and removed for brevity

# Define an async function to encapsulate the async operation
async def main():
    client = AsyncClient(base_url="http://localhost:3000")

    # Use 'await' to wait for the async method 'generate' to complete
    response = await client.generate(
        "Whats Googles DNS",
        max_new_tokens=10,
        decoder_input_details=True,
        seed=1,
        grammar={
            "type": GrammarType.Regex,
            "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
        },
    )

    # Once the response is received, you can process it
    print(response.generated_text)

# Ensure the main async function is run in the event loop
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())

# 118.8.0.84
```

```python
from huggingface_hub import InferenceClient
from pydantic import BaseModel, conint
from typing import List


class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]


client = InferenceClient("http://localhost:3000")

user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
    f"convert to JSON: '{user_input}'. please use the following schema: {Animals.schema()}",
    max_new_tokens=100,
    seed=42,
    grammar={"type": "json", "value": Animals.schema()},
)

print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
```

defining a grammar as regular expressions

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"

resp = client.text_generation(
    f"Whats Googles DNS? Please use the following regex: {regexp}",
    seed=42,
    grammar={
        "type": "regex",
        "value": regexp,
    },
)

print(resp)
# 7.1.1.1
```
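
Because the output is constrained by the regular expression itself, you can also verify the guarantee client-side. The minimal sketch below reuses the IPv4 pattern from the example above; the `resp` value stands in for whatever text the server actually returned.

```python
import re

# The same IPv4 pattern used as the grammar above.
regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"

resp = "7.1.1.1"  # stand-in for the generated text returned by the server

# fullmatch succeeds exactly when the whole generation conforms to the grammar.
assert re.fullmatch(regexp, resp) is not None
print("generation matches the regex grammar")
```
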
@ -265,107 +225,87 @@ curl localhost:3000/v1/chat/completions \

// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
```

### Text Generation Inference Client

TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions.

```python
from text_generation import AsyncClient

# NOTE: tools defined above and removed for brevity

# Define an async function to encapsulate the async operation
async def main():
    client = AsyncClient(base_url="http://localhost:3000")

    # Use 'await' to wait for the async method 'chat' to complete
    response = await client.chat(
        max_tokens=100,
        seed=1,
        tools=tools,
        presence_penalty=-1.1,
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )

    # Once the response is received, you can process it
    print(response.choices[0].message.tool_calls)

# Ensure the main async function is run in the event loop
if __name__ == "__main__":
    import asyncio
    asyncio.run(main())

# {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}}
```

<details>
<summary>Tools used in example above</summary>

```python
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                },
                "required": ["location", "format"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_n_day_weather_forecast",
            "description": "Get an N-day weather forecast",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                    "num_days": {
                        "type": "integer",
                        "description": "The number of days to forecast",
                    },
                },
                "required": ["location", "format", "num_days"],
            },
        },
    }
]
```

</details>

### Chat Completion with Tools

Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                },
                "required": ["location", "format"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_n_day_weather_forecast",
            "description": "Get an N-day weather forecast",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                    "num_days": {
                        "type": "integer",
                        "description": "The number of days to forecast",
                    },
                },
                "required": ["location", "format", "num_days"],
            },
        },
    },
]

chat = client.chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You're a helpful assistant! Answer the users question best you can.",
        },
        {
            "role": "user",
            "content": "What is the weather like in Brooklyn, New York?",
        },
    ],
    tools=tools,
    seed=42,
    max_tokens=100,
)

print(chat.choices[0].message.tool_calls)
# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
```
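
The client only returns the tool call; executing it is up to your application. The sketch below shows one way to dispatch the parsed call to local Python functions. The two local implementations are hypothetical stand-ins; only the tool names and the argument shapes come from the example above.

```python
# Hypothetical local implementations matching the tool definitions above.
def get_current_weather(location: str, format: str) -> str:
    return f"It is sunny in {location} (unit: {format})."

def get_n_day_weather_forecast(location: str, format: str, num_days: int) -> str:
    return f"{num_days}-day forecast for {location}: mild, reported in {format}."

DISPATCH = {
    "get_current_weather": get_current_weather,
    "get_n_day_weather_forecast": get_n_day_weather_forecast,
}

def run_tool_calls(tool_calls) -> list:
    results = []
    for call in tool_calls or []:
        handler = DISPATCH.get(call.function.name)
        if handler is None:
            continue  # the model picked a tool we don't implement
        # `arguments` is already a dict in the huggingface_hub output shown above.
        results.append(handler(**call.function.arguments))
    return results

# Usage with the `chat` response from the previous example:
# print(run_tool_calls(chat.choices[0].message.tool_calls))
```
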
### OpenAI integration

TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
@ -53,7 +53,67 @@ for token in client.text_generation(prompt, max_new_tokens=10, stream=True):

# This is a picture of an anthropomorphic rabbit in a space suit.
```

If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text.
or via the `chat_completion` endpoint:

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://127.0.0.1:3000")

chat = client.chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    seed=42,
    max_tokens=100,
)

print(chat)
# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
```

or with OpenAI's library:

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    stream=False,
)

print(chat_completion)
# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
```

### Inference Through Sending `cURL` Requests
@ -76,7 +76,7 @@ There are two main ways to use guidance; you can either use the `/generate` endp

Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.
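
As a rough mental model only (this is not TGI's actual internal representation), you can think of the tool list as being lowered to a single JSON-schema grammar whose alternatives are the tool signatures plus a "no tool" branch. The sketch below builds such a schema from tool definitions shaped like the ones used earlier in this guide.

```python
def tools_to_grammar_schema(tools: list) -> dict:
    """Illustrative only: fold tool definitions into one JSON schema with
    one branch per tool and an extra branch for answering without a tool."""
    branches = []
    for tool in tools:
        fn = tool["function"]
        branches.append(
            {
                "type": "object",
                "properties": {
                    "name": {"const": fn["name"]},
                    "arguments": fn["parameters"],
                },
                "required": ["name", "arguments"],
            }
        )
    # "No tool" branch: plain text content instead of a function call.
    branches.append(
        {
            "type": "object",
            "properties": {"content": {"type": "string"}},
            "required": ["content"],
        }
    )
    return {"oneOf": branches}
```
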
Please refer to [using guidance](../basic_tutorial/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.

### Getting the most out of guidance

@ -27,7 +27,7 @@ You can check a few existing fine-tunes for popular models:

- [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)

In order to create your own medusa heads for your own finetune, you should check out the original medusa repo. [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa)
In order to create your own medusa heads for your own finetune, you should check out the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md)

In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.
@ -1,6 +1,10 @@

# Installation
# Installation from source

This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.**
<Tip warning={true}>

Installing TGI from source is not the recommended usage. We strongly recommend using TGI through Docker; check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker.

</Tip>

## Install CLI
docs/source/installation_amd.md (new file, 38 lines)
@ -0,0 +1,38 @@

# Using TGI with AMD GPUs

TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.

On a server powered by AMD GPUs, TGI can be launched with the following command:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.0.3-rocm \
--model-id $model
```

The launched TGI server can then be queried from clients; make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.

## TunableOp

TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which performs an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.

Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.

TunableOp is enabled by default; the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launching TGI's docker container.

## Flash attention implementation

Two implementations of Flash Attention are available for ROCm: the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/flash_attn_triton.py).

By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, the Flash Attention Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.

## Unsupported features

The following features are currently not supported in the ROCm version of TGI, and the support may be extended in the future:
* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
* Kernel for sliding window attention (Mistral)
docs/source/installation_gaudi.md (new file, 3 lines)
@ -0,0 +1,3 @@

# Using TGI with Intel Gaudi

Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).

docs/source/installation_inferentia.md (new file, 3 lines)
@ -0,0 +1,3 @@

# Using TGI with Inferentia

Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.

docs/source/installation_nvidia.md (new file, 18 lines)
@ -0,0 +1,18 @@

# Using TGI with Nvidia GPUs

TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it.

For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.

TGI can be used on NVIDIA GPUs through its official docker image:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.0.3 \
--model-id $model
```

The launched TGI server can then be queried from clients; make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
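
For instance, once the container above is up, a minimal sketch of querying it from Python with the `huggingface_hub` client looks like this; the prompt and generation parameters are just placeholders.

```python
from huggingface_hub import InferenceClient

# The docker command above maps port 8080 on the host to the server's port 80.
client = InferenceClient("http://localhost:8080")

output = client.text_generation(
    "What is Deep Learning?",  # placeholder prompt
    max_new_tokens=64,
)
print(output)
```
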
@ -2,30 +2,27 @@

The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).

Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI. Here is an example of how to do that:
## Launching TGI

Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example of how to do that:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:2.0.3 \
    --model-id $model
```

<Tip warning={true}>
### Supported hardware

To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.

</Tip>
## Consuming TGI

TGI also supports ROCm-enabled AMD GPUs (only MI210 and MI250 are tested), details are available in the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html). To launch TGI on ROCm GPUs, please use instead:

```bash
docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model
```

Once TGI is running, you can use the `generate` endpoint by sending requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.

<inferencesnippet>
<python>

@ -91,7 +88,7 @@ curl 127.0.0.1:8080/generate \

To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

```bash
docker run ghcr.io/huggingface/text-generation-inference:1.4 --help
docker run ghcr.io/huggingface/text-generation-inference:2.0.3 --help
```

</Tip>
@ -1,29 +1,36 @@

# Supported Models and Hardware

Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models and hardware are supported.

## Supported Models

The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them.

- [BLOOM](https://huggingface.co/bigscience/bloom)
- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Llama](https://github.com/facebookresearch/llama)
- [OPT](https://huggingface.co/facebook/opt-66b)
- [SantaCoder](https://huggingface.co/bigcode/santacoder)
- [Starcoder](https://huggingface.co/bigcode/starcoder)
- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
- [Code Llama](https://huggingface.co/codellama)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [Phi](https://huggingface.co/microsoft/phi-2)
- [Idefics](HuggingFaceM4/idefics-9b-instruct) (Multimodal)
- [Llava-next](llava-hf/llava-v1.6-mistral-7b-hf) (Multimodal)

- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
- [Phi](https://huggingface.co/microsoft/phi-1_5)
- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
- [Qwen 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
- [Opt](https://huggingface.co/facebook/opt-6.7b)
- [T5](https://huggingface.co/google/flan-t5-xxl)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
- [SantaCoder](https://huggingface.co/bigcode/santacoder)
- [Bloom](https://huggingface.co/bigscience/bloom-560m)
- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
- [Gpt2](https://huggingface.co/openai-community/gpt2)
- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)

If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:

@ -38,18 +45,4 @@ If you wish to serve a supported model that already exists on a local folder, ju

```bash
text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
``````
```

## Supported Hardware

TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.

TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention, GPTQ quantization, and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the support may be extended in the future:
* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
* Flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm)
* Kernel for sliding window attention (Mistral)

TGI is also supported on the following AI hardware accelerators:
- *Habana first-gen Gaudi and Gaudi2:* check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
* *AWS Inferentia2:* check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
integration-tests/images/cow_beach.png (new binary file, 66 KiB; not shown)

@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 2061, "logprob": null, "text": "What" },
      { "id": 318, "logprob": -3.1835938, "text": " is" },
      { "id": 2769, "logprob": -9.171875, "text": " deep" },
      { "id": 4673, "logprob": -1.6425781, "text": " learning" },
      { "id": 30, "logprob": -0.7314453, "text": "?" }
    ],
    "seed": null,
    "tokens": [
      { "id": 198, "logprob": -0.68603516, "special": false, "text": "\n" },
      { "id": 198, "logprob": -0.005393982, "special": false, "text": "\n" },
      { "id": 29744, "logprob": -0.31079102, "special": false, "text": "Deep" },
      { "id": 4673, "logprob": -0.08300781, "special": false, "text": " learning" },
      { "id": 318, "logprob": -0.58984375, "special": false, "text": " is" },
      { "id": 257, "logprob": -0.953125, "special": false, "text": " a" },
      { "id": 649, "logprob": -2.0957031, "special": false, "text": " new" },
      { "id": 2214, "logprob": -1.8095703, "special": false, "text": " field" },
      { "id": 286, "logprob": -1.0673828, "special": false, "text": " of" },
      { "id": 2267, "logprob": -0.9375, "special": false, "text": " research" }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a new field of research"
}

@ -0,0 +1,398 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2061, "logprob": null, "text": "What" },
        { "id": 318, "logprob": -3.1835938, "text": " is" },
        { "id": 2769, "logprob": -9.171875, "text": " deep" },
        { "id": 4673, "logprob": -1.6425781, "text": " learning" },
        { "id": 30, "logprob": -0.7314453, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 198, "logprob": -0.68603516, "special": false, "text": "\n" },
        { "id": 198, "logprob": -0.005672455, "special": false, "text": "\n" },
        { "id": 29744, "logprob": -0.3251953, "special": false, "text": "Deep" },
        { "id": 4673, "logprob": -0.08294678, "special": false, "text": " learning" },
        { "id": 318, "logprob": -0.5854492, "special": false, "text": " is" },
        { "id": 257, "logprob": -0.9423828, "special": false, "text": " a" },
        { "id": 649, "logprob": -2.0800781, "special": false, "text": " new" },
        { "id": 2214, "logprob": -1.8369141, "special": false, "text": " field" },
        { "id": 286, "logprob": -1.0683594, "special": false, "text": " of" },
        { "id": 2267, "logprob": -0.9711914, "special": false, "text": " research" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2061, "logprob": null, "text": "What" },
        { "id": 318, "logprob": -3.1660156, "text": " is" },
        { "id": 2769, "logprob": -9.1796875, "text": " deep" },
        { "id": 4673, "logprob": -1.6376953, "text": " learning" },
        { "id": 30, "logprob": -0.72216797, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 198, "logprob": -0.7089844, "special": false, "text": "\n" },
        { "id": 198, "logprob": -0.0054779053, "special": false, "text": "\n" },
        { "id": 29744, "logprob": -0.3190918, "special": false, "text": "Deep" },
        { "id": 4673, "logprob": -0.08319092, "special": false, "text": " learning" },
        { "id": 318, "logprob": -0.5839844, "special": false, "text": " is" },
        { "id": 257, "logprob": -0.9506836, "special": false, "text": " a" },
        { "id": 649, "logprob": -2.0878906, "special": false, "text": " new" },
        { "id": 2214, "logprob": -1.8496094, "special": false, "text": " field" },
        { "id": 286, "logprob": -1.0673828, "special": false, "text": " of" },
        { "id": 2267, "logprob": -0.9370117, "special": false, "text": " research" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2061, "logprob": null, "text": "What" },
        { "id": 318, "logprob": -3.1660156, "text": " is" },
        { "id": 2769, "logprob": -9.1796875, "text": " deep" },
        { "id": 4673, "logprob": -1.6376953, "text": " learning" },
        { "id": 30, "logprob": -0.72216797, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 198, "logprob": -0.7089844, "special": false, "text": "\n" },
        { "id": 198, "logprob": -0.0054779053, "special": false, "text": "\n" },
        { "id": 29744, "logprob": -0.3190918, "special": false, "text": "Deep" },
        { "id": 4673, "logprob": -0.08319092, "special": false, "text": " learning" },
        { "id": 318, "logprob": -0.5839844, "special": false, "text": " is" },
        { "id": 257, "logprob": -0.9506836, "special": false, "text": " a" },
        { "id": 649, "logprob": -2.0878906, "special": false, "text": " new" },
        { "id": 2214, "logprob": -1.8496094, "special": false, "text": " field" },
        { "id": 286, "logprob": -1.0673828, "special": false, "text": " of" },
        { "id": 2267, "logprob": -0.9370117, "special": false, "text": " research" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2061, "logprob": null, "text": "What" },
        { "id": 318, "logprob": -3.1660156, "text": " is" },
        { "id": 2769, "logprob": -9.1796875, "text": " deep" },
        { "id": 4673, "logprob": -1.6376953, "text": " learning" },
        { "id": 30, "logprob": -0.72216797, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 198, "logprob": -0.7089844, "special": false, "text": "\n" },
        { "id": 198, "logprob": -0.0054779053, "special": false, "text": "\n" },
        { "id": 29744, "logprob": -0.3190918, "special": false, "text": "Deep" },
        { "id": 4673, "logprob": -0.08319092, "special": false, "text": " learning" },
        { "id": 318, "logprob": -0.5839844, "special": false, "text": " is" },
        { "id": 257, "logprob": -0.9506836, "special": false, "text": " a" },
        { "id": 649, "logprob": -2.0878906, "special": false, "text": " new" },
        { "id": 2214, "logprob": -1.8496094, "special": false, "text": " field" },
        { "id": 286, "logprob": -1.0673828, "special": false, "text": " of" },
        { "id": 2267, "logprob": -0.9370117, "special": false, "text": " research" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  }
]

@ -0,0 +1,25 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 2,
    "prefill": [],
    "seed": null,
    "tokens": [
      { "id": 54901, "logprob": -0.72753906, "special": false, "text": "beach" },
      { "id": 1, "logprob": -0.011009216, "special": true, "text": "<eos>" }
    ],
    "top_tokens": null
  },
  "generated_text": "beach"
}
integration-tests/models/test_flash_gpt2.py (new file, 44 lines)
@ -0,0 +1,44 @@
import pytest


@pytest.fixture(scope="module")
def flash_gpt2_handle(launcher):
    with launcher("openai-community/gpt2", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gpt2(flash_gpt2_handle):
    await flash_gpt2_handle.health(300)
    return flash_gpt2_handle.client


@pytest.mark.asyncio
async def test_flash_gpt2(flash_gpt2, response_snapshot):
    response = await flash_gpt2.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot):
    responses = await generate_load(
        flash_gpt2,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert len(generated_texts) == 4
    assert all(
        [text == generated_texts[0] for text in generated_texts]
    ), generated_texts

    assert responses == response_snapshot

integration-tests/models/test_flash_pali_gemma.py (new file, 39 lines)
@ -0,0 +1,39 @@
import pytest
import requests
import io
import base64


@pytest.fixture(scope="module")
def flash_pali_gemma_handle(launcher):
    with launcher(
        "google/paligemma-3b-pt-224",
        num_shard=1,
        revision="float16",
        max_input_length=4000,
        max_total_tokens=4096,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_pali_gemma(flash_pali_gemma_handle):
    await flash_pali_gemma_handle.health(300)
    return flash_pali_gemma_handle.client


def get_cow_beach():
    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
    cow = get_cow_beach()
    # Embed the base64-encoded image in the prompt so the model actually sees it.
    inputs = f"![]({cow})Where is the cow standing?\n"
    response = await flash_pali_gemma.generate(inputs, max_new_tokens=20)

    assert response.generated_text == "beach"
    assert response == response_snapshot
@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] }
once_cell = "1.19.0"
serde = { version = "1.0.188", features = ["derive"] }
serde_json = "1.0.107"
thiserror = "1.0.59"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
@ -18,14 +18,33 @@ use std::thread;
use std::thread::sleep;
use std::time::{Duration, Instant};
use std::{fs, io};
use tracing_subscriber::EnvFilter;
use thiserror::Error;
use tracing_subscriber::{filter::LevelFilter, EnvFilter};

mod env_runtime;

#[derive(Deserialize)]
struct RawConfig {
    max_position_embeddings: Option<usize>,
    n_positions: Option<usize>,
    max_seq_len: Option<usize>,
}

#[derive(Deserialize)]
struct Config {
    max_position_embeddings: Option<usize>,
    max_seq_len: Option<usize>,
}

#[derive(Deserialize)]
struct Config {
    max_position_embeddings: Option<usize>,
}

impl From<RawConfig> for Config {
    fn from(other: RawConfig) -> Self {
        let max_position_embeddings = other
            .max_position_embeddings
            .or(other.max_seq_len)
            .or(other.n_positions);
        Config {
            max_position_embeddings,
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum)]
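
The `RawConfig` → `Config` conversion above lets the launcher read a model's maximum sequence length from whichever key the checkpoint's `config.json` happens to use. For illustration only (this helper is not part of TGI), the same fallback chain looks like this in Python:

```python
import json

def max_position_embeddings(config_path: str):
    """Mirror the launcher's fallback: max_position_embeddings, then max_seq_len, then n_positions."""
    with open(config_path) as f:
        cfg = json.load(f)
    return (
        cfg.get("max_position_embeddings")
        or cfg.get("max_seq_len")
        or cfg.get("n_positions")
    )

# Example: max_position_embeddings("config.json")
```
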
@ -453,6 +472,7 @@ fn shard_manager(
    max_total_tokens: usize,
    max_batch_size: Option<usize>,
    otlp_endpoint: Option<String>,
    log_level: LevelFilter,
    status_sender: mpsc::Sender<ShardStatus>,
    shutdown: Arc<AtomicBool>,
    _shutdown_sender: mpsc::Sender<()>,

@ -475,7 +495,7 @@ fn shard_manager(
        "--uds-path".to_string(),
        uds_path,
        "--logger-level".to_string(),
        "INFO".to_string(),
        log_level.to_string().to_uppercase(),
        "--json-output".to_string(),
    ];
@@ -755,13 +775,13 @@ struct PythonLogMessage {
 impl PythonLogMessage {
     fn trace(&self) {
         match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
         }
     }
 }
@@ -823,26 +843,26 @@ fn find_num_shards(
     Ok(num_shard)
 }
 
-#[derive(Debug)]
+#[derive(Debug, Error)]
 enum LauncherError {
+    #[error("Invalid argument: {0}")]
     ArgumentValidation(String),
+    #[error("not enough cuda devices: {0}")]
     NotEnoughCUDADevices(String),
+    #[error("Download error")]
     DownloadError,
+    #[error("Shard cannot start")]
     ShardCannotStart,
+    #[error("Shard disconnected")]
     ShardDisconnected,
+    #[error("Shard failed")]
     ShardFailed,
+    #[error("Webserver failed")]
     WebserverFailed,
+    #[error("Webserver cannot start")]
     WebserverCannotStart,
 }
 
-impl core::fmt::Display for LauncherError {
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "{self:?}")
-    }
-}
-
-impl std::error::Error for LauncherError {}
-
 fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherError> {
     // Enter download tracing span
     let _span = tracing::span!(tracing::Level::INFO, "download").entered();
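As an aside, not from the diff: a minimal sketch of what the `thiserror` derive buys here, assuming a `thiserror = "1"` dependency. The `#[error(...)]` attributes generate the `Display` and `std::error::Error` impls that the removed hand-written blocks used to provide.

use thiserror::Error;

#[derive(Debug, Error)]
enum LauncherError {
    #[error("Invalid argument: {0}")]
    ArgumentValidation(String),
    #[error("Webserver failed")]
    WebserverFailed,
}

fn main() {
    // Display comes from the #[error] strings, no manual impl needed.
    let err = LauncherError::ArgumentValidation("bad flag".to_string());
    assert_eq!(err.to_string(), "Invalid argument: bad flag");
}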
@@ -981,6 +1001,7 @@ fn spawn_shards(
     args: &Args,
     cuda_graphs: Vec<usize>,
     max_total_tokens: usize,
+    max_log_level: LevelFilter,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
     shutdown_sender: mpsc::Sender<()>,
@@ -1038,6 +1059,7 @@ fn spawn_shards(
             max_total_tokens,
             max_batch_size,
             otlp_endpoint,
+            max_log_level,
             status_sender,
             shutdown,
             shutdown_sender,
@@ -1278,8 +1300,22 @@ fn main() -> Result<(), LauncherError> {
     let args: Args = Args::parse();
 
     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override to avoid simple logs to be spammed with tokio level informations
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
+    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);
 
     if args.json_output {
         tracing_subscriber::fmt()
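A hedged aside, not part of the commit: the same `EnvFilter` builder pattern in isolation, assuming `tracing-subscriber` with the `env-filter` feature. Directives that fail to parse are dropped by `parse_lossy`, and the default INFO directive fills in for everything else.

use tracing_subscriber::filter::{EnvFilter, LevelFilter};

fn build_filter(log_level: &str) -> EnvFilter {
    // Restrict the common levels to the launcher/router targets, as the diff does,
    // so third-party crates do not flood the output.
    let directives = match log_level {
        "debug" => "text_generation_launcher=debug,text_generation_router=debug",
        other => other,
    };
    EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .parse_lossy(directives)
}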
@@ -1322,33 +1358,30 @@ fn main() -> Result<(), LauncherError> {
         };
 
         let content = std::fs::read_to_string(filename)?;
-        let config: Config = serde_json::from_str(&content)?;
+        let config: RawConfig = serde_json::from_str(&content)?;
+        let config: Config = config.into();
 
         // Quantization usually means you're even more RAM constrained.
         let max_default = 4096;
 
-        let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) {
-            (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
-                if max_position_embeddings > max_default {
-                    let max = max_position_embeddings;
-                    if args.max_input_tokens.is_none()
-                        && args.max_total_tokens.is_none()
-                        && args.max_batch_prefill_tokens.is_none()
-                    {
-                        tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
-                    }
-                    max_default
-                } else {
-                    max_position_embeddings
-                }
-            }
-            _ => {
-                return Err(Box::new(LauncherError::ArgumentValidation(
-                    "no max defined".to_string(),
-                )));
-            }
-        };
-        Ok(max_position_embeddings)
+        if let Some(max_position_embeddings) = config.max_position_embeddings {
+            if max_position_embeddings > max_default {
+                let max = max_position_embeddings;
+                if args.max_input_tokens.is_none()
+                    && args.max_total_tokens.is_none()
+                    && args.max_batch_prefill_tokens.is_none()
+                {
+                    tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
+                }
+                Ok(max_default)
+            } else {
+                Ok(max_position_embeddings)
+            }
+        } else {
+            Err(Box::new(LauncherError::ArgumentValidation(
+                "no max defined".to_string(),
+            )))
+        }
     };
     let max_position_embeddings: usize = get_max_position_embeddings().unwrap_or(4096);
@@ -1504,6 +1537,7 @@ fn main() -> Result<(), LauncherError> {
         &args,
         cuda_graphs,
         max_total_tokens,
+        max_log_level,
         shutdown.clone(),
        &shutdown_receiver,
        shutdown_sender,
load_tests/Makefile (new file, 9 lines)
@@ -0,0 +1,9 @@
+ShareGPT_V3_unfiltered_cleaned_split.json:
+	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
+	python filter.py
+
+prepare_orca:
+	python orca.py
load_tests/filter.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+import json
+
+
+def main():
+    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
+        data = json.load(f)
+
+    # Select only the first 2k conversations that start with a human.
+    max = 2000
+    conversations = []
+    for conversation in data:
+        conv = conversation.get("conversations")
+        if conv and conv[0]["from"] == "human":
+            # Trim the rest of the output
+            conversation["conversations"] = conversation["conversations"][:1]
+            conversations.append(conversation)
+
+        if len(conversations) >= max:
+            break
+
+    with open("./small.json", "w") as f:
+        data = json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
load_tests/orca.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+import json
+import datasets
+import tqdm
+
+
+def main():
+    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
+    # Select only the first 2k conversations that start with a human.
+    max = min(2000, len(dataset))
+    conversations = []
+    for item in tqdm.tqdm(dataset, total=max):
+        conversation = {
+            "conversations": [
+                {"from": "human", "value": item["question"]},
+            ],
+            "id": item["id"],
+        }
+        conversations.append(conversation)
+        if len(conversations) >= max:
+            break
+
+    with open("./small.json", "w") as f:
+        data = json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
@@ -1,63 +0,0 @@
-import {check} from 'k6';
-import http from 'k6/http';
-import {Trend} from 'k6/metrics';
-
-const host = __ENV.HOST || '127.0.0.1:3000';
-
-const totalTime = new Trend('total_time', true);
-const validationTime = new Trend('validation_time', true);
-const queueTime = new Trend('queue_time', true);
-const inferenceTime = new Trend('inference_time', true);
-const timePerToken = new Trend('time_per_token', true);
-
-const example = {
-    payload: JSON.stringify({
-        inputs: '# This is a fibonacci function written in the Python programming language.' +
-            'def fibonacci',
-        parameters: {
-            details: true,
-            max_new_tokens: 60,
-            temperature: 0.2,
-            top_p: 0.95,
-            seed: 0,
-        },
-    }),
-    generated_tokens: 60
-};
-
-export const options = {
-    thresholds: {
-        http_req_failed: ['rate==0'],
-        time_per_token: ['p(95)<90'],
-        queue_time: ['p(95)<1500'],
-    },
-    scenarios: {
-        load_test: {
-            executor: 'constant-arrival-rate',
-            duration: '60s',
-            preAllocatedVUs: 100,
-            rate: 10,
-            timeUnit: '1s',
-        },
-    },
-};
-
-export default function () {
-    const headers = {'Content-Type': 'application/json'};
-    const res = http.post(`http://${host}/generate`, example.payload, {
-        headers,
-    });
-
-    check(res, {
-        'Post status is 200': (r) => res.status === 200,
-        'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens,
-    });
-
-    if (res.status === 200) {
-        totalTime.add(res.headers["X-Total-Time"]);
-        validationTime.add(res.headers["X-Validation-Time"]);
-        queueTime.add(res.headers["X-Queue-Time"]);
-        inferenceTime.add(res.headers["X-Inference-Time"]);
-        timePerToken.add(res.headers["X-Time-Per-Token"]);
-    }
-}
@@ -110,6 +110,7 @@ impl Client {
         max_prefill_tokens: u32,
         max_total_tokens: u32,
         max_batch_size: Option<usize>,
+        model_id: &str
     ) -> Result<Option<u32>> {
         let warmup_enabled: bool = env::var("WARMUP_ENABLED").ok().map_or(true, |value| value.to_lowercase() == "true");
         if !warmup_enabled {
@ -152,25 +153,76 @@ impl Client {
|
|||||||
|
|
||||||
let mut batch_counter: u64 = 0;
|
let mut batch_counter: u64 = 0;
|
||||||
let mut request_counter: u64 = 0;
|
let mut request_counter: u64 = 0;
|
||||||
for shape in shapes.iter() {
|
if model_id.contains("llava") {
|
||||||
let (batch_size, seq_length) = shape;
|
let mut n_tokens = 0;
|
||||||
let mut batches: Vec<Batch> = vec");
|
||||||
seq_bucket_size,
|
inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
|
||||||
false,
|
|
||||||
None,
|
requests.push(Request {
|
||||||
)
|
id: 0,
|
||||||
];
|
// We truncate the input on the server side to be sure that it has the correct size
|
||||||
// if possible, create second batch in order to trigger concatenate operation
|
inputs,
|
||||||
if *batch_size < max_decode_batch_size {
|
truncate,
|
||||||
batches.push(
|
// Set sampling parameters to also take these ops into account in the max memory
|
||||||
|
parameters: Some(NextTokenChooserParameters {
|
||||||
|
temperature: 0.9,
|
||||||
|
top_k: 10,
|
||||||
|
top_p: 0.9,
|
||||||
|
typical_p: 0.9,
|
||||||
|
do_sample: false,
|
||||||
|
seed: 0,
|
||||||
|
repetition_penalty: 1.2,
|
||||||
|
frequency_penalty: 0.1,
|
||||||
|
watermark: true,
|
||||||
|
grammar: String::new(),
|
||||||
|
grammar_type: GrammarType::None as i32,
|
||||||
|
}),
|
||||||
|
stopping_parameters: Some(StoppingCriteriaParameters {
|
||||||
|
max_new_tokens: max_total_tokens - truncate,
|
||||||
|
stop_sequences: vec![],
|
||||||
|
ignore_eos_token: true,
|
||||||
|
}),
|
||||||
|
prefill_logprobs: true,
|
||||||
|
top_n_tokens: 20,
|
||||||
|
});
|
||||||
|
n_tokens += max_input_length;
|
||||||
|
|
||||||
|
// Check max_batch_size
|
||||||
|
if Some(requests.len()) == max_batch_size {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut batches = Vec::new();
|
||||||
|
batches.push(Batch {
|
||||||
|
id: 0,
|
||||||
|
size: requests.len() as u32,
|
||||||
|
requests,
|
||||||
|
max_tokens: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
let request = tonic::Request::new(WarmupRequest {
|
||||||
|
batches,
|
||||||
|
max_input_length,
|
||||||
|
max_prefill_tokens,
|
||||||
|
max_total_tokens,
|
||||||
|
})
|
||||||
|
.inject_context();
|
||||||
|
let response = self.stub.warmup(request).await?.into_inner();
|
||||||
|
Ok(response.max_supported_total_tokens)
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
for shape in shapes.iter() {
|
||||||
|
let (batch_size, seq_length) = shape;
|
||||||
|
let mut batches: Vec<Batch> = vec![
|
||||||
self.create_warmup_batch(
|
self.create_warmup_batch(
|
||||||
(1, *seq_length),
|
*shape,
|
||||||
&mut batch_counter,
|
&mut batch_counter,
|
||||||
&mut request_counter,
|
&mut request_counter,
|
||||||
max_input_length,
|
max_input_length,
|
||||||
@ -179,56 +231,45 @@ impl Client {
|
|||||||
false,
|
false,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
);
|
];
|
||||||
|
// if possible, create second batch in order to trigger concatenate operation
|
||||||
|
if *batch_size < max_decode_batch_size {
|
||||||
|
batches.push(
|
||||||
|
self.create_warmup_batch(
|
||||||
|
(1, *seq_length),
|
||||||
|
&mut batch_counter,
|
||||||
|
&mut request_counter,
|
||||||
|
max_input_length,
|
||||||
|
max_total_tokens,
|
||||||
|
seq_bucket_size,
|
||||||
|
false,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let request = tonic::Request::new(WarmupRequest {
|
||||||
|
batches,
|
||||||
|
max_input_length,
|
||||||
|
max_prefill_tokens,
|
||||||
|
max_total_tokens,
|
||||||
|
}).inject_context();
|
||||||
|
let _response = self.stub.warmup(request).await?.into_inner();
|
||||||
}
|
}
|
||||||
|
|
||||||
let request = tonic::Request::new(WarmupRequest {
|
// send batches to warmup all possible decode shapes
|
||||||
batches,
|
if decode_batch_sizes.len() > 1 {
|
||||||
max_input_length,
|
let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size {
|
||||||
max_prefill_tokens,
|
decode_bucket_size
|
||||||
max_total_tokens,
|
} else {
|
||||||
}).inject_context();
|
decode_bucket_size.div_ceil(max_prefill_batch_size)
|
||||||
let _response = self.stub.warmup(request).await?.into_inner();
|
};
|
||||||
}
|
let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket;
|
||||||
|
|
||||||
// send batches to warmup all possible decode shapes
|
let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size);
|
||||||
if decode_batch_sizes.len() > 1 {
|
let mut batches: Vec<Batch> = vec![
|
||||||
let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size {
|
|
||||||
decode_bucket_size
|
|
||||||
} else {
|
|
||||||
decode_bucket_size.div_ceil(max_prefill_batch_size)
|
|
||||||
};
|
|
||||||
let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket;
|
|
||||||
|
|
||||||
let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size);
|
|
||||||
let mut batches: Vec<Batch> = vec![
|
|
||||||
self.create_warmup_batch(
|
|
||||||
(requests_send, seq_bucket_size),
|
|
||||||
&mut batch_counter,
|
|
||||||
&mut request_counter,
|
|
||||||
max_input_length,
|
|
||||||
max_total_tokens,
|
|
||||||
seq_bucket_size,
|
|
||||||
false,
|
|
||||||
Some(max_new_tokens),
|
|
||||||
)
|
|
||||||
];
|
|
||||||
|
|
||||||
let get_current_decode_batch_size = |num: u32| -> u32 {
|
|
||||||
decode_batch_sizes.iter()
|
|
||||||
.filter(|&&x| x >= num)
|
|
||||||
.min()
|
|
||||||
.copied()
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send);
|
|
||||||
while current_decode_batch_size < max_decode_batch_size {
|
|
||||||
let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send;
|
|
||||||
let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size);
|
|
||||||
batches.push(
|
|
||||||
self.create_warmup_batch(
|
self.create_warmup_batch(
|
||||||
(num_requests, seq_bucket_size),
|
(requests_send, seq_bucket_size),
|
||||||
&mut batch_counter,
|
&mut batch_counter,
|
||||||
&mut request_counter,
|
&mut request_counter,
|
||||||
max_input_length,
|
max_input_length,
|
||||||
@ -237,48 +278,74 @@ impl Client {
|
|||||||
false,
|
false,
|
||||||
Some(max_new_tokens),
|
Some(max_new_tokens),
|
||||||
)
|
)
|
||||||
);
|
];
|
||||||
|
|
||||||
requests_send += num_requests;
|
let get_current_decode_batch_size = |num: u32| -> u32 {
|
||||||
current_decode_batch_size = get_current_decode_batch_size(requests_send);
|
decode_batch_sizes.iter()
|
||||||
|
.filter(|&&x| x >= num)
|
||||||
|
.min()
|
||||||
|
.copied()
|
||||||
|
.unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send);
|
||||||
|
while current_decode_batch_size < max_decode_batch_size {
|
||||||
|
let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send;
|
||||||
|
let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size);
|
||||||
|
batches.push(
|
||||||
|
self.create_warmup_batch(
|
||||||
|
(num_requests, seq_bucket_size),
|
||||||
|
&mut batch_counter,
|
||||||
|
&mut request_counter,
|
||||||
|
max_input_length,
|
||||||
|
max_total_tokens,
|
||||||
|
seq_bucket_size,
|
||||||
|
false,
|
||||||
|
Some(max_new_tokens),
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
requests_send += num_requests;
|
||||||
|
current_decode_batch_size = get_current_decode_batch_size(requests_send);
|
||||||
|
}
|
||||||
|
|
||||||
|
let request = tonic::Request::new(WarmupRequest {
|
||||||
|
batches,
|
||||||
|
max_input_length,
|
||||||
|
max_prefill_tokens,
|
||||||
|
max_total_tokens,
|
||||||
|
}).inject_context();
|
||||||
|
let _response = self.stub.warmup(request).await?.into_inner();
|
||||||
}
|
}
|
||||||
|
|
||||||
let request = tonic::Request::new(WarmupRequest {
|
// send batches with default params to warm up Greedy search
|
||||||
batches,
|
let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len());
|
||||||
max_input_length,
|
for batch_size in &prefill_batch_sizes {
|
||||||
max_prefill_tokens,
|
greedy_shapes.push((*batch_size, seq_bucket_size.clone()));
|
||||||
max_total_tokens,
|
}
|
||||||
}).inject_context();
|
for greedy_shape in greedy_shapes.iter() {
|
||||||
let _response = self.stub.warmup(request).await?.into_inner();
|
let batches: Vec<Batch> = vec![
|
||||||
}
|
self.create_warmup_batch(
|
||||||
|
*greedy_shape,
|
||||||
// send batches with default params to warm up Greedy search
|
&mut batch_counter,
|
||||||
let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len());
|
&mut request_counter,
|
||||||
for batch_size in &prefill_batch_sizes {
|
max_input_length,
|
||||||
greedy_shapes.push((*batch_size, seq_bucket_size.clone()));
|
max_total_tokens,
|
||||||
}
|
seq_bucket_size,
|
||||||
for greedy_shape in greedy_shapes.iter() {
|
true,
|
||||||
let batches: Vec<Batch> = vec![
|
None,
|
||||||
self.create_warmup_batch(
|
)
|
||||||
*greedy_shape,
|
];
|
||||||
&mut batch_counter,
|
let request = tonic::Request::new(WarmupRequest {
|
||||||
&mut request_counter,
|
batches,
|
||||||
max_input_length,
|
max_input_length,
|
||||||
|
max_prefill_tokens,
|
||||||
max_total_tokens,
|
max_total_tokens,
|
||||||
seq_bucket_size,
|
}).inject_context();
|
||||||
true,
|
let _response = self.stub.warmup(request).await?.into_inner();
|
||||||
None,
|
}
|
||||||
)
|
Ok(None) // No support for maximum total tokens
|
||||||
];
|
|
||||||
let request = tonic::Request::new(WarmupRequest {
|
|
||||||
batches,
|
|
||||||
max_input_length,
|
|
||||||
max_prefill_tokens,
|
|
||||||
max_total_tokens,
|
|
||||||
}).inject_context();
|
|
||||||
let _response = self.stub.warmup(request).await?.into_inner();
|
|
||||||
}
|
}
|
||||||
Ok(None) // No support for maximum total tokens
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
|
@@ -100,6 +100,7 @@ impl ShardedClient {
         max_prefill_tokens: u32,
         max_total_tokens: u32,
         max_batch_size: Option<usize>,
+        model_id: &str,
     ) -> Result<Option<u32>> {
         let futures: Vec<_> = self
             .clients
@@ -110,6 +111,7 @@ impl ShardedClient {
                     max_prefill_tokens,
                     max_total_tokens,
                     max_batch_size,
+                    model_id
                 ))
             })
             .collect();
@@ -2,30 +2,9 @@
 //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples
 
 use opentelemetry::global;
-use opentelemetry::propagation::{Extractor, Injector};
+use opentelemetry::propagation::Injector;
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 
-/// Extract context metadata from a gRPC request's metadata
-struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap);
-
-impl<'a> Extractor for MetadataExtractor<'a> {
-    /// Get a value for a key from the MetadataMap. If the value can't be converted to &str, returns None
-    fn get(&self, key: &str) -> Option<&str> {
-        self.0.get(key).and_then(|metadata| metadata.to_str().ok())
-    }
-
-    /// Collect all the keys from the MetadataMap.
-    fn keys(&self) -> Vec<&str> {
-        self.0
-            .keys()
-            .map(|key| match key {
-                tonic::metadata::KeyRef::Ascii(v) => v.as_str(),
-                tonic::metadata::KeyRef::Binary(v) => v.as_str(),
-            })
-            .collect::<Vec<_>>()
-    }
-}
-
 /// Inject context in the metadata of a gRPC request.
 struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap);
 
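An illustrative aside, not from the diff: roughly how the retained `MetadataInjector` gets used to propagate the current tracing span into outgoing gRPC metadata. This is a sketch that assumes `MetadataInjector` (kept in the file above) is in scope; the `.context()` extension comes from `tracing_opentelemetry::OpenTelemetrySpanExt`.

use opentelemetry::global;
use tracing_opentelemetry::OpenTelemetrySpanExt;

fn inject(metadata: &mut tonic::metadata::MetadataMap) {
    // Write the active span's context into the request metadata so the
    // server side can pick it up and continue the same trace.
    global::get_text_map_propagator(|propagator| {
        propagator.inject_context(
            &tracing::Span::current().context(),
            &mut MetadataInjector(metadata),
        )
    });
}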
@@ -100,7 +100,6 @@ impl LlavaNext {
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
 pub struct ClipVisionModel {
     image_size: usize,
@@ -108,7 +107,6 @@ pub struct ClipVisionModel {
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
-#[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
 pub struct Idefics2 {}
 
@@ -118,6 +116,24 @@ impl Idefics2 {
     }
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct PaliTextConfig {
+    num_image_tokens: usize,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct Paligemma {
+    text_config: PaliTextConfig,
+}
+
+impl Paligemma {
+    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
+        self.text_config.num_image_tokens
+    }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
@@ -132,12 +148,15 @@ pub enum Config {
     Santacoder,
     Bloom,
     Mpt,
+    Gpt2,
     GptNeox,
     Phi,
     #[serde(rename = "phi-msft")]
     PhiMsft,
+    Phi3,
     Llama,
     Baichuan,
+    Paligemma(Paligemma),
     Gemma,
     Cohere,
     Drbx,
@@ -4,7 +4,7 @@
 use crate::validation::{Validation, ValidationError};
 use crate::{
     ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse,
-    HubTokenizerConfig, Message, PrefillToken, Queue, Token,
+    HubTokenizerConfig, Message, MessageChunk, PrefillToken, Queue, Text, TextMessage, Token,
 };
 use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools};
 use futures::future::try_join_all;
@@ -373,16 +373,15 @@ impl ChatTemplate {
         if self.use_default_tool_template {
             if let Some(last_message) = messages.last_mut() {
                 if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
-                    last_message.content = Some(format!(
-                        "{}\n---\n{}\n{}",
-                        last_message.content.as_deref().unwrap_or_default(),
-                        tool_prompt,
-                        tools
-                    ));
+                    last_message.content.push(MessageChunk::Text(Text {
+                        text: format!("\n---\n{}\n{}", tool_prompt, tools),
+                    }));
                 }
             }
         }
 
+        let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
+
         self.template
             .render(ChatTemplateInputs {
                 messages,
@@ -950,8 +949,7 @@ impl InferError {
 #[cfg(test)]
 mod tests {
     use crate::infer::raise_exception;
-    use crate::ChatTemplateInputs;
-    use crate::Message;
+    use crate::{ChatTemplateInputs, TextMessage};
     use minijinja::Environment;
 
     #[test]
@ -985,29 +983,21 @@ mod tests {
|
|||||||
|
|
||||||
let chat_template_inputs = ChatTemplateInputs {
|
let chat_template_inputs = ChatTemplateInputs {
|
||||||
messages: vec![
|
messages: vec![
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hi!".to_string()),
|
content: "Hi!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("Hello how can I help?".to_string()),
|
content: "Hello how can I help?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("What is Deep Learning?".to_string()),
|
content: "What is Deep Learning?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("magic!".to_string()),
|
content: "magic!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
bos_token: Some("[BOS]"),
|
bos_token: Some("[BOS]"),
|
||||||
@ -1055,35 +1045,25 @@ mod tests {
|
|||||||
|
|
||||||
let chat_template_inputs = ChatTemplateInputs {
|
let chat_template_inputs = ChatTemplateInputs {
|
||||||
messages: vec![
|
messages: vec![
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hi!".to_string()),
|
content: "Hi!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hi again!".to_string()),
|
content: "Hi again!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("Hello how can I help?".to_string()),
|
content: "Hello how can I help?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("What is Deep Learning?".to_string()),
|
content: "What is Deep Learning?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("magic!".to_string()),
|
content: "magic!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
bos_token: Some("[BOS]"),
|
bos_token: Some("[BOS]"),
|
||||||
@ -1136,29 +1116,21 @@ mod tests {
|
|||||||
|
|
||||||
let chat_template_inputs = ChatTemplateInputs {
|
let chat_template_inputs = ChatTemplateInputs {
|
||||||
messages: vec![
|
messages: vec![
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hi!".to_string()),
|
content: "Hi!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("Hello how can I help?".to_string()),
|
content: "Hello how can I help?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("What is Deep Learning?".to_string()),
|
content: "What is Deep Learning?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("magic!".to_string()),
|
content: "magic!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
bos_token: Some("[BOS]"),
|
bos_token: Some("[BOS]"),
|
||||||
@ -1195,29 +1167,21 @@ mod tests {
|
|||||||
|
|
||||||
let chat_template_inputs = ChatTemplateInputs {
|
let chat_template_inputs = ChatTemplateInputs {
|
||||||
messages: vec![
|
messages: vec![
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hi!".to_string()),
|
content: "Hi!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("Hello how can I help?".to_string()),
|
content: "Hello how can I help?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("What is Deep Learning?".to_string()),
|
content: "What is Deep Learning?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("magic!".to_string()),
|
content: "magic!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
bos_token: Some("[BOS]"),
|
bos_token: Some("[BOS]"),
|
||||||
@ -1240,34 +1204,24 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_many_chat_templates() {
|
fn test_many_chat_templates() {
|
||||||
let example_chat = vec![
|
let example_chat = vec![
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("Hello, how are you?".to_string()),
|
content: "Hello, how are you?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content: Some("I'm doing great. How can I help you today?".to_string()),
|
content: "I'm doing great. How can I help you today?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("I'd like to show off how chat templating works!".to_string()),
|
content: "I'd like to show off how chat templating works!".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
];
|
];
|
||||||
|
|
||||||
let example_chat_with_system = vec![Message {
|
let example_chat_with_system = [TextMessage {
|
||||||
role: "system".to_string(),
|
role: "system".to_string(),
|
||||||
content: Some(
|
content: "You are a friendly chatbot who always responds in the style of a pirate"
|
||||||
"You are a friendly chatbot who always responds in the style of a pirate"
|
.to_string(),
|
||||||
.to_string(),
|
|
||||||
),
|
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
}]
|
}]
|
||||||
.iter()
|
.iter()
|
||||||
.chain(&example_chat)
|
.chain(&example_chat)
|
||||||
@@ -1384,7 +1338,7 @@ mod tests {
         {
             let mut env = Environment::new();
             env.add_function("raise_exception", raise_exception);
-            let tmpl = env.template_from_str(&chat_template);
+            let tmpl = env.template_from_str(chat_template);
             let result = tmpl.unwrap().render(input).unwrap();
             assert_eq!(result, target);
         }
@ -1407,17 +1361,13 @@ mod tests {
|
|||||||
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
|
chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
|
||||||
input: ChatTemplateInputs {
|
input: ChatTemplateInputs {
|
||||||
messages: vec![
|
messages: vec![
|
||||||
Message {
|
TextMessage{
|
||||||
role: "system".to_string(),
|
role: "system".to_string(),
|
||||||
content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()),
|
content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
Message {
|
TextMessage{
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some("How many helicopters can a human eat in one sitting?".to_string()),
|
content: "How many helicopters can a human eat in one sitting?".to_string(),
|
||||||
name: None,
|
|
||||||
tool_calls: None,
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
add_generation_prompt: true,
|
add_generation_prompt: true,
|
||||||
|
@@ -11,6 +11,7 @@ use queue::{Entry, Queue};
 use serde::{Deserialize, Serialize};
 use tokio::sync::OwnedSemaphorePermit;
 use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
 
@@ -159,6 +160,8 @@ pub struct Info {
     #[schema(example = "32")]
     pub max_client_batch_size: usize,
     /// Router Info
+    #[schema(example = "text-generation-router")]
+    pub router: &'static str,
     #[schema(example = "0.5.0")]
     pub version: &'static str,
     #[schema(nullable = true, example = "null")]
@@ -399,6 +402,11 @@ pub struct CompletionRequest {
     #[serde(default)]
     #[schema(example = "1.0")]
     pub frequency_penalty: Option<f32>,
+
+    /// Up to 4 sequences where the API will stop generating further tokens.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub stop: Option<Vec<String>>,
 }
 
 #[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
@@ -438,7 +446,7 @@ pub(crate) struct ChatCompletion {
 #[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionComplete {
     pub index: u32,
-    pub message: Message,
+    pub message: OutputMessage,
     pub logprobs: Option<ChatCompletionLogprobs>,
     pub finish_reason: String,
 }
@@ -531,6 +539,30 @@ impl ChatCompletion {
         return_logprobs: bool,
         tool_calls: Option<Vec<ToolCall>>,
     ) -> Self {
+        let message = match (output, tool_calls) {
+            (Some(content), None) => OutputMessage::ChatMessage(TextMessage {
+                role: "assistant".into(),
+                content,
+            }),
+            (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage {
+                role: "assistant".to_string(),
+                tool_calls,
+            }),
+            (Some(output), Some(_)) => {
+                warn!("Received both chat and tool call");
+                OutputMessage::ChatMessage(TextMessage {
+                    role: "assistant".into(),
+                    content: output,
+                })
+            }
+            (None, None) => {
+                warn!("Didn't receive an answer");
+                OutputMessage::ChatMessage(TextMessage {
+                    role: "assistant".into(),
+                    content: "".to_string(),
+                })
+            }
+        };
         Self {
             id: String::new(),
             object: "text_completion".into(),
@@ -539,12 +571,7 @@ impl ChatCompletion {
             system_fingerprint,
             choices: vec![ChatCompletionComplete {
                 index: 0,
-                message: Message {
-                    role: "assistant".into(),
-                    content: output,
-                    name: None,
-                    tool_calls,
-                },
+                message,
                 logprobs: return_logprobs
                     .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))),
                 finish_reason: details.finish_reason.to_string(),
@@ -566,7 +593,8 @@ pub(crate) struct CompletionCompleteChunk {
     pub model: String,
     pub system_fingerprint: String,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema)]
+
+#[derive(Clone, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChunk {
     pub id: String,
     pub object: String,
@@ -578,7 +606,7 @@ pub(crate) struct ChatCompletionChunk {
     pub choices: Vec<ChatCompletionChoice>,
 }
 
-#[derive(Clone, Deserialize, Serialize, ToSchema)]
+#[derive(Clone, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChoice {
     pub index: u32,
     pub delta: ChatCompletionDelta,
@ -586,21 +614,21 @@ pub(crate) struct ChatCompletionChoice {
|
|||||||
pub finish_reason: Option<String>,
|
pub finish_reason: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
pub(crate) struct ChatCompletionDelta {
|
pub struct ToolCallDelta {
|
||||||
#[schema(example = "user")]
|
#[schema(example = "assistant")]
|
||||||
// TODO Modify this to a true enum.
|
role: String,
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
tool_calls: DeltaToolCall,
|
||||||
pub role: Option<String>,
|
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
||||||
#[schema(example = "What is Deep Learning?")]
|
|
||||||
pub content: Option<String>,
|
|
||||||
// default to None
|
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
||||||
pub tool_calls: Option<DeltaToolCall>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
|
#[derive(Clone, Debug, Serialize, ToSchema)]
|
||||||
|
#[serde(untagged)]
|
||||||
|
enum ChatCompletionDelta {
|
||||||
|
Chat(TextMessage),
|
||||||
|
Tool(ToolCallDelta),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
|
||||||
pub(crate) struct DeltaToolCall {
|
pub(crate) struct DeltaToolCall {
|
||||||
pub index: u32,
|
pub index: u32,
|
||||||
pub id: String,
|
pub id: String,
|
||||||
@@ -608,7 +636,7 @@ pub(crate) struct DeltaToolCall {
     pub function: Function,
 }
 
-#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
 pub(crate) struct Function {
     pub name: Option<String>,
     pub arguments: String,
@ -626,15 +654,13 @@ impl ChatCompletionChunk {
|
|||||||
finish_reason: Option<String>,
|
finish_reason: Option<String>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let delta = match (delta, tool_calls) {
|
let delta = match (delta, tool_calls) {
|
||||||
(Some(delta), _) => ChatCompletionDelta {
|
(Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
|
||||||
role: Some("assistant".to_string()),
|
role: "assistant".to_string(),
|
||||||
content: Some(delta),
|
content: delta,
|
||||||
tool_calls: None,
|
}),
|
||||||
},
|
(None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
|
||||||
(None, Some(tool_calls)) => ChatCompletionDelta {
|
role: "assistant".to_string(),
|
||||||
role: Some("assistant".to_string()),
|
tool_calls: DeltaToolCall {
|
||||||
content: None,
|
|
||||||
tool_calls: Some(DeltaToolCall {
|
|
||||||
index: 0,
|
index: 0,
|
||||||
id: String::new(),
|
id: String::new(),
|
||||||
r#type: "function".to_string(),
|
r#type: "function".to_string(),
|
||||||
@ -642,13 +668,12 @@ impl ChatCompletionChunk {
|
|||||||
name: None,
|
name: None,
|
||||||
arguments: tool_calls[0].to_string(),
|
arguments: tool_calls[0].to_string(),
|
||||||
},
|
},
|
||||||
}),
|
},
|
||||||
},
|
}),
|
||||||
(None, None) => ChatCompletionDelta {
|
(None, None) => ChatCompletionDelta::Chat(TextMessage {
|
||||||
role: None,
|
role: "assistant".to_string(),
|
||||||
content: None,
|
content: "".to_string(),
|
||||||
tool_calls: None,
|
}),
|
||||||
},
|
|
||||||
};
|
};
|
||||||
Self {
|
Self {
|
||||||
id: String::new(),
|
id: String::new(),
|
||||||
@@ -849,7 +874,7 @@ where
     state.end()
 }
 
-#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default)]
+#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)]
 pub(crate) struct FunctionDefinition {
     #[serde(default)]
     pub description: Option<String>,
@@ -869,7 +894,7 @@ pub(crate) struct Tool {
 
 #[derive(Clone, Serialize, Deserialize, Default)]
 pub(crate) struct ChatTemplateInputs<'a> {
-    messages: Vec<Message>,
+    messages: Vec<TextMessage>,
     bos_token: Option<&'a str>,
     eos_token: Option<&'a str>,
     add_generation_prompt: bool,
@ -877,88 +902,113 @@ pub(crate) struct ChatTemplateInputs<'a> {
|
|||||||
tools_prompt: Option<&'a str>,
|
tools_prompt: Option<&'a str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
|
#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
|
||||||
pub(crate) struct ToolCall {
|
pub(crate) struct ToolCall {
|
||||||
pub id: u32,
|
pub id: String,
|
||||||
pub r#type: String,
|
pub r#type: String,
|
||||||
pub function: FunctionDefinition,
|
pub function: FunctionDefinition,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
pub(crate) struct Text {
|
struct Url {
|
||||||
#[serde(default)]
|
url: String,
|
||||||
pub text: String,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
pub(crate) struct ImageUrl {
|
struct ImageUrl {
|
||||||
#[serde(default)]
|
image_url: Url,
|
||||||
pub url: String,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
pub(crate) struct Content {
|
struct Text {
|
||||||
pub r#type: String,
|
text: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
|
#[serde(tag = "type")]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
enum MessageChunk {
|
||||||
|
Text(Text),
|
||||||
|
ImageUrl(ImageUrl),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
|
pub struct Message {
|
||||||
|
#[schema(example = "user")]
|
||||||
|
role: String,
|
||||||
|
#[schema(example = "My name is David and I")]
|
||||||
|
#[serde(deserialize_with = "message_content_serde::deserialize")]
|
||||||
|
content: Vec<MessageChunk>,
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||||
pub text: Option<String>,
|
#[schema(example = "\"David\"")]
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
name: Option<String>,
|
||||||
pub image_url: Option<ImageUrl>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mod message_content_serde {
|
mod message_content_serde {
|
||||||
use super::*;
|
use super::*;
|
||||||
use serde::de;
|
use serde::{Deserialize, Deserializer};
|
||||||
use serde::Deserializer;
|
|
||||||
use serde_json::Value;
|
|
||||||
|
|
||||||
pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
|
pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<MessageChunk>, D::Error>
|
||||||
where
|
where
|
||||||
D: Deserializer<'de>,
|
D: Deserializer<'de>,
|
||||||
{
|
{
|
||||||
let value = Value::deserialize(deserializer)?;
|
#[derive(Deserialize)]
|
||||||
match value {
|
#[serde(untagged)]
|
||||||
Value::String(s) => Ok(Some(s)),
|
enum Message {
|
||||||
Value::Array(arr) => {
|
Text(String),
|
||||||
let results: Result<Vec<String>, _> = arr
|
Chunks(Vec<MessageChunk>),
|
||||||
.into_iter()
|
}
|
||||||
.map(|v| {
|
let message: Message = Deserialize::deserialize(deserializer)?;
|
||||||
let content: Content =
|
let chunks = match message {
|
||||||
serde_json::from_value(v).map_err(de::Error::custom)?;
|
Message::Text(text) => {
|
||||||
match content.r#type.as_str() {
|
vec![MessageChunk::Text(Text { text })]
|
||||||
"text" => Ok(content.text.unwrap_or_default()),
|
|
||||||
"image_url" => {
|
|
||||||
if let Some(url) = content.image_url {
|
|
||||||
Ok(format!("", url.url))
|
|
||||||
} else {
|
|
||||||
Ok(String::new())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => Err(de::Error::custom("invalid content type")),
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
results.map(|strings| Some(strings.join("")))
|
|
||||||
}
|
}
|
||||||
Value::Null => Ok(None),
|
Message::Chunks(s) => s,
|
||||||
_ => Err(de::Error::custom("invalid token format")),
|
};
|
||||||
|
Ok(chunks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
|
pub struct TextMessage {
|
||||||
|
#[schema(example = "user")]
|
||||||
|
pub role: String,
|
||||||
|
#[schema(example = "My name is David and I")]
|
||||||
|
pub content: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<Message> for TextMessage {
|
||||||
|
fn from(value: Message) -> Self {
|
||||||
|
TextMessage {
|
||||||
|
role: value.role,
|
||||||
|
content: value
|
||||||
|
.content
|
||||||
|
.into_iter()
|
||||||
|
.map(|c| match c {
|
||||||
|
MessageChunk::Text(Text { text }) => text,
|
||||||
|
MessageChunk::ImageUrl(image) => {
|
||||||
|
let url = image.image_url.url;
|
||||||
|
format!("")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(""),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug)]
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
pub(crate) struct Message {
|
pub struct ToolCallMessage {
|
||||||
#[schema(example = "user")]
|
#[schema(example = "assistant")]
|
||||||
pub role: String,
|
role: String,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
tool_calls: Vec<ToolCall>,
|
||||||
#[schema(example = "My name is David and I")]
|
}
|
||||||
#[serde(deserialize_with = "message_content_serde::deserialize")]
|
|
||||||
pub content: Option<String>,
|
#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
#[serde(untagged)]
|
||||||
#[schema(example = "\"David\"")]
|
pub(crate) enum OutputMessage {
|
||||||
pub name: Option<String>,
|
ChatMessage(TextMessage),
|
||||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
ToolCall(ToolCallMessage),
|
||||||
pub tool_calls: Option<Vec<ToolCall>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, ToSchema)]
|
#[derive(Clone, Debug, Deserialize, ToSchema)]
|
||||||
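An illustrative aside, not part of the commit: the string-or-chunks acceptance that `message_content_serde` implements above can be reproduced in isolation with an untagged serde enum. The type names below mirror the diff, but the snippet is self-contained and simplified (struct variants instead of newtype wrappers).

use serde::Deserialize;

#[derive(Deserialize, Debug, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
enum MessageChunk {
    Text { text: String },
    ImageUrl { image_url: Url },
}

#[derive(Deserialize, Debug, PartialEq)]
struct Url {
    url: String,
}

#[derive(Deserialize, Debug, PartialEq)]
#[serde(untagged)]
enum Content {
    Text(String),
    Chunks(Vec<MessageChunk>),
}

fn main() {
    // OpenAI-style plain string content...
    let a: Content = serde_json::from_str(r#""What is Deep Learning?""#).unwrap();
    assert_eq!(a, Content::Text("What is Deep Learning?".to_string()));

    // ...and the multi-modal chunk list both deserialize into the same type.
    let b: Content =
        serde_json::from_str(r#"[{"type": "text", "text": "Whats in this image?"}]"#).unwrap();
    assert_eq!(
        b,
        Content::Chunks(vec![MessageChunk::Text {
            text: "Whats in this image?".to_string()
        }])
    );
}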
@@ -1121,7 +1171,7 @@ pub(crate) struct ErrorResponse {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use serde_json::json;
     use tokenizers::Tokenizer;

     pub(crate) async fn get_tokenizer() -> Tokenizer {
@@ -1189,4 +1239,100 @@ mod tests {
        );
        assert_eq!(config.eos_token, Some("<|end▁of▁sentence|>".to_string()));
    }
+
+    #[test]
+    fn test_chat_simple_string() {
+        let json = json!({
+            "model": "",
+            "messages": [{
+                "role": "user",
+                "content": "What is Deep Learning?"
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert_eq!(
+            request.messages[0],
+            Message {
+                role: "user".to_string(),
+                content: vec![MessageChunk::Text(Text {
+                    text: "What is Deep Learning?".to_string()
+                }),],
+                name: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_chat_request() {
+        let json = json!({
+            "model": "",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Whats in this image?"},
+                    {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}},
+                ]
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert_eq!(
+            request.messages[0],
+            Message {
+                role: "user".to_string(),
+                content: vec![
+                    MessageChunk::Text(Text { text: "Whats in this image?".to_string() }),
+                    MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } })
+                ],
+                name: None
+            }
+        );
+    }
+
+    #[test]
+    fn text_message_convert() {
+        let message = Message {
+            role: "user".to_string(),
+            content: vec![
+                MessageChunk::Text(Text { text: "Whats in this image?".to_string() }),
+                MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } })
+            ],
+            name: None
+        };
+        let textmsg: TextMessage = message.into();
+        assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)");
+    }
+    #[test]
+    fn openai_output() {
+        let message = OutputMessage::ChatMessage(TextMessage {
+            role: "assistant".to_string(),
+            content: "This is the answer".to_string(),
+        });
+        let serialized = serde_json::to_string(&message).unwrap();
+        assert_eq!(
+            serialized,
+            r#"{"role":"assistant","content":"This is the answer"}"#
+        );
+
+        let message = OutputMessage::ToolCall(ToolCallMessage {
+            role: "assistant".to_string(),
+            tool_calls: vec![ToolCall {
+                id: "0".to_string(),
+                r#type: "function".to_string(),
+                function: FunctionDefinition {
+                    description: None,
+                    name: "myfn".to_string(),
+                    arguments: json!({
+                        "format": "csv"
+                    }),
+                },
+            }],
+        });
+        let serialized = serde_json::to_string(&message).unwrap();
+        assert_eq!(
+            serialized,
+            r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"#
+        );
+    }
 }
@@ -23,7 +23,7 @@ use tokenizers::Tokenizer;
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::{EnvFilter, Layer};
+use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};

 /// App Configuration
 #[derive(Parser, Debug)]
@@ -349,6 +349,7 @@ async fn main() -> Result<(), RouterError> {
            max_batch_prefill_tokens,
            max_total_tokens as u32,
            max_batch_size,
+            &model_info.model_id
        )
        .await
        .map_err(RouterError::Warmup)?
@@ -482,8 +483,21 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
    }

    // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override to avoid simple logs to be spammed with tokio level informations
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };

    tracing_subscriber::registry()
        .with(env_filter)
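The rewritten init_logging above expands a bare LOG_LEVEL value (warn/info/debug) into per-crate directives so that raising verbosity does not also flood the output with tokio/hyper internals, while any full directive string is passed through unchanged. A small standalone sketch of the same EnvFilter pattern, assuming the tracing-subscriber 0.3 crate with its env-filter feature (the crate the router already uses):

// Sketch of the LOG_LEVEL rewriting in isolation; not the router's exact
// function, just the same EnvFilter::builder()/parse_lossy pattern.
use tracing_subscriber::filter::{EnvFilter, LevelFilter};

fn env_filter_from(log_level: &str) -> EnvFilter {
    // A bare level becomes crate-scoped directives; dependency noise stays at
    // the default INFO level.
    let directives = match log_level {
        "warn" => "text_generation_launcher=warn,text_generation_router=warn",
        "info" => "text_generation_launcher=info,text_generation_router=info",
        "debug" => "text_generation_launcher=debug,text_generation_router=debug",
        other => other, // full directive strings pass through untouched
    };
    EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .parse_lossy(directives)
}

fn main() {
    // e.g. LOG_LEVEL=debug turns into crate-scoped debug logging only.
    let filter = env_filter_from("debug");
    println!("{filter}");
}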
@@ -599,9 +599,22 @@ async fn completions(
    let span = tracing::Span::current();
    metrics::increment_counter!("tgi_request_count");

-    let stream = req.stream;
-    let max_new_tokens = req.max_tokens.or(Some(100));
-    let seed = req.seed;
+    let CompletionRequest {
+        max_tokens,
+        seed,
+        stop,
+        stream,
+        temperature,
+        ..
+    } = req;
+
+    let max_new_tokens = max_tokens.or(Some(100));
+    let stop = stop.unwrap_or_default();
+    // enable greedy only when temperature is 0
+    let (do_sample, temperature) = match temperature {
+        Some(temperature) if temperature == 0.0 => (false, None),
+        other => (true, other),
+    };

    // if suffix is present throw an error
    if req.suffix.is_some() {
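Besides destructuring CompletionRequest, the hunk above changes sampling behavior for /v1/completions: do_sample used to be hard-coded to true, whereas now a temperature of exactly 0.0 is mapped to greedy decoding and the temperature is dropped. A tiny worked example of that mapping, reproduced standalone:

// Same logic as the hunk above, pulled out so the edge cases are explicit.
fn map_temperature(temperature: Option<f32>) -> (bool, Option<f32>) {
    match temperature {
        Some(temperature) if temperature == 0.0 => (false, None), // greedy
        other => (true, other),                                   // sampling
    }
}

fn main() {
    assert_eq!(map_temperature(Some(0.0)), (false, None));
    assert_eq!(map_temperature(Some(0.7)), (true, Some(0.7)));
    assert_eq!(map_temperature(None), (true, None));
    println!("temperature mapping behaves as expected");
}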
@@ -637,16 +650,16 @@ async fn completions(
            inputs: prompt.to_string(),
            parameters: GenerateParameters {
                best_of: None,
-                temperature: req.temperature,
+                temperature,
                repetition_penalty: req.repetition_penalty,
                frequency_penalty: req.frequency_penalty,
                top_k: None,
                top_p: req.top_p,
                typical_p: None,
-                do_sample: true,
+                do_sample,
                max_new_tokens,
                return_full_text: None,
-                stop: Vec::new(),
+                stop: stop.clone(),
                truncate: None,
                watermark: false,
                details: true,
@@ -698,7 +711,7 @@ async fn completions(
                model: model_id.clone(),
                system_fingerprint: system_fingerprint.clone(),
            })
-            .map_or_else(|_e| Event::default(), |data| data)
+            .unwrap_or_else(|_e| Event::default())
        };

        let (header_tx, header_rx) = oneshot::channel();
@@ -990,7 +1003,6 @@ async fn chat_completions(
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::increment_counter!("tgi_request_count");
-
    let ChatRequest {
        logprobs,
        max_tokens,
@@ -1124,13 +1136,10 @@ async fn chat_completions(
                    logprobs,
                    stream_token.details.map(|d| d.finish_reason.to_string()),
                ))
-                .map_or_else(
-                    |e| {
-                        println!("Failed to serialize ChatCompletionChunk: {:?}", e);
-                        Event::default()
-                    },
-                    |data| data,
-                )
+                .unwrap_or_else(|e| {
+                    println!("Failed to serialize ChatCompletionChunk: {:?}", e);
+                    Event::default()
+                })
            };

            let (headers, response_stream) = generate_stream_internal(
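This is the same simplification as in the completions handler above: when the success arm of map_or_else is just the identity, Result::unwrap_or_else(err_fn) yields the same value, so behavior is unchanged. A minimal illustration on a plain Result (the router's actual values are SSE Events, elided here):

// map_or_else(f, identity) and unwrap_or_else(f) are interchangeable.
fn main() {
    let ok: Result<String, String> = Ok("data".to_string());
    let err: Result<String, String> = Err("boom".to_string());

    let via_map_or_else = ok.clone().map_or_else(|_e| String::default(), |data| data);
    let via_unwrap_or_else = ok.unwrap_or_else(|_e| String::default());
    assert_eq!(via_map_or_else, via_unwrap_or_else);

    // On the error path both produce the fallback value.
    assert_eq!(err.unwrap_or_else(|_e| String::default()), String::default());
    println!("map_or_else(f, identity) == unwrap_or_else(f)");
}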
@@ -1165,7 +1174,7 @@ async fn chat_completions(
                )
            })?;
            let tool_calls = vec![ToolCall {
-                id: 0,
+                id: "0".to_string(),
                r#type: "function".to_string(),
                function: FunctionDefinition {
                    description: None,
@@ -1568,6 +1577,7 @@ pub async fn run(
        max_batch_size,
        validation_workers,
        max_client_batch_size,
+        router: env!("CARGO_PKG_NAME"),
        version: env!("CARGO_PKG_VERSION"),
        sha: option_env!("VERGEN_GIT_SHA"),
        docker_label: option_env!("DOCKER_LABEL"),
@@ -565,6 +565,30 @@ fn prepare_input(
            inputs = modified_inputs;
            tokenizer_query
        }
+        Some(Config::Paligemma(config)) => {
+            let mut modified_inputs = String::with_capacity(inputs.len());
+            let mut tokenizer_query = String::with_capacity(inputs.len());
+            let mut start = 0;
+            for chunk in RE.find_iter(&inputs) {
+                let chunk_start = chunk.start();
+                let chunk_end = chunk.end();
+                if chunk_start != start {
+                    modified_inputs.push_str(&inputs[start..chunk_start]);
+                    tokenizer_query.push_str(&inputs[start..chunk_start]);
+                }
+                let (image_uri, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
+                let slots = config.get_number_of_features(height, width);
+                tokenizer_query.push_str(&"<image>".repeat(slots));
+                modified_inputs.push_str(&image_uri);
+                start = chunk_end;
+            }
+            if start != inputs.len() - 1 {
+                modified_inputs.push_str(&inputs[start..]);
+                tokenizer_query.push_str(&inputs[start..]);
+            }
+            inputs = modified_inputs;
+            tokenizer_query
+        }
        Some(Config::Idefics2(config)) => {
            let mut modified_inputs = String::with_capacity(inputs.len());
            let mut tokenizer_query = String::with_capacity(inputs.len());
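The added Paligemma arm follows the pattern of the neighboring Idefics2 branch: every image reference matched by the shared RE regex is fetched, the model-side input keeps the image URI, and the tokenizer-side query gets "<image>" repeated once per feature slot reported by the config. Below is a sketch of just the expansion step, with stand-ins for the pieces defined elsewhere in the router (the regex pattern, fetch_image, and get_number_of_features are assumptions here, not reproductions):

// Sketch of the token-expansion step only.
use regex::Regex;

fn expand_images(inputs: &str, slots_per_image: usize) -> (String, String) {
    // Stand-in for the router's shared RE matching markdown image references.
    let re = Regex::new(r"!\[\]\([^)]*\)").unwrap();
    let mut modified_inputs = String::with_capacity(inputs.len());
    let mut tokenizer_query = String::with_capacity(inputs.len());
    let mut start = 0;
    for chunk in re.find_iter(inputs) {
        // Copy the text between images verbatim to both outputs.
        modified_inputs.push_str(&inputs[start..chunk.start()]);
        tokenizer_query.push_str(&inputs[start..chunk.start()]);
        // Keep the image reference for the model input, but give the tokenizer
        // one placeholder token per image feature slot.
        modified_inputs.push_str(chunk.as_str());
        tokenizer_query.push_str(&"<image>".repeat(slots_per_image));
        start = chunk.end();
    }
    modified_inputs.push_str(&inputs[start..]);
    tokenizer_query.push_str(&inputs[start..]);
    (modified_inputs, tokenizer_query)
}

fn main() {
    let (inputs, query) = expand_images("caption this ![](https://example.com/a.png) please", 3);
    println!("model input:      {inputs}");
    println!("tokenizer query:  {query}");
}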
@@ -1,6 +1,5 @@
 [toolchain]
-# Released on: 28 December, 2023
-# Branched from master on: 10 November, 2023
-# https://releases.rs/docs/1.75.0/
-channel = "1.75.0"
+# Released on: 02 May, 2024
+# https://releases.rs/docs/1.78.0/
+channel = "1.78.0"
 components = ["rustfmt", "clippy"]
@@ -1,5 +1,5 @@
 flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9
-flash_att_v2_commit_rocm := 8736558c287ff2ef28b24878e42828c595ac3e69
+flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6


 flash-attention-v2-cuda:
@@ -18,12 +18,12 @@ install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
 flash-attention-v2-rocm:
 	# Clone flash attention
 	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/fxmarty/flash-attention-rocm flash-attention-v2
+	git clone https://github.com/ROCm/flash-attention.git flash-attention-v2

 build-flash-attention-v2-rocm: flash-attention-v2-rocm
 	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
 	cd flash-attention-v2 && git submodule update --init --recursive
-	cd flash-attention-v2 && PYTORCH_ROCM_ARCH=gfx90a python setup.py build
+	cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

 install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
 	cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
@@ -14,11 +14,11 @@ install-vllm-cuda: build-vllm-cuda
 vllm-rocm:
 	# Clone vllm
 	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/fxmarty/vllm-public.git vllm
+	git clone https://github.com/fxmarty/rocm-vllm.git vllm

 build-vllm-rocm: vllm-rocm
-	cd vllm && git fetch && git checkout ad9b7c4095ef54419a0533d254f2ad84bd2dfcae
-	cd vllm && python setup.py build
+	cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
+	cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install

 install-vllm-rocm: build-vllm-rocm
 	pip uninstall vllm -y || true
@@ -10,8 +10,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
 }

 __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-  return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
-                    static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+  return _Float16_2{
+      _Float16_2{static_cast<_Float16>(1.0f),
+                 static_cast<_Float16>(1.0f)} / x.data};
 }

 #define hrcp __compat_hrcp
681
server/poetry.lock
generated
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

 [[package]]
 name = "accelerate"
@@ -194,13 +194,13 @@ files = [

 [[package]]
 name = "certifi"
-version = "2024.6.2"
+version = "2024.7.4"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"},
-    {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
+    {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
+    {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
 ]

 [[package]]
@@ -474,13 +474,13 @@ files = [

 [[package]]
 name = "exceptiongroup"
-version = "1.2.1"
+version = "1.2.2"
 description = "Backport of PEP 654 (exception groups)"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
-    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
+    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
+    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
 ]

 [package.extras]
@@ -628,17 +628,17 @@ tqdm = ["tqdm"]

 [[package]]
 name = "googleapis-common-protos"
-version = "1.63.1"
+version = "1.63.2"
 description = "Common protobufs used in Google APIs"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"},
-    {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"},
+    {file = "googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"},
+    {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"},
 ]

 [package.dependencies]
-protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
+protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"

 [package.extras]
 grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
@@ -942,13 +942,13 @@ files = [

 [[package]]
 name = "importlib-metadata"
-version = "7.2.1"
+version = "8.0.0"
 description = "Read metadata from Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "importlib_metadata-7.2.1-py3-none-any.whl", hash = "sha256:ffef94b0b66046dd8ea2d619b701fe978d9264d38f3998bc4c27ec3b146a87c8"},
-    {file = "importlib_metadata-7.2.1.tar.gz", hash = "sha256:509ecb2ab77071db5137c655e24ceb3eee66e7bbc6574165d0d114d9fc4bbe68"},
+    {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"},
+    {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"},
 ]

 [package.dependencies]
@@ -1025,13 +1025,13 @@ files = [

 [[package]]
 name = "jsonschema"
-version = "4.22.0"
+version = "4.23.0"
 description = "An implementation of JSON Schema validation for Python"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "jsonschema-4.22.0-py3-none-any.whl", hash = "sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"},
-    {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"},
+    {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"},
+    {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"},
 ]

 [package.dependencies]
@@ -1042,7 +1042,7 @@ rpds-py = ">=0.7.1"

 [package.extras]
 format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"]
-format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"]
+format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"]

 [[package]]
 name = "jsonschema-specifications"
@@ -1580,13 +1580,13 @@ files = [

 [[package]]
 name = "nvidia-nvjitlink-cu12"
-version = "12.5.40"
+version = "12.5.82"
 description = "Nvidia JIT LTO Library"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"},
-    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"},
+    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
+    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
 ]

 [[package]]
@@ -1756,24 +1756,24 @@ files = [

 [[package]]
 name = "optimum"
-version = "1.20.0"
+version = "1.21.2"
 description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality."
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "optimum-1.20.0-py3-none-any.whl", hash = "sha256:0c0d0746043c95e22cf3586946d7408d353f10c0486f1c7d2d11084a5cfc0ede"},
-    {file = "optimum-1.20.0.tar.gz", hash = "sha256:b64c7536fe738db9b56605105efe72006401ad2aa00cb499ae407f2e06f3043b"},
+    {file = "optimum-1.21.2-py3-none-any.whl", hash = "sha256:8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8"},
+    {file = "optimum-1.21.2.tar.gz", hash = "sha256:037e65d265237809fac69e9003215c60cf6de56e97c62ff7565abab4a94a64ce"},
 ]

 [package.dependencies]
 coloredlogs = "*"
 datasets = "*"
 huggingface-hub = ">=0.8.0"
-numpy = "*"
+numpy = "<2.0"
 packaging = "*"
 sympy = "*"
 torch = ">=1.11"
-transformers = {version = ">=4.26.0,<4.42.0", extras = ["sentencepiece"]}
+transformers = {version = ">=4.26.0,<4.43.0", extras = ["sentencepiece"]}

 [package.extras]
 amd = ["optimum-amd"]
@@ -1786,15 +1786,16 @@ exporters-gpu = ["onnx", "onnxruntime-gpu", "timm"]
 exporters-tf = ["h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26.0,<4.38.0)"]
 furiosa = ["optimum-furiosa"]
 graphcore = ["optimum-graphcore"]
-habana = ["optimum-habana", "transformers (>=4.38.0,<4.39.0)"]
-intel = ["optimum-intel (>=1.16.0)"]
-neural-compressor = ["optimum-intel[neural-compressor] (>=1.16.0)"]
+habana = ["optimum-habana", "transformers (>=4.40.0,<4.41.0)"]
+intel = ["optimum-intel (>=1.18.0)"]
+ipex = ["optimum-intel[ipex] (>=1.18.0)"]
+neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"]
 neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"]
 neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"]
-nncf = ["optimum-intel[nncf] (>=1.16.0)"]
+nncf = ["optimum-intel[nncf] (>=1.18.0)"]
 onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)"]
 onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)"]
-openvino = ["optimum-intel[openvino] (>=1.16.0)"]
+openvino = ["optimum-intel[openvino] (>=1.18.0)"]
 quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"]
 tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"]

@@ -1970,84 +1971,95 @@ test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameter

 [[package]]
 name = "pillow"
-version = "10.3.0"
+version = "10.4.0"
 description = "Python Imaging Library (Fork)"
 optional = false
 python-versions = ">=3.8"
 files = [
{file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"},
|
{file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"},
|
{file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"},
|
{file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"},
|
{file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"},
|
{file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"},
|
{file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"},
|
{file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"},
|
{file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"},
|
{file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"},
|
{file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"},
|
||||||
{file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"},
|
{file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"},
|
{file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"},
|
{file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"},
|
{file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"},
|
{file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"},
|
{file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"},
|
{file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"},
|
{file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"},
|
{file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"},
|
{file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"},
|
{file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"},
|
||||||
{file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"},
|
{file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"},
|
{file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"},
|
{file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"},
|
{file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"},
|
{file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"},
|
{file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"},
|
{file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"},
|
{file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"},
|
{file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"},
|
{file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"},
|
{file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"},
|
||||||
{file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"},
|
{file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"},
|
{file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"},
|
{file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"},
|
{file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"},
|
{file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"},
|
{file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"},
|
{file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"},
|
{file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"},
|
{file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"},
|
{file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"},
|
||||||
{file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"},
|
{file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"},
|
{file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"},
|
{file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"},
|
{file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"},
|
{file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"},
|
{file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"},
|
{file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"},
|
{file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"},
|
{file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"},
|
{file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"},
|
{file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"},
|
||||||
{file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"},
|
{file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"},
|
{file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"},
|
{file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"},
|
{file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"},
|
{file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"},
|
{file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"},
|
{file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"},
|
||||||
{file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"},
|
{file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"},
|
{file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"},
|
{file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"},
|
{file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"},
|
{file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"},
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"},
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"},
|
||||||
{file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"},
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"},
|
||||||
{file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"},
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"},
|
||||||
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"},
|
||||||
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"},
|
||||||
|
{file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"},
|
||||||
|
{file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"},
|
||||||
|
{file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"},
|
||||||
 ]

 [package.extras]
-docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
+docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"]
 fpx = ["olefile"]
 mic = ["olefile"]
 tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
@@ -2156,52 +2168,42 @@ files = [

 [[package]]
 name = "pyarrow"
-version = "16.1.0"
+version = "17.0.0"
 description = "Python library for Apache Arrow"
 optional = false
 python-versions = ">=3.8"
 files = [
{file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"},
|
{file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"},
|
{file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"},
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"},
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"},
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"},
|
{file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
|
||||||
{file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"},
|
{file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"},
|
{file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"},
|
{file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"},
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"},
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"},
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"},
|
{file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
|
||||||
{file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"},
|
{file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"},
|
{file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"},
|
{file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"},
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"},
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"},
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"},
|
{file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
|
||||||
{file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"},
|
{file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"},
|
{file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"},
|
{file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"},
|
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"},
|
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"},
|
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"},
|
|
||||||
{file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"},
|
|
||||||
{file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"},
|
|
||||||
{file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = ">=1.16.6"
|
numpy = ">=1.16.6"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyarrow-hotfix"
|
name = "pyarrow-hotfix"
|
||||||
version = "0.6"
|
version = "0.6"
|
||||||
@ -2215,109 +2217,119 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pydantic"
|
name = "pydantic"
|
||||||
version = "2.7.4"
|
version = "2.8.2"
|
||||||
description = "Data validation using Python type hints"
|
description = "Data validation using Python type hints"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"},
|
{file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"},
|
||||||
{file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"},
|
{file = "pydantic-2.8.2.tar.gz", hash = "sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
annotated-types = ">=0.4.0"
|
annotated-types = ">=0.4.0"
|
||||||
pydantic-core = "2.18.4"
|
pydantic-core = "2.20.1"
|
||||||
typing-extensions = ">=4.6.1"
|
typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""}
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
email = ["email-validator (>=2.0.0)"]
|
email = ["email-validator (>=2.0.0)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pydantic-core"
|
name = "pydantic-core"
|
||||||
version = "2.18.4"
|
version = "2.20.1"
|
||||||
description = "Core functionality for Pydantic validation and serialization"
|
description = "Core functionality for Pydantic validation and serialization"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"},
|
{file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"},
|
{file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = "sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"},
|
||||||
{file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"},
|
{file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"},
|
{file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"},
|
{file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"},
|
{file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = "sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"},
|
||||||
{file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"},
|
{file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"},
|
{file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"},
|
{file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"},
|
||||||
{file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"},
|
{file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"},
|
{file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"},
|
{file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"},
|
||||||
{file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"},
|
{file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"},
|
{file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"},
|
{file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"},
|
||||||
{file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"},
|
||||||
{file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"},
|
{file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"},
|
{file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"},
|
{file = "pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"},
|
||||||
{file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"},
|
||||||
{file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"},
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"},
|
||||||
|
{file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"},
|
||||||
|
{file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -2567,110 +2579,110 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rpds-py"
|
name = "rpds-py"
|
||||||
version = "0.18.1"
|
version = "0.19.0"
|
||||||
description = "Python bindings to Rust's persistent data structures (rpds)"
|
description = "Python bindings to Rust's persistent data structures (rpds)"
|
||||||
optional = true
|
optional = true
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"},
|
{file = "rpds_py-0.19.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:fb37bd599f031f1a6fb9e58ec62864ccf3ad549cf14bac527dbfa97123edcca4"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"},
|
{file = "rpds_py-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3384d278df99ec2c6acf701d067147320b864ef6727405d6470838476e44d9e8"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e54548e0be3ac117595408fd4ca0ac9278fde89829b0b518be92863b17ff67a2"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8eb488ef928cdbc05a27245e52de73c0d7c72a34240ef4d9893fdf65a8c1a955"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5da93debdfe27b2bfc69eefb592e1831d957b9535e0943a0ee8b97996de21b5"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79e205c70afddd41f6ee79a8656aec738492a550247a7af697d5bd1aee14f766"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:959179efb3e4a27610e8d54d667c02a9feaa86bbabaf63efa7faa4dfa780d4f1"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"},
|
{file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a6e605bb9edcf010f54f8b6a590dd23a4b40a8cb141255eec2a03db249bc915b"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"},
|
{file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9133d75dc119a61d1a0ded38fb9ba40a00ef41697cc07adb6ae098c875195a3f"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"},
|
{file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd36b712d35e757e28bf2f40a71e8f8a2d43c8b026d881aa0c617b450d6865c9"},
|
||||||
{file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"},
|
{file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354f3a91718489912f2e0fc331c24eaaf6a4565c080e00fbedb6015857c00582"},
|
||||||
{file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"},
|
{file = "rpds_py-0.19.0-cp310-none-win32.whl", hash = "sha256:ebcbf356bf5c51afc3290e491d3722b26aaf5b6af3c1c7f6a1b757828a46e336"},
|
||||||
{file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"},
|
{file = "rpds_py-0.19.0-cp310-none-win_amd64.whl", hash = "sha256:75a6076289b2df6c8ecb9d13ff79ae0cad1d5fb40af377a5021016d58cd691ec"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"},
|
{file = "rpds_py-0.19.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6d45080095e585f8c5097897313def60caa2046da202cdb17a01f147fb263b81"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"},
|
{file = "rpds_py-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5c9581019c96f865483d031691a5ff1cc455feb4d84fc6920a5ffc48a794d8a"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1540d807364c84516417115c38f0119dfec5ea5c0dd9a25332dea60b1d26fc4d"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e65489222b410f79711dc3d2d5003d2757e30874096b2008d50329ea4d0f88c"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da6f400eeb8c36f72ef6646ea530d6d175a4f77ff2ed8dfd6352842274c1d8b"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37f46bb11858717e0efa7893c0f7055c43b44c103e40e69442db5061cb26ed34"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:071d4adc734de562bd11d43bd134330fb6249769b2f66b9310dab7460f4bf714"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"},
|
{file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9625367c8955e4319049113ea4f8fee0c6c1145192d57946c6ffcd8fe8bf48dd"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"},
|
{file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e19509145275d46bc4d1e16af0b57a12d227c8253655a46bbd5ec317e941279d"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"},
|
{file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d438e4c020d8c39961deaf58f6913b1bf8832d9b6f62ec35bd93e97807e9cbc"},
|
||||||
{file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"},
|
{file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90bf55d9d139e5d127193170f38c584ed3c79e16638890d2e36f23aa1630b952"},
|
||||||
{file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = "sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"},
|
{file = "rpds_py-0.19.0-cp311-none-win32.whl", hash = "sha256:8d6ad132b1bc13d05ffe5b85e7a01a3998bf3a6302ba594b28d61b8c2cf13aaf"},
|
||||||
{file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"},
|
{file = "rpds_py-0.19.0-cp311-none-win_amd64.whl", hash = "sha256:7ec72df7354e6b7f6eb2a17fa6901350018c3a9ad78e48d7b2b54d0412539a67"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"},
|
{file = "rpds_py-0.19.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5095a7c838a8647c32aa37c3a460d2c48debff7fc26e1136aee60100a8cd8f68"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"},
|
{file = "rpds_py-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f2f78ef14077e08856e788fa482107aa602636c16c25bdf59c22ea525a785e9"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7cc6cb44f8636fbf4a934ca72f3e786ba3c9f9ba4f4d74611e7da80684e48d2"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf902878b4af334a09de7a45badbff0389e7cf8dc2e4dcf5f07125d0b7c2656d"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:688aa6b8aa724db1596514751ffb767766e02e5c4a87486ab36b8e1ebc1aedac"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57dbc9167d48e355e2569346b5aa4077f29bf86389c924df25c0a8b9124461fb"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4cf5a9497874822341c2ebe0d5850fed392034caadc0bad134ab6822c0925b"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"},
|
{file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8a790d235b9d39c70a466200d506bb33a98e2ee374a9b4eec7a8ac64c2c261fa"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"},
|
{file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d16089dfa58719c98a1c06f2daceba6d8e3fb9b5d7931af4a990a3c486241cb"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"},
|
{file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bc9128e74fe94650367fe23f37074f121b9f796cabbd2f928f13e9661837296d"},
|
||||||
{file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"},
|
{file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8f77e661ffd96ff104bebf7d0f3255b02aa5d5b28326f5408d6284c4a8b3248"},
|
||||||
{file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"},
|
{file = "rpds_py-0.19.0-cp312-none-win32.whl", hash = "sha256:5f83689a38e76969327e9b682be5521d87a0c9e5a2e187d2bc6be4765f0d4600"},
|
||||||
{file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"},
|
{file = "rpds_py-0.19.0-cp312-none-win_amd64.whl", hash = "sha256:06925c50f86da0596b9c3c64c3837b2481337b83ef3519e5db2701df695453a4"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"},
|
{file = "rpds_py-0.19.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:52e466bea6f8f3a44b1234570244b1cff45150f59a4acae3fcc5fd700c2993ca"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"},
|
{file = "rpds_py-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e21cc693045fda7f745c790cb687958161ce172ffe3c5719ca1764e752237d16"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b31f059878eb1f5da8b2fd82480cc18bed8dcd7fb8fe68370e2e6285fa86da6"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dd46f309e953927dd018567d6a9e2fb84783963650171f6c5fe7e5c41fd5666"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34a01a4490e170376cd79258b7f755fa13b1a6c3667e872c8e35051ae857a92b"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcf426a8c38eb57f7bf28932e68425ba86def6e756a5b8cb4731d8e62e4e0223"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68eea5df6347d3f1378ce992d86b2af16ad7ff4dcb4a19ccdc23dea901b87fb"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"},
|
{file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dab8d921b55a28287733263c0e4c7db11b3ee22aee158a4de09f13c93283c62d"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"},
|
{file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6fe87efd7f47266dfc42fe76dae89060038f1d9cb911f89ae7e5084148d1cc08"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"},
|
{file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:535d4b52524a961d220875688159277f0e9eeeda0ac45e766092bfb54437543f"},
|
||||||
{file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"},
|
{file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8b1a94b8afc154fbe36978a511a1f155f9bd97664e4f1f7a374d72e180ceb0ae"},
|
||||||
{file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"},
|
{file = "rpds_py-0.19.0-cp38-none-win32.whl", hash = "sha256:7c98298a15d6b90c8f6e3caa6457f4f022423caa5fa1a1ca7a5e9e512bdb77a4"},
|
||||||
{file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"},
|
{file = "rpds_py-0.19.0-cp38-none-win_amd64.whl", hash = "sha256:b0da31853ab6e58a11db3205729133ce0df26e6804e93079dee095be3d681dc1"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"},
|
{file = "rpds_py-0.19.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5039e3cef7b3e7a060de468a4a60a60a1f31786da94c6cb054e7a3c75906111c"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"},
|
{file = "rpds_py-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab1932ca6cb8c7499a4d87cb21ccc0d3326f172cfb6a64021a889b591bb3045c"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2afd2164a1e85226fcb6a1da77a5c8896c18bfe08e82e8ceced5181c42d2179"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1c30841f5040de47a0046c243fc1b44ddc87d1b12435a43b8edff7e7cb1e0d0"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f757f359f30ec7dcebca662a6bd46d1098f8b9fb1fcd661a9e13f2e8ce343ba1"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15e65395a59d2e0e96caf8ee5389ffb4604e980479c32742936ddd7ade914b22"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb0f6eb3a320f24b94d177e62f4074ff438f2ad9d27e75a46221904ef21a7b05"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"},
|
{file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b228e693a2559888790936e20f5f88b6e9f8162c681830eda303bad7517b4d5a"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"},
|
{file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2575efaa5d949c9f4e2cdbe7d805d02122c16065bfb8d95c129372d65a291a0b"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"},
|
{file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:5c872814b77a4e84afa293a1bee08c14daed1068b2bb1cc312edbf020bbbca2b"},
|
||||||
{file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"},
|
{file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:850720e1b383df199b8433a20e02b25b72f0fded28bc03c5bd79e2ce7ef050be"},
|
||||||
{file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"},
|
{file = "rpds_py-0.19.0-cp39-none-win32.whl", hash = "sha256:ce84a7efa5af9f54c0aa7692c45861c1667080814286cacb9958c07fc50294fb"},
|
||||||
{file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"},
|
{file = "rpds_py-0.19.0-cp39-none-win_amd64.whl", hash = "sha256:1c26da90b8d06227d7769f34915913911222d24ce08c0ab2d60b354e2d9c7aff"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:75969cf900d7be665ccb1622a9aba225cf386bbc9c3bcfeeab9f62b5048f4a07"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8445f23f13339da640d1be8e44e5baf4af97e396882ebbf1692aecd67f67c479"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5a7c1062ef8aea3eda149f08120f10795835fc1c8bc6ad948fb9652a113ca55"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:462b0c18fbb48fdbf980914a02ee38c423a25fcc4cf40f66bacc95a2d2d73bc8"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3208f9aea18991ac7f2b39721e947bbd752a1abbe79ad90d9b6a84a74d44409b"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3444fe52b82f122d8a99bf66777aed6b858d392b12f4c317da19f8234db4533"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb4bac7185a9f0168d38c01d7a00addece9822a52870eee26b8d5b61409213"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b130bd4163c93798a6b9bb96be64a7c43e1cec81126ffa7ffaa106e1fc5cef5"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:a707b158b4410aefb6b054715545bbb21aaa5d5d0080217290131c49c2124a6e"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dc9ac4659456bde7c567107556ab065801622396b435a3ff213daef27b495388"},
|
||||||
{file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"},
|
{file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:81ea573aa46d3b6b3d890cd3c0ad82105985e6058a4baed03cf92518081eec8c"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f148c3f47f7f29a79c38cc5d020edcb5ca780020fab94dbc21f9af95c463581"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0906357f90784a66e89ae3eadc2654f36c580a7d65cf63e6a616e4aec3a81be"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f629ecc2db6a4736b5ba95a8347b0089240d69ad14ac364f557d52ad68cf94b0"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6feacd1d178c30e5bc37184526e56740342fd2aa6371a28367bad7908d454fc"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b6068ee374fdfab63689be0963333aa83b0815ead5d8648389a8ded593378"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d57546bad81e0da13263e4c9ce30e96dcbe720dbff5ada08d2600a3502e526"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b6683a37338818646af718c9ca2a07f89787551057fae57c4ec0446dc6224b"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e8481b946792415adc07410420d6fc65a352b45d347b78fec45d8f8f0d7496f0"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bec35eb20792ea64c3c57891bc3ca0bedb2884fbac2c8249d9b731447ecde4fa"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:aa5476c3e3a402c37779e95f7b4048db2cb5b0ed0b9d006983965e93f40fe05a"},
|
||||||
{file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"},
|
{file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:19d02c45f2507b489fd4df7b827940f1420480b3e2e471e952af4d44a1ea8e34"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3e2fd14c5d49ee1da322672375963f19f32b3d5953f0615b175ff7b9d38daed"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:93a91c2640645303e874eada51f4f33351b84b351a689d470f8108d0e0694210"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b9fc03bf76a94065299d4a2ecd8dfbae4ae8e2e8098bbfa6ab6413ca267709"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a4b07cdf3f84310c08c1de2c12ddadbb7a77568bcb16e95489f9c81074322ed"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba0ed0dc6763d8bd6e5de5cf0d746d28e706a10b615ea382ac0ab17bb7388633"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:474bc83233abdcf2124ed3f66230a1c8435896046caa4b0b5ab6013c640803cc"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329c719d31362355a96b435f4653e3b4b061fcc9eba9f91dd40804ca637d914e"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef9101f3f7b59043a34f1dccbb385ca760467590951952d6701df0da9893ca0c"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0121803b0f424ee2109d6e1f27db45b166ebaa4b32ff47d6aa225642636cd834"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8344127403dea42f5970adccf6c5957a71a47f522171fafaf4c6ddb41b61703a"},
|
||||||
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"},
|
{file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:443cec402ddd650bb2b885113e1dcedb22b1175c6be223b14246a714b61cd521"},
|
||||||
{file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"},
|
{file = "rpds_py-0.19.0.tar.gz", hash = "sha256:4fdc9afadbeb393b4bbbad75481e0ea78e4469f2e1d713a90811700830b553a9"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -2893,18 +2905,18 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "70.1.0"
+version = "70.3.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"},
+    {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"},
-    {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"},
+    {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"},
 ]
 
 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "six"
@@ -2919,17 +2931,20 @@ files = [
 
 [[package]]
 name = "sympy"
-version = "1.12.1"
+version = "1.13.0"
 description = "Computer algebra system (CAS) in Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"},
+    {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"},
-    {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"},
+    {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"},
 ]
 
 [package.dependencies]
-mpmath = ">=1.1.0,<1.4.0"
+mpmath = ">=1.1.0,<1.4"
 
+[package.extras]
+dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
+
 [[package]]
 name = "tbb"
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "2.0.2"
+version = "2.0.4"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 
@@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
-requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
-scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
-requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
-scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -1,5 +1,5 @@
 import torch
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
 )
 
@@ -200,31 +200,27 @@ def download_weights(
         try:
             import json
 
-            medusa_head = hf_hub_download(
-                model_id, revision=revision, filename="medusa_lm_head.safetensors"
-            )
-            medusa_config = hf_hub_download(
+            config = hf_hub_download(
                 model_id, revision=revision, filename="config.json"
             )
-            with open(medusa_config, "r") as f:
+            with open(config, "r") as f:
                 config = json.load(f)
 
-            model_id = config["base_model_name_or_path"]
-            revision = "main"
-            try:
-                utils.weight_files(model_id, revision, extension)
-                logger.info(
-                    f"Files for parent {model_id} are already present on the host. "
-                    "Skipping download."
-                )
-                return
-            # Local files not found
-            except (
-                utils.LocalEntryNotFoundError,
-                FileNotFoundError,
-                utils.EntryNotFoundError,
-            ):
-                pass
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id and base_model_id != model_id:
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
         except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
             pass
 
@@ -241,31 +237,6 @@ def download_weights(
             if not extension == ".safetensors" or not auto_convert:
                 raise e
 
-    elif (Path(model_id) / "medusa_lm_head.safetensors").exists():
-        # Try to load as a local Medusa model
-        try:
-            import json
-
-            medusa_head = Path(model_id) / "medusa_lm_head.safetensors"
-            medusa_config = Path(model_id) / "config.json"
-            with open(medusa_config, "r") as f:
-                config = json.load(f)
-
-            model_id = config["base_model_name_or_path"]
-            revision = "main"
-            try:
-                utils.weight_files(model_id, revision, extension)
-                logger.info(
-                    f"Files for parent {model_id} are already present on the host. "
-                    "Skipping download."
-                )
-                return
-            # Local files not found
-            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
-                pass
-        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
-            pass
-
     elif (Path(model_id) / "adapter_config.json").exists():
         # Try to load as a local PEFT model
         try:
@@ -276,14 +247,43 @@ def download_weights(
             return
         except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
             pass
+    elif (Path(model_id) / "config.json").exists():
+        # Try to load as a local Medusa model
+        try:
+            import json
+
+            config = Path(model_id) / "config.json"
+            with open(config, "r") as f:
+                config = json.load(f)
+
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id:
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
 
     # Try to see if there are local pytorch weights
     try:
         # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
-        local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        try:
+            local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        except Exception:
+            local_pt_files = utils.weight_files(model_id, revision, ".pt")
 
     # No local pytorch weights
-    except utils.LocalEntryNotFoundError:
+    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
         if extension == ".safetensors":
             logger.warning(
                 f"No safetensors weights found for model {model_id} at revision {revision}. "
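Taken together, these `download_weights` hunks drop the `medusa_lm_head.safetensors` special case: whenever a repo's `config.json` names a `base_model_name_or_path` other than the model itself, the CLI now recursively downloads that parent first and then continues with the normal weight download (falling back to `.pt` files when no `.bin` weights exist). A minimal standalone sketch of the resolution step, assuming `huggingface_hub` is installed; `resolve_base_model` is a hypothetical helper, not part of the server:

```python
import json
from typing import Optional

from huggingface_hub import hf_hub_download


def resolve_base_model(model_id: str, revision: Optional[str] = None) -> Optional[str]:
    """Return the parent model id if `model_id` is a Medusa-style adapter repo.

    Mirrors the logic in the hunks above: fetch config.json and look for
    `base_model_name_or_path`; return None when the repo is self-contained.
    """
    config_path = hf_hub_download(model_id, revision=revision, filename="config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    base_model_id = config.get("base_model_name_or_path", None)
    if base_model_id and base_model_id != model_id:
        return base_model_id
    return None
```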
14 server/text_generation_server/layers/__init__.py Normal file
@@ -0,0 +1,14 @@
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+    TensorParallelEmbedding,
+)
+from text_generation_server.layers.linear import (
+    get_linear,
+    FastLinear,
+)
+from text_generation_server.layers.speculative import SpeculativeHead
+
+# Just to add the `load` methods.
+from text_generation_server.layers.layernorm import load_layer_norm
+from text_generation_server.layers.conv import load_conv2d
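For downstream code, the practical effect of this new package is an import-path change: symbols that previously lived under `text_generation_server.utils.layers` are re-exported from `text_generation_server.layers`, as the `@@ -1,5 +1,5 @@` hunk earlier in this diff shows. A minimal sketch of the migration, using names taken from the re-exports above:

```python
# Old import path (before this change):
# from text_generation_server.utils.layers import TensorParallelEmbedding

# New import path (this release):
from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    SpeculativeHead,
)
```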
106 server/text_generation_server/layers/bnb.py Normal file
@@ -0,0 +1,106 @@
|
import torch
|
||||||
|
from loguru import logger
|
||||||
|
from functools import lru_cache
|
||||||
|
import bitsandbytes as bnb
|
||||||
|
from bitsandbytes.nn import Int8Params, Params4bit
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(1)
|
||||||
|
def warn_deprecate_bnb():
|
||||||
|
logger.warning(
|
||||||
|
"Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Linear8bitLt(torch.nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
weight,
|
||||||
|
bias,
|
||||||
|
has_fp16_weights=True,
|
||||||
|
memory_efficient_backward=False,
|
||||||
|
threshold=0.0,
|
||||||
|
index=None,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
assert (
|
||||||
|
not memory_efficient_backward
|
||||||
|
), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
|
||||||
|
self.state = bnb.MatmulLtState()
|
||||||
|
self.index = index
|
||||||
|
|
||||||
|
# Necessary for stacked layers
|
||||||
|
self.state.threshold = threshold
|
||||||
|
self.state.has_fp16_weights = has_fp16_weights
|
||||||
|
self.state.memory_efficient_backward = memory_efficient_backward
|
||||||
|
if threshold > 0.0 and not has_fp16_weights:
|
||||||
|
self.state.use_pool = True
|
||||||
|
|
||||||
|
self.weight = Int8Params(
|
||||||
|
weight.data,
|
||||||
|
has_fp16_weights=has_fp16_weights,
|
||||||
|
requires_grad=has_fp16_weights,
|
||||||
|
)
|
||||||
|
self.weight.cuda(weight.device)
|
||||||
|
self.bias = bias
|
||||||
|
|
||||||
|
def init_8bit_state(self):
|
||||||
|
self.state.CB = self.weight.CB
|
||||||
|
self.state.SCB = self.weight.SCB
|
||||||
|
self.weight.CB = None
|
||||||
|
self.weight.SCB = None
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor):
|
||||||
|
self.state.is_training = self.training
|
||||||
|
if self.weight.CB is not None:
|
||||||
|
self.init_8bit_state()
|
||||||
|
|
||||||
|
# weights are cast automatically as Int8Params, but the bias has to be cast manually
|
||||||
|
if self.bias is not None and self.bias.dtype != x.dtype:
|
||||||
|
self.bias.data = self.bias.data.to(x.dtype)
|
||||||
|
|
||||||
|
out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
|
||||||
|
|
||||||
|
if not self.state.has_fp16_weights:
|
||||||
|
if self.state.CB is not None and self.state.CxB is not None:
|
||||||
|
# we converted 8-bit row major to turing/ampere format in the first inference pass
|
||||||
|
# we no longer need the row-major weight
|
||||||
|
del self.state.CB
|
||||||
|
self.weight.data = self.state.CxB
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class Linear4bit(torch.nn.Module):
|
||||||
|
def __init__(self, weight, bias, quant_type):
|
||||||
|
super().__init__()
|
||||||
|
self.weight = Params4bit(
|
||||||
|
weight.data,
|
||||||
|
requires_grad=False,
|
||||||
|
compress_statistics=True,
|
||||||
|
quant_type=quant_type,
|
||||||
|
)
|
||||||
|
self.compute_dtype = None
|
||||||
|
self.weight.cuda(weight.device)
|
||||||
|
self.bias = bias
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor):
|
||||||
|
# weights are cast automatically as Int8Params, but the bias has to be cast manually
|
||||||
|
if self.bias is not None and self.bias.dtype != x.dtype:
|
||||||
|
self.bias.data = self.bias.data.to(x.dtype)
|
||||||
|
|
||||||
|
if getattr(self.weight, "quant_state", None) is None:
|
||||||
|
print(
|
||||||
|
"FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
|
||||||
|
)
|
||||||
|
inp_dtype = x.dtype
|
||||||
|
if self.compute_dtype is not None:
|
||||||
|
x = x.to(self.compute_dtype)
|
||||||
|
|
||||||
|
bias = None if self.bias is None else self.bias.to(self.compute_dtype)
|
||||||
|
out = bnb.matmul_4bit(
|
||||||
|
x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
|
||||||
|
)
|
||||||
|
|
||||||
|
out = out.to(inp_dtype)
|
||||||
|
|
||||||
|
return out
|
41 server/text_generation_server/layers/conv.py Normal file
@@ -0,0 +1,41 @@
+from accelerate import init_empty_weights
+import torch
+
+
+@classmethod
+def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    with init_empty_weights():
+        conv2d = cls(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+
+    conv2d.weight = torch.nn.Parameter(weight)
+    conv2d.bias = torch.nn.Parameter(bias)
+    return conv2d
+
+
+@classmethod
+def load_conv2d_no_bias(
+    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
+):
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():
+        conv2d = cls(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+        )
+
+    conv2d.weight = torch.nn.Parameter(weight)
+    conv2d.bias = None
+    return conv2d
+
+
+torch.nn.Conv2d.load = load_conv2d
+torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
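`load_conv2d` and `load_conv2d_no_bias` are attached to `torch.nn.Conv2d` as extra classmethods, so model code can materialize a convolution straight from a weights shard without allocating it first. A small sketch of the call site, using a toy stand-in for the server's `Weights` loader; the prefix and shapes are illustrative only:

```python
import torch
import text_generation_server.layers.conv  # noqa: F401  (importing applies the Conv2d.load monkey-patch)


class DictWeights:
    """Tiny stand-in for the server's Weights loader (illustrative only)."""

    def __init__(self, tensors):
        self.tensors = tensors

    def get_tensor(self, name):
        return self.tensors[name]


prefix = "vision_model.embeddings.patch_embedding"  # hypothetical prefix
weights = DictWeights(
    {
        f"{prefix}.weight": torch.randn(32, 3, 14, 14),
        f"{prefix}.bias": torch.randn(32),
    }
)

patch_embed = torch.nn.Conv2d.load(
    prefix, weights, in_channels=3, out_channels=32, kernel_size=14, stride=14
)
print(patch_embed(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 32, 16, 16])
```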
25 server/text_generation_server/layers/eetq.py Normal file
@@ -0,0 +1,25 @@
+import torch
+from EETQ import quant_weights, w8_a16_gemm
+
+
+class EETQLinear(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        device = weight.device
+        if weight.dtype != torch.float16:
+            weight = weight.to(dtype=torch.float16)
+        weight = torch.t(weight).contiguous().cpu()
+        weight, scale = quant_weights(weight, torch.int8, False)
+
+        self.weight = weight.cuda(device)
+        self.scale = scale.cuda(device)
+        self.bias = bias.cuda(device) if bias is not None else None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        output = w8_a16_gemm(input, self.weight, self.scale)
+        output = output + self.bias if self.bias is not None else output
+        return output
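`EETQLinear` quantizes an fp16 weight to int8 on construction and runs the matmul through EETQ's `w8_a16_gemm` kernel. A hedged usage sketch, assuming the EETQ extension is installed and a CUDA device is available (the layer moves its buffers back to the weight's original device):

```python
import torch
from text_generation_server.layers.eetq import EETQLinear

# Replace an fp16 nn.Linear with its int8-weight EETQ counterpart (shapes are illustrative).
lin = torch.nn.Linear(4096, 4096, dtype=torch.float16, device="cuda")
qlin = EETQLinear(lin.weight.detach(), lin.bias.detach())

x = torch.randn(4, 4096, dtype=torch.float16, device="cuda")
y = qlin(x)     # int8 weights, fp16 activations
print(y.shape)  # torch.Size([4, 4096])
```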
43 server/text_generation_server/layers/fp8.py Normal file
@@ -0,0 +1,43 @@
+import torch
+
+
+def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
+    device = weight.device
+    # weight, scale = quant_weights(weight, torch.int8, False)
+    finfo = torch.finfo(qdtype)
+    # Calculate the scale as dtype max divided by absmax
+    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
+    # scale and clamp the tensor to bring it to
+    # the representative range of float8 data type
+    # (as default cast is unsaturated)
+    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
+    # Return both float8 data and the inverse scale (as float),
+    # as both required as inputs to torch._scaled_mm
+    qweight = qweight.to(qdtype)
+    scale = scale.float().reciprocal()
+    return qweight, scale
+
+
+class Fp8Linear(torch.nn.Module):
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.dtype = weight.dtype
+        self.qweight, self.scale = fp8_quantize(weight)
+
+        self.bias = bias if bias is not None else None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        qinput, scale = fp8_quantize(input)
+        output, _ = torch._scaled_mm(
+            qinput,
+            self.qweight.t(),
+            out_dtype=self.dtype,
+            scale_a=scale,
+            scale_b=self.scale,
+            bias=self.bias,
+        )
+        return output
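`fp8_quantize` uses a single per-tensor scale (`finfo.max / absmax`) and returns the float8 payload together with the reciprocal scale that `torch._scaled_mm` expects. A small round-trip sketch, assuming a PyTorch build with `torch.float8_e4m3fn` support; the quantize step itself runs on CPU, while `Fp8Linear` needs a GPU with fp8 matmul support:

```python
import torch
from text_generation_server.layers.fp8 import fp8_quantize, Fp8Linear

w = torch.randn(1024, 1024, dtype=torch.float16)
qweight, inv_scale = fp8_quantize(w)
print(qweight.dtype)     # torch.float8_e4m3fn
print(inv_scale.item())  # absmax / 448 for e4m3fn

# Dequantize to inspect the error introduced by the per-tensor scale.
w_hat = qweight.to(torch.float32) * inv_scale
print((w.float() - w_hat).abs().max())

# The Linear wrapper quantizes activations on the fly and calls torch._scaled_mm,
# which requires an fp8-capable GPU (e.g. Hopper or Ada).
if torch.cuda.is_available():
    lin = Fp8Linear(w.cuda(), bias=None)
    y = lin(torch.randn(8, 1024, dtype=torch.float16, device="cuda"))
```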
39 server/text_generation_server/layers/gptq/__init__.py Normal file
@@ -0,0 +1,39 @@
+import os
+import torch
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
+V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
+if os.getenv("DISABLE_EXLLAMA") == "True":
+    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        if V2:
+            from text_generation_server.layers.gptq.exllamav2 import (
+                QuantLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "2"
+        else:
+            from text_generation_server.layers.gptq.exllama import (
+                Ex4bitLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "1"
+
+    except ImportError:
+        pass
+
+from text_generation_server.layers.gptq.quant_linear import QuantLinear
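Kernel selection above happens at import time and is driven by device capability plus two environment variables: `EXLLAMA_VERSION` (default `"2"`) picks the exllama kernel generation, and `DISABLE_EXLLAMA=True` forces the Triton `QuantLinear` fallback. A small sketch of pinning these in a deployment, assuming they are set before the module is imported:

```python
import os

# These must be exported before text_generation_server.layers.gptq is imported,
# because the flags are read once at import time.
os.environ["EXLLAMA_VERSION"] = "1"       # prefer the exllama v1 kernel
# os.environ["DISABLE_EXLLAMA"] = "True"  # or force the Triton QuantLinear fallback

from text_generation_server.layers.gptq import CAN_EXLLAMA, HAS_EXLLAMA

print(CAN_EXLLAMA)  # True on CUDA compute capability >= 8.0 or on ROCm
print(HAS_EXLLAMA)  # "1", "2", or False depending on the flags and available kernels
```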
10 server/text_generation_server/layers/gptq/exllamav2.py.rej Normal file
@@ -0,0 +1,10 @@
+diff a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py (rejected hunks)
+@@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
+             none_tensor,
+             temp_dq,
+         )
++    else:
++        RuntimeError("Cannot create handle")
+
+
+ DEVICE = None
356 server/text_generation_server/layers/gptq/quant_linear.py Normal file
@@ -0,0 +1,356 @@
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.cuda.amp import custom_fwd
|
||||||
|
|
||||||
|
import triton
|
||||||
|
import triton.language as tl
|
||||||
|
from . import custom_autotune
|
||||||
|
|
||||||
|
|
||||||
|
# code based https://github.com/fpgaminer/GPTQ-triton
|
||||||
|
@custom_autotune.autotune(
|
||||||
|
configs=[
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 256,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=4,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=4,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=4,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=4,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=4,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=2,
|
||||||
|
num_warps=8,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=3,
|
||||||
|
num_warps=8,
|
||||||
|
),
|
||||||
|
triton.Config(
|
||||||
|
{
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
},
|
||||||
|
num_stages=2,
|
||||||
|
num_warps=4,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
key=["M", "N", "K"],
|
||||||
|
nearest_power_of_two=True,
|
||||||
|
prune_configs_by={
|
||||||
|
"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
|
||||||
|
"perf_model": None,
|
||||||
|
"top_k": None,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
@triton.jit
|
||||||
|
def matmul_248_kernel(
|
||||||
|
a_ptr,
|
||||||
|
b_ptr,
|
||||||
|
c_ptr,
|
||||||
|
scales_ptr,
|
||||||
|
zeros_ptr,
|
||||||
|
g_ptr,
|
||||||
|
M,
|
||||||
|
N,
|
||||||
|
K,
|
||||||
|
bits,
|
||||||
|
maxq,
|
||||||
|
stride_am,
|
||||||
|
stride_ak,
|
||||||
|
stride_bk,
|
||||||
|
stride_bn,
|
||||||
|
stride_cm,
|
||||||
|
stride_cn,
|
||||||
|
stride_scales,
|
||||||
|
stride_zeros,
|
||||||
|
BLOCK_SIZE_M: tl.constexpr,
|
||||||
|
BLOCK_SIZE_N: tl.constexpr,
|
||||||
|
BLOCK_SIZE_K: tl.constexpr,
|
||||||
|
GROUP_SIZE_M: tl.constexpr,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Compute the matrix multiplication C = A x B.
|
||||||
|
A is of shape (M, K) float16
|
||||||
|
B is of shape (K//8, N) int32
|
||||||
|
C is of shape (M, N) float16
|
||||||
|
scales is of shape (G, N) float16
|
||||||
|
zeros is of shape (G, N) float16
|
||||||
|
g_ptr is of shape (K) int32
|
||||||
|
"""
|
||||||
|
infearure_per_bits = 32 // bits
|
||||||
|
|
||||||
|
pid = tl.program_id(axis=0)
|
||||||
|
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
|
||||||
|
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
|
||||||
|
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
|
||||||
|
num_pid_in_group = GROUP_SIZE_M * num_pid_n
|
||||||
|
group_id = pid // num_pid_in_group
|
||||||
|
first_pid_m = group_id * GROUP_SIZE_M
|
||||||
|
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
|
||||||
|
pid_m = first_pid_m + (pid % group_size_m)
|
||||||
|
pid_n = (pid % num_pid_in_group) // group_size_m
|
||||||
|
|
||||||
|
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
|
||||||
|
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
||||||
|
offs_k = tl.arange(0, BLOCK_SIZE_K)
|
||||||
|
a_ptrs = a_ptr + (
|
||||||
|
offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
|
||||||
|
) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
|
||||||
|
a_mask = offs_am[:, None] < M
|
||||||
|
# b_ptrs is set up such that it repeats elements along the K axis 8 times
|
||||||
|
b_ptrs = b_ptr + (
|
||||||
|
(offs_k[:, None] // infearure_per_bits) * stride_bk
|
||||||
|
+ offs_bn[None, :] * stride_bn
|
||||||
|
) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
|
||||||
|
g_ptrs = g_ptr + offs_k
|
||||||
|
# shifter is used to extract the N bits of each element in the 32-bit word from B
|
||||||
|
scales_ptrs = scales_ptr + offs_bn[None, :]
|
||||||
|
zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
|
||||||
|
|
||||||
|
shifter = (offs_k % infearure_per_bits) * bits
|
||||||
|
zeros_shifter = (offs_bn % infearure_per_bits) * bits
|
||||||
|
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
|
||||||
|
|
||||||
|
for k in range(0, num_pid_k):
|
||||||
|
g_idx = tl.load(g_ptrs)
|
||||||
|
|
||||||
|
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
|
||||||
|
scales = tl.load(
|
||||||
|
scales_ptrs + g_idx[:, None] * stride_scales
|
||||||
|
) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
|
||||||
|
zeros = tl.load(
|
||||||
|
zeros_ptrs + g_idx[:, None] * stride_zeros
|
||||||
|
) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
|
||||||
|
|
||||||
|
zeros = (zeros >> zeros_shifter[None, :]) & maxq
|
||||||
|
zeros = (zeros + 1) & maxq # eventually avoid overflow
|
||||||
|
|
||||||
|
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
|
||||||
|
b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
|
||||||
|
|
||||||
|
# Now we need to unpack b (which is N-bit values) into 32-bit values
|
||||||
|
b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
|
||||||
|
b = (b - zeros) * scales # Scale and shift
|
||||||
|
|
||||||
|
accumulator += tl.dot(a, b)
|
||||||
|
a_ptrs += BLOCK_SIZE_K
|
||||||
|
b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
|
||||||
|
g_ptrs += BLOCK_SIZE_K
|
||||||
|
|
||||||
|
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
|
||||||
|
c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
|
||||||
|
tl.store(c_ptrs, accumulator, mask=c_mask)
|
||||||
|
|
||||||
|
|
||||||
|
def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
|
||||||
|
with torch.cuda.device(input.device):
|
||||||
|
output = torch.empty(
|
||||||
|
(input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
|
||||||
|
)
|
||||||
|
grid = lambda META: (
|
||||||
|
triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
|
||||||
|
* triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
|
||||||
|
)
|
||||||
|
matmul_248_kernel[grid](
|
||||||
|
input,
|
||||||
|
qweight,
|
||||||
|
output,
|
||||||
|
scales,
|
||||||
|
qzeros,
|
||||||
|
g_idx,
|
||||||
|
input.shape[0],
|
||||||
|
qweight.shape[1],
|
||||||
|
input.shape[1],
|
||||||
|
bits,
|
||||||
|
maxq,
|
||||||
|
input.stride(0),
|
||||||
|
input.stride(1),
|
||||||
|
qweight.stride(0),
|
||||||
|
qweight.stride(1),
|
||||||
|
output.stride(0),
|
||||||
|
output.stride(1),
|
||||||
|
scales.stride(0),
|
||||||
|
qzeros.stride(0),
|
||||||
|
)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class QuantLinearFunction(torch.autograd.Function):
|
||||||
|
@staticmethod
|
||||||
|
@custom_fwd(cast_inputs=torch.float16)
|
||||||
|
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
|
||||||
|
output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class QuantLinear(nn.Module):
|
||||||
|
def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
|
||||||
|
super().__init__()
|
||||||
|
self.register_buffer("qweight", qweight)
|
||||||
|
self.register_buffer("qzeros", qzeros)
|
||||||
|
self.register_buffer("scales", scales)
|
||||||
|
self.register_buffer("g_idx", g_idx)
|
||||||
|
if bias is not None:
|
||||||
|
self.register_buffer("bias", bias)
|
||||||
|
else:
|
||||||
|
self.bias = None
|
||||||
|
if bits not in [2, 4, 8]:
|
||||||
|
raise NotImplementedError("Only 2,4,8 bits are supported.")
|
||||||
|
self.bits = bits
|
||||||
|
self.maxq = 2**self.bits - 1
|
||||||
|
self.groupsize = groupsize
|
||||||
|
|
||||||
|
self.outfeatures = qweight.shape[1]
|
||||||
|
self.infeatures = qweight.shape[0] * 32 // bits
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def new(cls, bits, groupsize, infeatures, outfeatures, bias):
|
||||||
|
if bits not in [2, 4, 8]:
|
||||||
|
raise NotImplementedError("Only 2,4,8 bits are supported.")
|
||||||
|
|
||||||
|
qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
|
||||||
|
qzeros = torch.zeros(
|
||||||
|
(math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
|
||||||
|
dtype=torch.int32,
|
||||||
|
)
|
||||||
|
scales = torch.zeros(
|
||||||
|
(math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
|
||||||
|
)
|
||||||
|
g_idx = torch.tensor(
|
||||||
|
[i // groupsize for i in range(infeatures)], dtype=torch.int32
|
||||||
|
)
|
||||||
|
if bias:
|
||||||
|
bias = torch.zeros((outfeatures), dtype=torch.float16)
|
||||||
|
else:
|
||||||
|
bias = None
|
||||||
|
return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
|
||||||
|
|
||||||
|
def pack(self, linear, scales, zeros, g_idx=None):
|
||||||
|
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
|
||||||
|
|
||||||
|
scales = scales.t().contiguous()
|
||||||
|
zeros = zeros.t().contiguous()
|
||||||
|
scale_zeros = zeros * scales
|
||||||
|
self.scales = scales.clone().half()
|
||||||
|
if linear.bias is not None:
|
||||||
|
self.bias = linear.bias.clone().half()
|
||||||
|
|
||||||
|
intweight = []
|
||||||
|
for idx in range(self.infeatures):
|
||||||
|
intweight.append(
|
||||||
|
torch.round(
|
||||||
|
(linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
|
||||||
|
/ self.scales[self.g_idx[idx]]
|
||||||
|
).to(torch.int)[:, None]
|
||||||
|
)
|
||||||
|
intweight = torch.cat(intweight, dim=1)
|
||||||
|
intweight = intweight.t().contiguous()
|
||||||
|
intweight = intweight.numpy().astype(np.uint32)
|
||||||
|
qweight = np.zeros(
|
||||||
|
(intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
|
||||||
|
)
|
||||||
|
i = 0
|
||||||
|
row = 0
|
||||||
|
while row < qweight.shape[0]:
|
||||||
|
if self.bits in [2, 4, 8]:
|
||||||
|
for j in range(i, i + (32 // self.bits)):
|
||||||
|
qweight[row] |= intweight[j] << (self.bits * (j - i))
|
||||||
|
i += 32 // self.bits
|
||||||
|
row += 1
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Only 2,4,8 bits are supported.")
|
||||||
|
|
||||||
|
qweight = qweight.astype(np.int32)
|
||||||
|
self.qweight = torch.from_numpy(qweight)
|
||||||
|
|
||||||
|
zeros -= 1
|
||||||
|
zeros = zeros.numpy().astype(np.uint32)
|
||||||
|
qzeros = np.zeros(
|
||||||
|
(zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
|
||||||
|
)
|
||||||
|
i = 0
|
||||||
|
col = 0
|
||||||
|
while col < qzeros.shape[1]:
|
||||||
|
if self.bits in [2, 4, 8]:
|
||||||
|
for j in range(i, i + (32 // self.bits)):
|
||||||
|
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
|
||||||
|
i += 32 // self.bits
|
||||||
|
col += 1
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Only 2,4,8 bits are supported.")
|
||||||
|
|
||||||
|
qzeros = qzeros.astype(np.int32)
|
||||||
|
self.qzeros = torch.from_numpy(qzeros)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
out_shape = x.shape[:-1] + (self.outfeatures,)
|
||||||
|
out = QuantLinearFunction.apply(
|
||||||
|
x.reshape(-1, x.shape[-1]),
|
||||||
|
self.qweight,
|
||||||
|
self.scales,
|
||||||
|
self.qzeros,
|
||||||
|
self.g_idx,
|
||||||
|
self.bits,
|
||||||
|
self.maxq,
|
||||||
|
)
|
||||||
|
out = out + self.bias if self.bias is not None else out
|
||||||
|
return out.reshape(out_shape)
|
185 server/text_generation_server/layers/layernorm.py Normal file
@@ -0,0 +1,185 @@
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from accelerate import init_empty_weights
|
||||||
|
from text_generation_server.utils.import_utils import (
|
||||||
|
SYSTEM,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Monkey patching
|
||||||
|
@classmethod
|
||||||
|
def load_layer_norm(cls, prefix, weights, eps):
|
||||||
|
weight = weights.get_tensor(f"{prefix}.weight")
|
||||||
|
bias = weights.get_tensor(f"{prefix}.bias")
|
||||||
|
with init_empty_weights():
|
||||||
|
ln = cls(weight.shape, eps=eps)
|
||||||
|
|
||||||
|
ln.weight = torch.nn.Parameter(weight)
|
||||||
|
ln.bias = torch.nn.Parameter(bias)
|
||||||
|
return ln
|
||||||
|
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load_layer_norm_no_bias(cls, prefix, weights, eps):
|
||||||
|
weight = weights.get_tensor(f"{prefix}.weight")
|
||||||
|
with init_empty_weights():
|
||||||
|
ln = cls(weight.shape, eps=eps)
|
||||||
|
|
||||||
|
ln.weight = torch.nn.Parameter(weight)
|
||||||
|
ln.bias = None
|
||||||
|
return ln
|
||||||
|
|
||||||
|
|
||||||
|
torch.nn.LayerNorm.load = load_layer_norm
|
||||||
|
torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
|
||||||
|
|
||||||
|
if SYSTEM == "cuda":
|
||||||
|
import dropout_layer_norm
|
||||||
|
|
||||||
|
class FastLayerNorm(nn.LayerNorm):
|
||||||
|
def forward(self, hidden_states, residual=None):
|
||||||
|
if hidden_states.shape[-1] > 8192:
|
||||||
|
if residual is not None:
|
||||||
|
hidden_states += residual
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
return super(FastLayerNorm, self).forward(hidden_states), residual
|
||||||
|
else:
|
||||||
|
(
|
||||||
|
normed_hidden_states,
|
||||||
|
residual,
|
||||||
|
*rest,
|
||||||
|
) = dropout_layer_norm.dropout_add_ln_fwd(
|
||||||
|
hidden_states,
|
||||||
|
residual,
|
||||||
|
self.weight,
|
||||||
|
self.bias,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
0.0,
|
||||||
|
self.eps,
|
||||||
|
1.0,
|
||||||
|
0,
|
||||||
|
None,
|
||||||
|
False,
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
if residual is None:
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
return normed_hidden_states, residual
|
||||||
|
|
||||||
|
elif SYSTEM == "rocm":
|
||||||
|
from vllm._C import ops
|
||||||
|
|
||||||
|
class FastLayerNorm(nn.LayerNorm):
|
||||||
|
def forward(self, hidden_states, residual=None):
|
||||||
|
if residual is not None:
|
||||||
|
hidden_states += residual
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
return super().forward(hidden_states), residual
|
||||||
|
|
||||||
|
elif SYSTEM == "xpu":
|
||||||
|
import intel_extension_for_pytorch as ipex
|
||||||
|
|
||||||
|
class FastLayerNorm(nn.LayerNorm):
|
||||||
|
def forward(self, hidden_states, residual=None):
|
||||||
|
res_out = hidden_states
|
||||||
|
out = ipex.llm.functional.add_layer_norm(
|
||||||
|
residual, hidden_states, self.weight, self.bias, self.eps, True
|
||||||
|
)
|
||||||
|
if residual is not None:
|
||||||
|
res_out = residual
|
||||||
|
return out, res_out
|
||||||
|
|
||||||
|
|
||||||
|
class FastRMSNorm(nn.Module):
|
||||||
|
def __init__(self, weight: torch.Tensor, eps: float):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.weight = nn.Parameter(weight)
|
||||||
|
self.variance_epsilon = eps
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, prefix, weights, eps=1e-6):
|
||||||
|
weight = weights.get_tensor(f"{prefix}.weight")
|
||||||
|
return cls(weight, eps)
|
||||||
|
|
||||||
|
def forward(self, hidden_states, residual=None):
|
||||||
|
if SYSTEM == "xpu":
|
||||||
|
residual_out = hidden_states
|
||||||
|
out = ipex.llm.functional.add_rms_norm(
|
||||||
|
residual,
|
||||||
|
hidden_states,
|
||||||
|
self.weight,
|
||||||
|
None,
|
||||||
|
self.variance_epsilon,
|
||||||
|
True,
|
||||||
|
)
|
||||||
|
if residual is not None:
|
||||||
|
residual_out = residual
|
||||||
|
return out, residual_out
|
||||||
|
elif hidden_states.shape[-1] > 8192:
|
||||||
|
if residual is not None:
|
||||||
|
hidden_states += residual
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
hidden_states = hidden_states.to(torch.float32)
|
||||||
|
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
||||||
|
hidden_states = hidden_states * torch.rsqrt(
|
||||||
|
variance + self.variance_epsilon
|
||||||
|
)
|
||||||
|
|
||||||
|
# convert into half-precision if necessary
|
||||||
|
if self.weight.dtype in [torch.float16, torch.bfloat16]:
|
||||||
|
hidden_states = hidden_states.to(self.weight.dtype)
|
||||||
|
|
||||||
|
return self.weight * hidden_states, residual
|
||||||
|
elif SYSTEM == "cuda":
|
||||||
|
# faster post attention rms norm
|
||||||
|
(
|
||||||
|
normed_hidden_states,
|
||||||
|
res,
|
||||||
|
*rest,
|
||||||
|
) = dropout_layer_norm.dropout_add_ln_fwd(
|
||||||
|
hidden_states,
|
||||||
|
residual,
|
||||||
|
self.weight,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
0.0,
|
||||||
|
self.variance_epsilon,
|
||||||
|
1.0,
|
||||||
|
0,
|
||||||
|
None,
|
||||||
|
False,
|
||||||
|
True, # Activate RMSNorm
|
||||||
|
)
|
||||||
|
if res is None:
|
||||||
|
res = hidden_states
|
||||||
|
|
||||||
|
return normed_hidden_states, res
|
||||||
|
elif SYSTEM == "rocm":
|
||||||
|
# We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
|
||||||
|
if residual is not None:
|
||||||
|
hidden_states += residual
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
out = torch.empty_like(hidden_states)
|
||||||
|
ops.rms_norm(
|
||||||
|
out,
|
||||||
|
hidden_states,
|
||||||
|
self.weight.data,
|
||||||
|
self.variance_epsilon,
|
||||||
|
)
|
||||||
|
return out, residual
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
|
||||||
|
)
|
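Note (not part of the diff): the `hidden_states.shape[-1] > 8192` branch of `FastRMSNorm.forward` above is the plain-PyTorch RMSNorm fallback. A minimal standalone sketch of that same math, with made-up shapes, for reference:

import torch

def rms_norm_reference(hidden_states, weight, eps=1e-6):
    # Compute in float32 for stability, then cast back, as the fallback path does.
    dtype = hidden_states.dtype
    hs = hidden_states.to(torch.float32)
    variance = hs.pow(2).mean(-1, keepdim=True)
    hs = hs * torch.rsqrt(variance + eps)
    return weight * hs.to(dtype)

# Illustrative tensors only: (batch=2, hidden=8) half-precision activations.
x = torch.randn(2, 8, dtype=torch.float16)
w = torch.ones(8, dtype=torch.float16)
out = rms_norm_reference(x, w)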
216
server/text_generation_server/layers/linear.py
Normal file
@@ -0,0 +1,216 @@
import torch
from torch import nn
from torch.nn import functional as F
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "rocm":
    try:
        from vllm import _custom_C
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")


class FastLinear(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias, requires_grad=False)
        else:
            self.bias = None

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)


class FastLinearROCm(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias)
        else:
            self.bias = None

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        weight = self.weight
        bias = self.bias

        if SYSTEM == "rocm" and inp.numel() // inp.shape[-1] == 1:
            batched = False
            inp_shape = inp.shape

            if inp.dim() == 3:
                inp = inp.view(-1, inp_shape[-1])
                batched = True

            m, k = weight.shape[0], inp_shape[1]
            out = torch.empty(
                inp_shape[0], weight.shape[0], dtype=inp.dtype, device="cuda"
            )
            if (k == 8192 and (m == 1280 or m == 7168)) or (k == 3584 and m == 8192):
                _custom_C.LLMM1(weight, inp, out, 8)
            elif k <= 8192 and k % 8 == 0 and m % 4 == 0:
                _custom_C.LLMM1(weight, inp, out, 4)
            else:
                out = F.linear(inp, weight)

            if batched:
                out.view(*inp_shape[:-1], out.shape[-1])

            if bias is not None:
                out = out + bias
            return out
        return F.linear(inp, self.weight, self.bias)


def get_linear(weight, bias, quantize):
    if quantize is None:
        if SYSTEM == "rocm":
            linear = FastLinearROCm(weight, bias)
        else:
            linear = FastLinear(weight, bias)
    elif quantize == "eetq":
        try:
            from text_generation_server.layers.eetq import EETQLinear

            linear = EETQLinear(weight, bias)
        except ImportError:
            raise ImportError(
                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
            )
    elif quantize == "fp8":
        from text_generation_server.layers.fp8 import Fp8Linear

        linear = Fp8Linear(weight, bias)
    elif quantize == "bitsandbytes":
        try:
            from text_generation_server.layers.bnb import (
                warn_deprecate_bnb,
                Linear8bitLt,
            )
        except ImportError:
            raise NotImplementedError(
                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
            )
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
            has_fp16_weights=False,
            threshold=6.0,
        )
        if bias is not None:
            linear.bias = nn.Parameter(bias)
    elif quantize == "bitsandbytes-fp4":
        try:
            from text_generation_server.layers.bnb import Linear4bit
        except ImportError:
            raise NotImplementedError(
                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
            )
        linear = Linear4bit(
            weight,
            bias,
            quant_type="fp4",
        )
    elif quantize == "bitsandbytes-nf4":
        try:
            from text_generation_server.layers.bnb import Linear4bit
        except ImportError:
            raise NotImplementedError(
                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
            )
        linear = Linear4bit(
            weight,
            bias,
            quant_type="nf4",
        )
    elif quantize == "gptq":
        try:
            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
        except Exception:
            raise NotImplementedError(
                f"The passed weight is not `gptq` compatible, loader needs to be updated."
            )

        if use_exllama:
            try:
                from text_generation_server.layers.gptq import (
                    ExllamaQuantLinear,
                )
            except ImportError:
                raise NotImplementedError(
                    f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
                )

            linear = ExllamaQuantLinear(
                qweight, qzeros, scales, g_idx, bias, bits, groupsize
            )
        else:
            from text_generation_server.layers.gptq.quant_linear import QuantLinear

            linear = QuantLinear(
                qweight,
                qzeros,
                scales,
                g_idx,
                bias,
                bits,
                groupsize,
            )
    elif quantize == "awq":
        try:
            qweight, qzeros, scales, _, bits, groupsize, _ = weight
        except Exception:
            raise NotImplementedError(
                f"The passed weight is not `awq` compatible, loader needs to be updated."
            )
        if SYSTEM == "rocm":
            raise NotImplementedError(
                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
                "to use Exllama/GPTQ kernels for AWQ inference."
            )
        try:
            from text_generation_server.layers.awq.quantize.qmodule import WQLinear

            linear = WQLinear(
                w_bit=bits,
                group_size=groupsize,
                qweight=qweight,
                qzeros=qzeros,
                scales=scales,
                bias=bias is not None,
            )
        except ImportError:
            raise NotImplementedError(
                "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
            )
    else:
        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
    return linear
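Note (not part of the diff): a small sketch of how `get_linear` above is meant to be called for the unquantized path; the tensor shapes are illustrative assumptions, and on ROCm the same call dispatches to `FastLinearROCm` instead of `FastLinear`.

import torch
from text_generation_server.layers.linear import get_linear

# Hypothetical dense weight/bias; quantize=None selects the plain FastLinear wrapper.
weight = torch.randn(4096, 4096, dtype=torch.float16)
bias = torch.zeros(4096, dtype=torch.float16)
linear = get_linear(weight, bias, quantize=None)
y = linear(torch.randn(1, 4096, dtype=torch.float16))  # (1, 4096) logits/activations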
189
server/text_generation_server/layers/medusa.py
Normal file
@@ -0,0 +1,189 @@
import torch
from torch import nn
from typing import Tuple, Optional
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.layers.linear import FastLinear
from text_generation_server.layers.tensor_parallel import (
    TensorParallelHead,
    TensorParallelColumnLinear,
)


class ResBlock(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.linear = FastLinear.load(
            config, prefix=f"{prefix}.linear", weights=weights, bias=True
        )
        self.act = torch.nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))


class MedusaModel(torch.nn.Module):
    def __init__(self, config, medusa_config, weights):
        super().__init__()
        self.heads = torch.nn.ModuleList(
            [
                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
                for i in range(get_speculate())
            ]
        )

    def forward(self, x):
        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
        return speculative_logits


class MedusaHead(torch.nn.Module):
    def __init__(self, config, medusa_config, prefix, weights):
        super().__init__()
        self.blocks = torch.nn.ModuleList(
            [
                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
                for i in range(medusa_config["medusa_num_layers"])
            ]
        )
        n = len(self.blocks)
        self.out = FastLinear.load(
            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.out(x)
        return x


class MedusaHeadV1(nn.Module):
    def __init__(self, lm_head, medusa):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator = config.speculator

        path = speculator["path"]
        medusa_config = str(Path(path) / "config.json")

        for fname in speculator["model_paths"]:
            filename = str(Path(path) / fname)

            with open(medusa_config, "r") as f:
                medusa_config = json.load(f)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        medusa = MedusaModel(config, medusa_config, weights)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MedusaHeadV1(lm_head, medusa)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        speculative_logits = self.medusa(input)
        return logits, speculative_logits


class MedusaHeadV2(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator_path = config.speculator["path"]

        medusa_config = str(Path(speculator_path) / "config.json")
        filename = str(Path(speculator_path) / "medusa_lm_head.safetensors")

        with open(medusa_config, "r") as f:
            medusa_config = json.load(f)
        routing = weights.routing
        with safe_open(filename, framework="pytorch") as f:
            for k in f.keys():
                if k in routing and routing[k] != filename:
                    raise RuntimeError(
                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                    )
                routing[k] = filename

        self.n_medusa_heads = get_speculate()

        assert medusa_config["medusa_num_layers"] == 1
        self.linear = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.process_group = weights.process_group
        self.world_size = self.process_group.size()
        self.rank = self.process_group.rank()

        self.act = torch.nn.SiLU()

        self.lm_head = TensorParallelHead.load(config, prefix, weights)

    def forward(self, x):
        # If we have too many tokens, we skip speculative logits
        if x.shape[0] > 128:
            logits = self.lm_head(x)
            return logits, None

        size = x.shape[-1]
        block_size = (size + self.world_size - 1) // self.world_size
        start = self.rank * block_size
        stop = (self.rank + 1) * block_size

        x_block = x[:, start:stop]

        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
        medusa_res = self.act(self.linear(x)).reshape(
            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
        )

        # Apply all residual medusa heads
        output = x[:, start:stop].unsqueeze(-2) + medusa_res

        # Gather medusa heads
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)

        # Stack x and medusa residual x
        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)

        # Compute lm head on x + medusa residual x
        logits = self.lm_head(stacked_x)

        # Finally, split logits from speculative logits
        logits, speculative_logits = torch.split(
            logits, [1, self.n_medusa_heads], dim=-2
        )
        # Squeeze added dimension
        logits = logits.squeeze(-2)

        return logits, speculative_logits
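Note (not part of the diff): the block arithmetic in `MedusaHeadV2.forward` splits the hidden dimension evenly across ranks before the all-gather; a sketch with made-up sizes shows what each rank computes:

# Hypothetical values: hidden size 4096 sharded over 4 ranks, as rank 1.
size, world_size, rank = 4096, 4, 1
block_size = (size + world_size - 1) // world_size   # 1024
start, stop = rank * block_size, (rank + 1) * block_size  # 1024, 2048
# Each rank applies its Medusa residual heads only to x[:, start:stop],
# then all_gather reassembles the full hidden dimension.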
176
server/text_generation_server/layers/mlp.py
Normal file
@@ -0,0 +1,176 @@
import torch
import math
from torch import nn
from torch.nn import functional as F
from typing import Optional, Tuple
from text_generation_server.layers import TensorParallelEmbedding, FastLinear
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.utils.speculate import get_speculate


class MLPSpeculatorLayerNorm(nn.Module):
    """
    A L2 normalization implementation
    ...
    Args
    ----
    normalized_shape : int
        Dimensionality of input data (size of final tensor axis)
    elementwise_scale_weight : torch.Tensor
        learned scaling term after normalization?
    elementwise_shift_bias : torch.Tensor
        learned bias term after normalization?
    eps : float
        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
    """

    def __init__(
        self,
        prefix,
        config,
        weights,
        eps=1e-06,
    ):
        super(MLPSpeculatorLayerNorm, self).__init__()
        self.weight = weights.get_tensor(f"{prefix}.weight")
        self.bias = weights.get_tensor(f"{prefix}.bias")
        self.eps = eps

    def forward(self, x):
        xf = x
        xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
        x = xf.type_as(x)
        x = self.weight * x
        x = x + self.bias
        return x


class MLPSpeculatorModel(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size
        self.emb = nn.ModuleList(
            [
                TensorParallelEmbedding(f"{prefix}.emb.{i}", weights)
                for i in range(self.n_predict)
            ]
        )
        self.proj = [
            FastLinear.load(
                config,
                prefix=f"{prefix}.proj.{i}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_predict)
        ]
        self.head = nn.ModuleList(
            [
                FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False)
                for i in range(self.n_predict)
            ]
        )
        self.ln = nn.ModuleList(
            [
                MLPSpeculatorLayerNorm(
                    prefix=f"{prefix}.ln.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(self.n_predict)
            ]
        )

        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict)
        self.emb_weight = math.sqrt(1 - self.state_weight**2)
        self.activation = nn.GELU()
        # TODO
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        self.top_k_tokens_per_head = [1] * self.n_predict

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        top_k_tokens_per_head = self.top_k_tokens_per_head

        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        ind = input_ids.unsqueeze(0)
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb[i](ind)
            z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2))  # b k d
            state = self.proj[i](state) * self.state_weight + z
            state = self.activation(self.ln[i](state))  # b k d
            probs = F.log_softmax(self.head[i](state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'

            # Update candidate set with new predictions

            # Update distribution set with new logits
            all_probs[:, i] = probs.exp()

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'

        speculative_logits = all_probs
        return speculative_logits


class MLPSpeculatorHead(nn.Module):
    def __init__(self, lm_head, mlp_speculator):
        super().__init__()
        self.lm_head = lm_head
        self.mlp_speculator = mlp_speculator

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        input_ids = logits.argmax(dim=-1)
        speculative_logits = self.mlp_speculator(input, input_ids)
        return logits, speculative_logits

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open

        speculator_path = config.speculator["path"]

        for fname in config.speculator["model_paths"]:
            filename = str(Path(speculator_path) / fname)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        mlp_speculator = MLPSpeculatorModel(config, "speculator", weights)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MLPSpeculatorHead(lm_head, mlp_speculator)
421
server/text_generation_server/layers/rotary.py
Normal file
@@ -0,0 +1,421 @@
import os
import torch
from torch import nn

from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "cuda":
    from flash_attn.layers.rotary import RotaryEmbedding
    import rotary_emb
elif SYSTEM == "rocm":
    from vllm._C import ops
elif SYSTEM == "xpu":
    import intel_extension_for_pytorch as ipex


def _create_inv_freq(dim, base, device):
    inv_freq = 1.0 / (
        base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
    )
    return inv_freq


def _get_rope_config(config):
    if os.getenv("ROPE_SCALING", None) is not None:
        rope_scaling = {
            "type": os.environ["ROPE_SCALING"],
            "factor": float(os.environ["ROPE_FACTOR"]),
        }
        return rope_scaling
    return getattr(config, "rope_scaling", None)


class PositionRotaryEmbedding(nn.Module):
    def __init__(self, inv_freq, scaling_factor):
        super().__init__()
        self.inv_freq = inv_freq
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.scaling_factor = scaling_factor
        self.dynamic_args = None

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
        if SYSTEM == "cuda":
            rotary_dim = cos.shape[-1]
            q1 = query[..., :rotary_dim]
            q2 = query[..., rotary_dim : 2 * rotary_dim]

            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)

            k1 = key[..., :rotary_dim]
            k2 = key[..., rotary_dim : 2 * rotary_dim]

            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        elif SYSTEM == "rocm":
            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773

            head_size = query.shape[-1]

            # Inplace operation, updating query and key.
            ops.rotary_embedding(query, key, head_size, cos, sin, True)
        elif SYSTEM == "xpu":
            ipex.llm.functional.rotary_embedding(
                query, key, sin, cos, query.size(-1), True
            )
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )

    @classmethod
    def static(cls, config, dim, base, device):
        inv_freq = _create_inv_freq(dim, base, device)
        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            if rope_scaling["type"] == "linear":
                pass
            elif rope_scaling["type"] == "dynamic":
                scaling_factor = rope_scaling["factor"]
                return DynamicPositionRotaryEmbedding(
                    dim=dim,
                    max_position_embeddings=config.max_position_embeddings,
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                scaling_factor = rope_scaling["factor"]
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                )
            elif rope_scaling["type"] == "su":
                short_factor = torch.tensor(
                    rope_scaling["short_factor"], dtype=torch.float32, device=device
                )
                short_inv_freq = 1.0 / (
                    short_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )
                long_factor = torch.tensor(
                    rope_scaling["long_factor"], dtype=torch.float32, device=device
                )
                long_inv_freq = 1.0 / (
                    long_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )

                original_max_position_embeddings = (
                    config.original_max_position_embeddings
                )
                max_position_embeddings = config.max_position_embeddings
                if max_position_embeddings <= original_max_position_embeddings:
                    scaling_factor = 1.0
                else:
                    scale = max_position_embeddings / original_max_position_embeddings
                    scaling_factor = math.sqrt(
                        1 + math.log(scale) / math.log(original_max_position_embeddings)
                    )

                return SuRotaryEmbedding(
                    short_inv_freq=short_inv_freq,
                    long_inv_freq=long_inv_freq,
                    scaling_factor=scaling_factor,
                    original_max_position_embeddings=original_max_position_embeddings,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)

    @classmethod
    def load(cls, config, prefix, weights):
        # XXX: Always load this in float32 !
        dtype = weights.dtype
        weights.dtype = torch.float32
        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
        weights.dtype = dtype

        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            scaling_factor = rope_scaling["factor"]
            if rope_scaling["type"] == "linear":
                pass
            elif rope_scaling["type"] == "dynamic":
                return DynamicPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=config.max_position_embeddings,
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)

    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids
        """
        if SYSTEM == "rocm":
            # For RoCm, we always use float cos/sin to avoid a cast.
            # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
            # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
            dtype = torch.float32

        self._update_cos_sin_cache(dtype, position_ids.device, max_s)

        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)

        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
        return cos.unsqueeze(1), sin.unsqueeze(1)


class SuRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        short_inv_freq,
        long_inv_freq,
        scaling_factor,
        original_max_position_embeddings,
    ):
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            if seqlen > self.original_max_position_embeddings:
                inv_freq = self.long_inv_freq
            else:
                inv_freq = self.short_inv_freq
            t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
        inv_freq = _create_inv_freq(dim, base, device)
        super().__init__(inv_freq, scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings:
                newbase = self.base * (
                    (self.scaling_factor * seqlen / self.max_position_embeddings)
                    - (self.scaling_factor - 1)
                ) ** (self.dim / (self.dim - 2))
                self.inv_freq = _create_inv_freq(
                    self.dim, newbase, self.inv_freq.device
                )
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


# Inverse dim formula to find dim based on number of rotations
import math


def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


def get_mscale(scale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * math.log(scale) + 1.0


class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        dim,
        max_position_embeddings,
        base,
        device,
        scaling_factor,
        *,
        extrapolation_factor,
        attn_factor,
        beta_fast,
        beta_slow,
    ):
        inv_freq = _create_inv_freq(dim, base, device)
        super().__init__(inv_freq, scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = float(
            get_mscale(self.scaling_factor) * self.attn_factor
        )  # Get n-d magnitude scaling corrected for interpolation

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings:
                inv_freq_extrapolation = _create_inv_freq(
                    self.dim, self.base, self.inv_freq.device
                )
                freqs = 1.0 / inv_freq_extrapolation
                inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
                low, high = find_correction_range(
                    self.beta_fast,
                    self.beta_slow,
                    self.dim,
                    self.base,
                    self.max_position_embeddings,
                )
                inv_freq_mask = (
                    1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
                ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
                inv_freq = (
                    inv_freq_interpolation * (1 - inv_freq_mask)
                    + inv_freq_extrapolation * inv_freq_mask
                )

                self.inv_freq = inv_freq
                self.mscale = float(
                    get_mscale(self.scaling_factor) * self.attn_factor
                )  # Get n-d magnitude scaling corrected for interpolation

            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
            self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
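Note (not part of the diff): the cos/sin tables built in `_update_cos_sin_cache` above follow the standard RoPE construction; a minimal standalone sketch of `_create_inv_freq` plus the outer product, with illustrative sizes:

import torch

dim, base, seqlen = 64, 10000.0, 8
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
t = torch.arange(seqlen, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)          # shape (seqlen, dim // 2)
cos, sin = torch.cos(freqs), torch.sin(freqs)
# Each position p and frequency index i gets angle p * base**(-2i/dim).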
52
server/text_generation_server/layers/speculative.py
Normal file
@@ -0,0 +1,52 @@
import torch
import json
from typing import Tuple, Optional
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
from text_generation_server.layers.mlp import MLPSpeculatorHead


class SpeculativeHead(torch.nn.Module):
    def __init__(self, lm_head, speculator):
        super().__init__()
        self.head = lm_head
        self.speculator = speculator

    @staticmethod
    def load(config, prefix: str, weights):
        speculator = config.speculator
        if speculator:
            speculator_path = config.speculator["path"]
            speculator_config = str(speculator_path / "config.json")

            with open(speculator_config, "r") as f:
                speculator_config = json.load(f)

            config.speculator_config = speculator_config
            try:
                architecture = speculator_config["architectures"][0]

                if architecture == "MLPSpeculatorPreTrainedModel":
                    speculator = MLPSpeculatorHead.load(config, prefix, weights)
                else:
                    speculator = None
            except KeyError:
                try:
                    speculator = MedusaHeadV1.load(config, prefix, weights)
                except:
                    speculator = MedusaHeadV2(config, prefix, weights)
            lm_head = None
        else:
            lm_head = TensorParallelHead.load(config, prefix, weights)
            speculator = None
        return SpeculativeHead(lm_head, speculator)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if self.speculator is not None:
            return self.speculator(input)

        assert self.head is not None
        logits = self.head(input)
        return logits, None
188
server/text_generation_server/layers/tensor_parallel.py
Normal file
@@ -0,0 +1,188 @@
import torch
from torch.nn import functional as F
from typing import List
from text_generation_server.layers.linear import get_linear, FastLinear


class SuperLayer(torch.nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear

    def forward(self, x):
        return self.linear.forward(x)


class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather

    @staticmethod
    def load(config, prefix: str, weights):
        if weights.process_group.size() > 1:
            try:
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False

        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
        if config.quantize in ["gptq", "awq", "eetq"]:
            quantize = None
        else:
            quantize = config.quantize
        return TensorParallelHead(
            get_linear(weight, bias=None, quantize=quantize),
            process_group=weights.process_group,
            should_gather=should_gather,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)

        world_size = self.process_group.size()
        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
            out_dim = self.linear.weight.shape[0]

            if input.shape[0] == 1:
                world_out = input.new_empty(1, out_dim * world_size)
                local_out = input.new_empty(1, out_dim)
                gather_input = local_out
            else:
                world_out = input.new_empty(out_dim * world_size, input.shape[0])
                gather_input = input.new_empty(out_dim, input.shape[0])
                local_out = gather_input.T

            torch.mm(input, self.linear.weight.T, out=local_out)

            torch.distributed.all_gather_into_tensor(
                world_out, gather_input, group=self.process_group
            )

            if input.shape[0] == 1:
                return world_out
            return world_out.T

        output = super().forward(input)
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)
        return world_output


class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_gate_up(
            prefix, quantize=config.quantize
        )
        if bias:
            raise NotImplementedError("packed_gate_up only implemented without bias")
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)

    @classmethod
    def load_qkv(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize)
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        return cls.load_multi(config, [prefix], weights, bias, dim=0)

    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        weight = weights.get_multi_weights_col(
            prefixes, quantize=config.quantize, dim=dim
        )

        if bias:
            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
            bias = torch.cat(b, dim=dim)
        else:
            bias = None
        linear = get_linear(weight, bias, config.quantize)
        return cls(linear)


class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)

        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias, config.quantize),
            process_group=weights.process_group,
        )

    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1 and reduce:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out


class TensorParallelEmbedding(torch.nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]

        process_group = weights.process_group

        world_size = process_group.size()
        rank = process_group.rank()

        block_size = (num_embeddings + world_size - 1) // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = weight.shape[
            0
        ]  # Usually block_size, might be less in non even vocab_size.
        self.process_group = weights.process_group
        self.reduce = reduce

        """Additional 0 entry used for masking"""
        self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1)))

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
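Note (not part of the diff): `TensorParallelEmbedding` above remaps token ids outside the local vocabulary shard to a padded zero row; a sketch of that remapping with toy values (all numbers are illustrative assumptions):

import torch

min_id, max_id, null_idx = 4, 8, 4        # this rank owns vocabulary ids [4, 8)
input_ids = torch.tensor([2, 4, 7, 9])
local = torch.where(
    (min_id > input_ids) | (input_ids >= max_id),
    null_idx,                             # out-of-shard ids hit the padded zero row
    input_ids - min_id,                   # in-shard ids are shifted to local indices
)
# local == tensor([4, 0, 3, 4]); the all_reduce then sums the shards' partial lookups.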
@@ -1,9 +1,10 @@
 import torch
+import os
 
 from loguru import logger
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.auto import modeling_auto
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, HfApi
 from typing import Optional
 from pathlib import Path
 
@@ -15,6 +16,12 @@ from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM
 from text_generation_server.models.bloom import BLOOM
 from text_generation_server.models.starcoder import StarCoder
+from text_generation_server.models.vlm_causal_lm import VlmCausalLM
+from text_generation_server.models.custom_modeling.llava_next import (
+    LlavaNextForConditionalGeneration,
+)
+
+
 
 from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
 
@@ -40,8 +47,9 @@ def get_model(
     config_dict, _ = PretrainedConfig.get_config_dict(
         model_id, revision=revision, trust_remote_code=trust_remote_code
     )
+    model_type = config_dict.get("model_type", None)
 
-    use_medusa = None
+    speculator = None
     if "medusa_num_heads" in config_dict:
         medusa_model_id = model_id
         medusa_revision = revision
@@ -61,6 +69,8 @@ def get_model(
         config_dict, _ = PretrainedConfig.get_config_dict(
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
+        # Reload model type from parent.
+        model_type = config_dict.get("model_type", None)
         is_local = Path(medusa_model_id).exists()
         if not is_local:
             medusa_config = hf_hub_download(
@@ -71,11 +81,70 @@ def get_model(
                 revision=medusa_revision,
                 filename="medusa_lm_head.safetensors",
             )
-            use_medusa = Path(medusa_config).parent
+            speculator = {
+                "path": Path(medusa_config).parent,
+                "model_paths": ["medusa_lm_head.safetensors"],
+            }
         else:
-            use_medusa = Path(medusa_model_id)
+            speculator = {
+                "path": Path(medusa_model_id),
+                "model_paths": ["medusa_lm_head.safetensors"],
+            }
 
         method = "medusa"
+    elif model_type == "mlp_speculator":
+        mlp_model_id = model_id
+        mlp_revision = revision
+        model_id = config_dict["base_model_name_or_path"]
+        revision = "main"
+        speculate_mlp = config_dict["n_predict"]
+        if speculate is not None:
+            if speculate > speculate_mlp:
+                raise RuntimeError(
+                    f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match"
+                )
+            else:
+                set_speculate(speculate)
+        else:
+            set_speculate(speculate_mlp)
+
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+        # Reload model type from parent.
+        model_type = config_dict.get("model_type", None)
+        is_local = Path(mlp_model_id).exists()
+        extension = ".safetensors"
+        if not is_local:
+            mlp_speculator_config = hf_hub_download(
+                mlp_model_id, revision=mlp_revision, filename="config.json"
+            )
+            api = HfApi()
+            info = api.model_info(mlp_model_id, revision=mlp_revision)
+            filenames = [
+                s.rfilename
+                for s in info.siblings
+                if s.rfilename.endswith(extension)
+                and len(s.rfilename.split("/")) == 1
+                and "arguments" not in s.rfilename
+                and "args" not in s.rfilename
+                and "training" not in s.rfilename
+            ]
+            for filename in filenames:
+                hf_hub_download(
+                    mlp_model_id,
+                    revision=mlp_revision,
+                    filename=filename,
+                )
+            speculator = {
+                "path": Path(mlp_speculator_config).parent,
+                "model_paths": filenames,
+            }
+        else:
+            speculator = Path(mlp_model_id)
+            filenames = [p for p in os.listdir(speculator) if p.endswith(extension)]
+            speculator = {"path": speculator, "model_paths": filenames}
+        method = "mlp_speculator"
     else:
         method = "n-gram"
 
@@ -92,7 +161,18 @@ def get_model(
         return BLOOM(
             model_id,
             revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
+        )
+
+    if model_type == "llava_next":
+        return VlmCausalLM(
+            model_class=LlavaNextForConditionalGeneration,
+            model_id=model_id,
+            revision=revision,
+            quantize=None,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
@@ -101,7 +181,7 @@ def get_model(
         return CausalLM(
             model_id,
             revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
||||||
|
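For orientation, a hedged sketch of the shape of the `speculator` dictionary introduced in this hunk, plus a hypothetical consumer; `resolve_speculator_files` is illustrative only and not a TGI helper.

from pathlib import Path
from typing import List, TypedDict

class Speculator(TypedDict):
    path: Path              # directory holding the speculator weights
    model_paths: List[str]  # safetensors files inside that directory

def resolve_speculator_files(speculator: Speculator) -> List[Path]:
    # Downstream loaders would join the directory with each listed file.
    return [speculator["path"] / name for name in speculator["model_paths"]]

example: Speculator = {"path": Path("/data/medusa"), "model_paths": ["medusa_lm_head.safetensors"]}
print(resolve_speculator_files(example))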
@@ -35,14 +35,14 @@ class BLOOM(CausalLM):
         self,
         model_id: str,
         revision: Optional[str] = None,
-        use_medusa: Optional[str] = None,
+        speculator: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
         super(BLOOM, self).__init__(
             model_id=model_id,
             revision=revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
@@ -2,7 +2,7 @@ import math
 import torch
 
 from typing import Optional, List, Tuple
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
 BLOCK_SIZE: int = 16
 # Will be set in warmup
@@ -25,7 +25,7 @@ class CacheManager:
         self.repeat_slots = repeat_slots
 
         element_size = torch.tensor([], dtype=dtype).element_size()
-        if IS_XPU_SYSTEM:
+        if SYSTEM == "xpu":
             x = 1
         else:
             x = self.block_size // element_size
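A quick numeric check of the `x` computed in the hunk above (BLOCK_SIZE // element_size, forced to 1 on XPU); the dtypes listed are assumptions for illustration, not part of the commit.

import torch

BLOCK_SIZE = 16
for dtype in (torch.float16, torch.bfloat16, torch.float32):
    element_size = torch.tensor([], dtype=dtype).element_size()
    print(dtype, BLOCK_SIZE // element_size)  # 8, 8, 4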
@@ -367,6 +367,7 @@ class CausalLMBatch(Batch):
         input_lengths = [b.input_length for b in batches]
         max_input_length = max(input_lengths)
         offsets = [max_input_length - b.input_length for b in batches]
+
         cur_padding = [b.right_padding for b in batches]
         # For prefill there is a space allocated only for first token
         # Need to add padding to the max total tokens before first decode
@@ -596,13 +597,15 @@ class CausalLM(Model):
         self,
         model_id: str,
         revision: Optional[str] = None,
-        use_medusa: Optional[str] = None,
+        speculator: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
+
+        if speculator:
+            raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+
         self.prev_bs = 0
-        if use_medusa:
-            raise RuntimeError("Medusa decoding is not enabled for AutoModel")
 
         # Create tokenizer
         tokenizer = AutoTokenizer.from_pretrained(
@@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers import BloomConfig, PreTrainedModel
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
@@ -15,7 +15,7 @@ from transformers.modeling_outputs import (
 )
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
@@ -26,18 +26,22 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM
-from text_generation_server.utils.layers import (
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
-if IS_CUDA_SYSTEM:
+if SYSTEM == "cuda":
     import dropout_layer_norm
 else:
     dropout_layer_norm = None
@@ -52,7 +56,7 @@ class CohereRotary(PositionRotaryEmbedding):
         sin: torch.Tensor,
     ):
         # Such controlflows may add some overhead.
-        if IS_CUDA_SYSTEM:
+        if SYSTEM == "cuda":
             import rotary_emb
 
             q1 = query[..., ::2]
@@ -64,8 +68,8 @@ class CohereRotary(PositionRotaryEmbedding):
             k2 = key[..., 1::2]
 
             rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
-        elif IS_ROCM_SYSTEM:
-            from vllm import pos_encoding_ops
+        elif SYSTEM == "rocm":
+            from vllm._C import ops
 
             # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
             # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
@@ -73,7 +77,7 @@ class CohereRotary(PositionRotaryEmbedding):
             head_size = query.shape[-1]
 
             # Inplace operation, updating query and key.
-            pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, False)
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
         else:
             raise ValueError(
                 "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
@@ -90,7 +94,7 @@ class CohereLayerNorm(nn.Module):
         self.eps = eps
 
     def forward(self, hidden_states):
-        if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
+        if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm":
             hidden_states = hidden_states.reshape(
                 -1, self.weight.shape[0], self.weight.shape[1]
             )
@@ -21,21 +21,26 @@ from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
 from loguru import logger
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
     from vllm.model_executor.layers.fused_moe import fused_moe
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
-    FastLayerNorm,
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
 from text_generation_server.utils.log import log_once
 
+
@@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights):
 
     bits, groupsize, desc_act, quant_method = weights._get_gptq_params()
 
-    from text_generation_server.utils.layers import HAS_EXLLAMA
+    from text_generation_server.layers import HAS_EXLLAMA
 
     use_exllama = (
         bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act
@@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights):
         log_once(
             logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
         )
-        from text_generation_server.utils.awq.conversion_utils import (
+        from text_generation_server.layers.awq.conveersion_utils import (
             fast_awq_to_gptq,
         )
 
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
@@ -97,8 +99,13 @@ class GemmaConfig(PretrainedConfig):
 class GemmaFastRMSNorm(FastRMSNorm):
     @classmethod
     def load(cls, prefix, weights, eps=1e-6):
+        dtype = weights.dtype
+        weights.dtype = torch.float32
         weight = weights.get_tensor(f"{prefix}.weight") + 1
-        return cls(weight, eps)
+        weights.dtype = dtype
+        new = cls(weight, eps)
+        new.dtype = dtype
+        return new
 
     # perform the multiplication in full precision and downcast after
     def forward(self, hidden_states, residual=None):
@@ -109,7 +116,7 @@ class GemmaFastRMSNorm(FastRMSNorm):
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
         hidden_states = hidden_states * self.weight
-        return hidden_states.to(self.weight.dtype), residual
+        return hidden_states.to(self.dtype), residual
 
 
 def load_attention(config, prefix, weights):
@@ -151,15 +158,11 @@ def _load_gqa(config, prefix: str, weights):
 
 
 class FlashGemmaAttention(torch.nn.Module):
-    def __init__(
-        self,
-        prefix: str,
-        config,
-        weights,
-    ):
+    def __init__(self, prefix: str, config, weights, causal: bool):
         super().__init__()
         self.num_heads = config.num_attention_heads
         self.head_size = config.head_dim
+        self.causal = causal
 
         self.rotary_emb = PositionRotaryEmbedding.static(
             config=config,
@@ -236,6 +239,7 @@ class FlashGemmaAttention(torch.nn.Module):
                 cu_seqlen_prefill,
                 max_s,
                 self.softmax_scale,
+                causal=self.causal,
             )
         # Decode
         else:
@@ -293,11 +297,10 @@ class GemmaMLP(nn.Module):
 
 
 class FlashGemmaLayer(nn.Module):
-    def __init__(self, layer_id, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
         super().__init__()
-        prefix = f"model.layers.{layer_id}"
         self.self_attn = FlashGemmaAttention(
-            prefix=f"{prefix}.self_attn", config=config, weights=weights
+            prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal
         )
         self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
 
@@ -349,30 +352,25 @@ class FlashGemmaLayer(nn.Module):
 
 
 class FlashGemmaModel(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
         super().__init__()
 
         process_group = weights.process_group
         self.tp_rank = process_group.rank()
         self.tp_world_size = process_group.size()
-        embed_norm = config.hidden_size**0.5
-        self.embed_tokens = TensorParallelEmbedding(
-            prefix="model.embed_tokens", weights=weights
-        )
-        self.embed_tokens.weight *= embed_norm
-
         self.layers = nn.ModuleList(
             [
                 FlashGemmaLayer(
-                    layer_id,
-                    config,
-                    weights,
+                    prefix=f"{prefix}.layers.{layer_id}",
+                    config=config,
+                    weights=weights,
+                    causal=causal,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
         self.norm = GemmaFastRMSNorm.load(
-            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
         )
 
         self.gradient_checkpointing = False
@@ -383,7 +381,7 @@ class FlashGemmaModel(torch.nn.Module):
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
         position_ids: torch.Tensor,
         cu_seqlen_prefill: Optional[torch.Tensor],
         kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
@@ -392,7 +390,7 @@ class FlashGemmaModel(torch.nn.Module):
         input_lengths: torch.Tensor,
         max_s: int,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
 
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
@@ -421,13 +419,30 @@ class FlashGemmaModel(torch.nn.Module):
 
 
 class FlashGemmaForCausalLM(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
         super().__init__()
 
-        self.model = FlashGemmaModel(config, weights)
+        embed_norm = config.hidden_size**0.5
+        if prefix is None:
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.embed_tokens.weight *= embed_norm
+
+        self.model = FlashGemmaModel(
+            prefix=prefix, config=config, weights=weights, causal=causal
+        )
         self.lm_head = SpeculativeHead.load(
-            config,
-            prefix="model.embed_tokens" if config.tie_word_embeddings else "lm_head",
+            prefix=(
+                f"{prefix}.embed_tokens"
+                if config.tie_word_embeddings
+                else f"{prefix}.lm_head"
+            ),
+            config=config,
             weights=weights,
         )
 
@@ -443,8 +458,9 @@ class FlashGemmaForCausalLM(torch.nn.Module):
         max_s: int,
         lm_head_indices: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_embeds = self.embed_tokens(input_ids)
         hidden_states = self.model(
-            input_ids,
+            input_embeds,
             position_ids,
             cu_seqlen_prefill,
             kv_cache,
@@ -0,0 +1,454 @@
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.utils import paged_attention, flash_attn
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)


def load_qkv(config, prefix: str, weights, head_size, num_heads):
    if config.quantize == "gptq":
        return _load_qkv_gptq(
            config,
            prefix,
            weights,
        )
    else:
        return _load_qkv(config, prefix, weights, head_size, num_heads)


def _load_qkv_gptq(config, prefix: str, weights):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    # Weights
    weight = weights.get_weights_col_packed_qkv(f"{prefix}.c_attn", config.quantize)

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    bias = torch.cat(tensors, dim=0)
    bias = bias.to(device=weights.device)

    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))


def _load_qkv(config, prefix: str, weights, head_size, num_heads):
    """Load QKV from a single, transposed matrix."""

    slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
    shape = slice_.get_shape()
    total_size = shape[1]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    world_size = weights.process_group.size()
    single_size = total_size // 3
    assert single_size % world_size == 0
    rank = weights.process_group.rank()

    # Weights
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[:, start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    weight = torch.cat(tensors, dim=1).T
    weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    single_size = total_size // 3
    block_size = single_size // world_size
    assert single_size % world_size == 0
    start = rank * block_size
    stop = (rank + 1) * block_size
    b = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        b.append(tensor)
    bias = torch.cat(b, dim=0)
    bias = bias.to(dtype=weights.dtype)
    bias = bias.to(device=weights.device)
    assert list(bias.shape) == [
        3 * num_heads * head_size
    ], f"{weight.shape} != {[3 * num_heads * head_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))


def load_row(config, prefix: str, weights, bias: bool):
    """load_row, but with transposed weight matrices."""

    if config.quantize == "gptq":
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    return TensorParallelRowLinear(
        get_linear(weight, bias, config.quantize), process_group=weights.process_group
    )


def load_col(config, prefix: str, weights, bias: bool):
    """load_col, but with transposed weight matrices."""
    if config.quantize == "gptq":
        weight = weights.get_multi_weights_col(
            [prefix], quantize=config.quantize, dim=1
        )
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T

    if bias:
        bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))


class FlashGPT2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            head_size=self.head_size,
            num_heads=self.num_heads,
        )

        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        query, key, value = self.query_key_value(hidden_states).split(
            self.head_size * self.num_heads, dim=1
        )
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_heads, self.head_size)
        value = value.view(-1, self.num_heads, self.head_size)

        paged_attention.reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)

        # output tensor
        attn_output = torch.empty_like(query)

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            flash_attn.attention(
                query,
                key,
                value,
                attn_output,
                cu_seqlen_prefill,
                max_s,
                self.softmax_scale,
            )
        # Decode
        else:
            paged_attention.attention(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                max_s,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GPT2MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        intermediate_size = (
            config.n_inner if config.n_inner is not None else 4 * config.hidden_size
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states)


class FlashGPT2Layer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.self_attn = FlashGPT2Attention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_2",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )

        hidden_states = attn_output + residual
        residual = hidden_states

        hidden_states = self.post_attention_layernorm(hidden_states)

        mlp_output = self.mlp(hidden_states)

        return residual + mlp_output, residual


class FlashGPT2Model(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                FlashGPT2Layer(
                    prefix=(
                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
                    ),
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.norm = nn.LayerNorm.load(
            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                input_lengths,
                max_s,
            )

        hidden_states = self.norm(hidden_states)

        return hidden_states


class FlashGPT2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=("wte" if not prefix else f"{prefix}.wte"),
            weights=weights,
        )
        self.embed_positions = TensorParallelEmbedding(
            prefix=("wpe" if not prefix else f"{prefix}.wpe"),
            weights=weights,
        )

        self.model = FlashGPT2Model(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="wte" if not prefix else f"{prefix}.wte",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        token_embeds = self.embed_tokens(input_ids)
        position_embeds = self.embed_positions(position_ids)
        inputs_embeds = token_embeds + position_embeds
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits
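A standalone sketch of the column sharding that `_load_qkv` in the new file performs on the prepacked, transposed `c_attn` matrix: Q, K and V sit side by side, and each tensor-parallel rank takes the same column block from each of the three segments. The helper name and shapes are assumptions for illustration, not library code.

import torch

def shard_prepacked_qkv(weight: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    total = weight.shape[1]
    assert total % 3 == 0
    single = total // 3
    assert single % world_size == 0
    block = single // world_size
    start, stop = rank * block, (rank + 1) * block
    # Same column slice from the Q, K and V segments, then transpose like the GPT-2 Conv1D layout.
    parts = [weight[:, start + i * single : stop + i * single] for i in range(3)]
    return torch.cat(parts, dim=1).T

w = torch.randn(8, 3 * 8)  # hidden=8, fused QKV
print(shard_prepacked_qkv(w, rank=0, world_size=2).shape)  # torch.Size([12, 8])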
@@ -18,43 +18,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed
 
 from torch import nn
 from transformers.activations import ACT2FN
-from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
-    get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
+if SYSTEM == "rocm":
+    try:
+        from vllm import _custom_C
+    except Exception as e:
+        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
+
+
 def load_attention(config, prefix, weights):
+    bias = config.attention_bias
     if config.num_attention_heads != config.num_key_value_heads:
-        return _load_gqa(config, prefix, weights)
+        return TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=bias,
+        )
     else:
         if config.model_type == "baichuan":
             return TensorParallelColumnLinear.load_qkv(
                 config,
                 prefix=f"{prefix}.W_pack",
                 weights=weights,
-                bias=False,
+                bias=bias,
             )
         elif config.model_type == "phi3":
             return TensorParallelColumnLinear.load_qkv(
                 config,
                 prefix=f"{prefix}.qkv_proj",
                 weights=weights,
-                bias=False,
+                bias=bias,
            )
         else:
             return TensorParallelColumnLinear.load_multi(
@@ -62,36 +78,10 @@ def load_attention(config, prefix, weights):
                 prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                 dim=0,
                 weights=weights,
-                bias=False,
+                bias=bias,
             )
 
 
-def _load_gqa(config, prefix: str, weights):
-    assert config.hidden_size % config.num_attention_heads == 0
-    assert config.num_attention_heads % weights.process_group.size() == 0
-
-    weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0,
-    )
-
-    if config.quantize not in ["gptq", "awq"]:
-        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
-
-    head_size = config.hidden_size // config.num_attention_heads
-    num_heads = config.num_attention_heads // weights.process_group.size()
-    num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
-    assert list(weight.shape) == [
-        (num_heads + 2 * num_key_value_heads) * head_size,
-        config.hidden_size,
-    ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
-
-    return TensorParallelColumnLinear(
-        get_linear(weight, bias=None, quantize=config.quantize)
-    )
-
-
 class FlashLlamaAttention(torch.nn.Module):
     def __init__(
         self,
@@ -200,24 +190,27 @@ class FlashLlamaAttention(torch.nn.Module):
 class LlamaMLP(nn.Module):
     def __init__(self, prefix, config, weights):
         super().__init__()
-        act = config.hidden_act
+        self.hidden_act = config.hidden_act
         self.act = (
-            ACT2FN[act]
-            if "gelu" not in act
+            ACT2FN[self.hidden_act]
+            if "gelu" not in self.hidden_act
             else lambda x: torch.nn.functional.gelu(
                 x,
                 approximate=(
-                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                    "tanh"
+                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
+                    else "none"
                 ),
             )
         )
         # Fuse gate and up proj
+        bias = getattr(config, "mlp_bias", False)
         if config.model_type == "phi3":
             self.gate_up_proj = TensorParallelColumnLinear.load_gate_up(
                 config,
                 prefix=f"{prefix}.gate_up_proj",
                 weights=weights,
-                bias=False,
+                bias=bias,
             )
         else:
             self.gate_up_proj = TensorParallelColumnLinear.load_multi(
@@ -225,22 +218,40 @@ class LlamaMLP(nn.Module):
                 prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
                 weights=weights,
                 dim=0,
-                bias=False,
+                bias=bias,
             )
         self.down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=bias,
         )
         self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
         )
 
+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
     def forward(self, hidden_states):
-        gate_up_states = self.gate_up_proj(hidden_states)
-        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
-        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+        if (
+            SYSTEM == "rocm"
+            and self.hidden_act == "silu"
+            and hidden_states.shape[0] == 1
+            and not self.quantize
+        ):
+            out = torch.empty(
+                hidden_states.shape[0],
+                self.intermediate_size,
+                dtype=hidden_states.dtype,
+                device="cuda",
+            )
+            _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
+            return self.down_proj(out)
+        else:
+            gate_up_states = self.gate_up_proj(hidden_states)
+            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+            return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 
 
 class FlashLlamaLayer(nn.Module):
@@ -383,9 +394,14 @@ class FlashLlamaForCausalLM(torch.nn.Module):
             weights=weights,
         )
         self.model = FlashLlamaModel(prefix, config, weights)
+        if config.tie_word_embeddings:
+            suffix = "model.embed_tokens"
+        else:
+            suffix = "lm_head"
+
         self.lm_head = SpeculativeHead.load(
             config,
-            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
+            prefix=suffix if not prefix else f"{prefix}.suffix",
             weights=weights,
         )
 
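For comparison with the fused ROCm `LLMM_Silu` path added above, a minimal sketch of the unfused SwiGLU branch, down_proj(silu(gate) * up), computed from the fused gate/up projection; the layer sizes are illustrative assumptions, not the model's.

import torch
import torch.nn.functional as F

hidden, intermediate, tokens = 16, 32, 4
gate_up_proj = torch.nn.Linear(hidden, 2 * intermediate, bias=False)
down_proj = torch.nn.Linear(intermediate, hidden, bias=False)

x = torch.randn(tokens, hidden)
gate_up = gate_up_proj(x).view(-1, 2, intermediate)  # split fused output into gate and up halves
out = down_proj(F.silu(gate_up[:, 0]) * gate_up[:, 1])
print(out.shape)  # torch.Size([4, 16])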
@@ -26,18 +26,28 @@ from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
+
+if SYSTEM == "rocm":
+    try:
+        from vllm import _custom_C
+    except Exception as e:
+        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
+
+
 class MistralConfig(PretrainedConfig):
     model_type = "mistral"
 
@@ -249,14 +259,16 @@ class MistralAttention(torch.nn.Module):
 class MistralMLP(nn.Module):
     def __init__(self, prefix, config, weights):
         super().__init__()
-        act = config.hidden_act
+        self.hidden_act = config.hidden_act
         self.act = (
-            ACT2FN[act]
-            if "gelu" not in act
+            ACT2FN[self.hidden_act]
+            if "gelu" not in self.hidden_act
             else lambda x: torch.nn.functional.gelu(
                 x,
                 approximate=(
-                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                    "tanh"
+                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
+                    else "none"
                 ),
             )
         )
@@ -278,10 +290,28 @@ class MistralMLP(nn.Module):
             config.intermediate_size // weights.process_group.size()
         )
 
+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
     def forward(self, hidden_states):
-        gate_up_states = self.gate_up_proj(hidden_states)
-        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
-        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+        if (
+            SYSTEM == "rocm"
+            and self.hidden_act == "silu"
+            and hidden_states.shape[0] == 1
+            and not self.quantize
+        ):
+            out = torch.empty(
+                hidden_states.shape[0],
+                self.intermediate_size,
+                dtype=hidden_states.dtype,
+                device="cuda",
+            )
+            _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
+            return self.down_proj(out)
+        else:
+            gate_up_states = self.gate_up_proj(hidden_states)
+            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+            return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 
 
 class MistralLayer(nn.Module):
@@ -24,9 +24,9 @@ import torch.distributed
 import numpy as np
 
 from torch import nn
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
@@ -34,16 +34,20 @@ from typing import Optional, List, Tuple
 from loguru import logger
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
-    FastRMSNorm,
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 class MixtralConfig(PretrainedConfig):
@@ -29,15 +29,19 @@ from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     SpeculativeHead,
-    FastLayerNorm,
-    PositionRotaryEmbedding,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 def load_row(config, prefix: str, weights, bias: bool):
@@ -0,0 +1,110 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
+from text_generation_server.models.custom_modeling.vlm import (
+    load_text_model,
+    load_vision_model,
+)
+
+
+class PaliGemmaForConditionalGeneration(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        config.vision_config.quantize = config.quantize
+        self.vision_tower = load_vision_model(
+            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
+            config=config.vision_config,
+            weights=weights,
+        )
+
+        self.multi_modal_projector = TensorParallelColumnLinear.load(
+            config,
+            prefix="multi_modal_projector.linear",
+            weights=weights,
+            bias=True,
+        )
+
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        text_config = config.text_config
+        text_config.speculator = config.speculator
+        text_config.quantize = config.quantize
+        self.text_model = load_text_model(
+            prefix="language_model" if not prefix else f"{prefix}.language_model",
+            config=config.text_config,
+            weights=weights,
+        )
+        self.pad_token_id = (
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,
+        lm_head_indices: Optional[torch.Tensor] = None,
+        pixel_values: torch.FloatTensor = None,
+        # Unused here
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        inputs_embeds = self.text_model.embed_tokens(input_ids)
+        # TODO This is odd but apparently pali gemma position ids start at 1.
+        if cu_seqlen_prefill is not None:
+            max_s += 1
+            position_ids += 1
+
+        if pixel_values is not None:
+            pixel_values = pixel_values.to(dtype=inputs_embeds.dtype)
+            image_outputs = self.vision_tower(pixel_values)
+            image_features = self.multi_modal_projector(image_outputs.last_hidden_state)
+
+            # mask where image or padding tokens
+            mask = input_ids == self.config.image_token_index
+
+            # insert image features into input embeddings
+            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+
+        hidden_states = self.text_model.model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            cu_seqlen_prefill=cu_seqlen_prefill,
+            kv_cache=kv_cache,
+            block_tables=block_tables,
+            slots=slots,
+            input_lengths=input_lengths,
+            max_s=max_s,
+        )
+
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.text_model.lm_head(hidden_states)
+
+        return logits, speculative_logits
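The new PaliGemma module above merges image embeddings into the text embedding sequence by overwriting the positions that hold image tokens. A self-contained sketch of that merge, with made-up shapes and a made-up `image_token_index` (the real value comes from the model config):

```python
# Hedged sketch of the embedding merge in forward() above: positions whose
# token id equals image_token_index get their text embedding replaced by the
# corresponding projected image feature. All shapes here are invented.
import torch

hidden_size = 8
image_token_index = 3                           # assumption for this sketch
input_ids = torch.tensor([5, 3, 3, 7])          # two image tokens at positions 1 and 2
inputs_embeds = torch.zeros(4, hidden_size)     # stand-in for embed_tokens(input_ids)
image_features = torch.ones(1, 2, hidden_size)  # stand-in for the projector output

mask = input_ids == image_token_index
inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])

assert inputs_embeds[1].eq(1).all() and inputs_embeds[0].eq(0).all()
```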
@@ -7,15 +7,19 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 class PhiConfig(PretrainedConfig):
@@ -6,13 +6,15 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
@@ -1,22 +1,21 @@
+from typing import List, Optional, Tuple
+
 import torch
 import torch.distributed
-
 from torch import nn
-from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_utils import PretrainedConfig
-from typing import Optional, List, Tuple
+from transformers.modeling_utils import PreTrainedModel
 
-from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
-    TensorParallelRowLinear,
+from text_generation_server.layers import (
+    SpeculativeHead,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    SpeculativeHead,
-    FastLayerNorm,
-    PositionRotaryEmbedding,
+    TensorParallelRowLinear,
     get_linear,
 )
+from text_generation_server.layers.layernorm import FastLayerNorm
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.utils import flash_attn, paged_attention
 
 
 def load_row(config, prefix: str, weights, bias: bool):
@@ -48,6 +47,7 @@ class RWConfig(PretrainedConfig):
         hidden_size=64,
         num_hidden_layers=None,
         num_attention_heads=None,
+        num_ln_in_prallel_attention=None,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         use_cache=True,
@@ -61,6 +61,7 @@ class RWConfig(PretrainedConfig):
         new_decoder_architecture=None,
         bias=False,
         parallel_attn=False,
+        rope_theta=10_000.0,
         **kwargs,
     ):
         if alibi:
@@ -71,6 +72,7 @@ class RWConfig(PretrainedConfig):
         self.model_type = model_type
         self.alibi = False
         self.rotary = True
+        self.rope_theta = rope_theta
 
         self.vocab_size = vocab_size
         # Backward compatibility with n_embed kwarg
@@ -87,6 +89,7 @@ class RWConfig(PretrainedConfig):
             else kwargs.pop("n_head", 8)
         )
         self.layer_norm_epsilon = layer_norm_epsilon
+        self.num_ln_in_parallel_attention = num_ln_in_prallel_attention
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.hidden_dropout = hidden_dropout
@@ -128,9 +131,13 @@ class FlashRWAttention(torch.nn.Module):
         self.num_heads_kv = config.n_head_kv
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads
+        self.rope_theta = config.rope_theta
 
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=self.rope_theta,
+            device=weights.device,
         )
         self.softmax_scale = self.head_size ** (-0.5)
 
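This hunk and the next stop hard-coding `base=10000.0` and read `config.rope_theta` instead (added to `RWConfig` above with a `10_000.0` default), so Falcon checkpoints that ship a different rotary base are honored. A short sketch of why the base matters, using the standard RoPE inverse-frequency formula; this is an illustration, not TGI's `PositionRotaryEmbedding` code:

```python
# Standard RoPE inverse frequencies; a larger base stretches the rotation
# periods, which is how longer-context checkpoints are usually trained.
import torch


def rope_inv_freq(dim: int, base: float) -> torch.Tensor:
    return 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))


print(rope_inv_freq(8, 10_000.0))   # previous hard-coded default
print(rope_inv_freq(8, 500_000.0))  # e.g. a checkpoint with a larger rope_theta
```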
@@ -240,9 +247,13 @@ class FlashRWLargeAttention(torch.nn.Module):
         self.hidden_size = hidden_size
         self.head_size = hidden_size // num_heads
         self.num_groups = num_groups
+        self.rope_theta = config.rope_theta
 
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=self.rope_theta,
+            device=weights.device,
         )
         self.softmax_scale = self.head_size ** (-0.5)
 
@@ -253,7 +264,7 @@ class FlashRWLargeAttention(torch.nn.Module):
 
         if process_group.size() > self.num_groups:
             raise NotImplementedError(
-                f"Tensor Parallelism is not implemented for world_size > n groups"
+                "Tensor Parallelism is not implemented for world_size > n groups"
             )
         if self.num_groups % process_group.size() != 0:
             raise NotImplementedError(
@@ -455,29 +466,61 @@ class FlashRWLayer(nn.Module):
                 max_s,
             )
 
-            hidden_states, residual = self.post_attention_layernorm(
-                hidden_states, residual
-            )
+            if self.post_attention_layernorm is not None:
+                hidden_states, residual = self.post_attention_layernorm(
+                    hidden_states, residual
+                )
 
             mlp_output = self.mlp(hidden_states)
 
             return mlp_output, residual
 
 
+class FlashRWLayerNorm(nn.Module):
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        self.num_ln = config.num_ln_in_parallel_attn
+
+        if self.num_ln == 1:
+            self.input_ln = FastLayerNorm.load(
+                prefix=f"{prefix}.input_layernorm",
+                weights=weights,
+                eps=config.layer_norm_epsilon,
+            )
+        elif self.num_ln == 2:
+            self.ln_attn = FastLayerNorm.load(
+                prefix=f"{prefix}.ln_attn",
+                weights=weights,
+                eps=config.layer_norm_epsilon,
+            )
+            self.ln_mlp = FastLayerNorm.load(
+                prefix=f"{prefix}.ln_mlp",
+                weights=weights,
+                eps=config.layer_norm_epsilon,
+            )
+        else:
+            raise ValueError("Number of layer norms can either be 1 or 2.")
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+    ):
+        if self.num_ln == 1:
+            ln_hidden_states, residual = self.input_ln(hidden_states, residual)
+            return ln_hidden_states, ln_hidden_states, residual
+        elif self.num_ln == 2:
+            ln_attn, residual = self.ln_attn(hidden_states, residual)
+            ln_mlp, _ = self.ln_mlp(residual)
+            return ln_attn, ln_mlp, residual
+
+
 class FlashRWLargeLayer(nn.Module):
     def __init__(self, layer_id, config, weights):
         super().__init__()
         prefix = f"transformer.h.{layer_id}"
-        self.ln_attn = FastLayerNorm.load(
-            prefix=f"{prefix}.ln_attn",
-            weights=weights,
-            eps=config.layer_norm_epsilon,
-        )
-        self.ln_mlp = FastLayerNorm.load(
-            prefix=f"{prefix}.ln_mlp",
-            weights=weights,
-            eps=config.layer_norm_epsilon,
-        )
+
+        self.ln_layer = FlashRWLayerNorm(config, prefix, weights)
 
         self.self_attention = FlashRWLargeAttention(
             config,
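The new `FlashRWLayerNorm` wrapper above folds the one-layer-norm and two-layer-norm Falcon variants behind a single call that always returns `(ln_attn, ln_mlp, residual)`. A framework-free sketch of the same dispatch, with plain `nn.LayerNorm` standing in for `FastLayerNorm` (which additionally fuses the residual add):

```python
# Hedged sketch of the dispatch: one or two layer norms behind one interface
# that always yields (ln_attn, ln_mlp, residual). Not TGI's FastLayerNorm.
import torch
from torch import nn


class TinyRWLayerNorm(nn.Module):
    def __init__(self, num_ln: int, hidden: int, eps: float = 1e-5):
        super().__init__()
        if num_ln not in (1, 2):
            raise ValueError("Number of layer norms can either be 1 or 2.")
        self.num_ln = num_ln
        self.input_ln = nn.LayerNorm(hidden, eps=eps)
        self.ln_mlp = nn.LayerNorm(hidden, eps=eps) if num_ln == 2 else None

    def forward(self, hidden_states, residual):
        # Fold the residual add in, mimicking FastLayerNorm's fused behaviour.
        if residual is not None:
            hidden_states = hidden_states + residual
        residual = hidden_states
        ln_attn = self.input_ln(hidden_states)
        if self.num_ln == 1:
            return ln_attn, ln_attn, residual
        return ln_attn, self.ln_mlp(residual), residual


ln = TinyRWLayerNorm(num_ln=2, hidden=16)
ln_attn, ln_mlp, residual = ln(torch.randn(2, 16), None)
```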
@@ -503,8 +546,8 @@ class FlashRWLargeLayer(nn.Module):
         input_lengths,
         max_s,
     ):
-        ln_attn, residual = self.ln_attn(hidden_states, residual)
-        ln_mlp, _ = self.ln_mlp(residual)
+        # Layer norm.
+        ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual)
 
         # Self attention.
         attn_output = self.self_attention(
@@ -6,14 +6,16 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     SpeculativeHead,
     TensorParallelEmbedding,
-    FastLayerNorm,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
 
 
 def load_multi_mqa(
|
|||||||
g_idx = g_idx.to(device=weights.device)
|
g_idx = g_idx.to(device=weights.device)
|
||||||
elif quant_method == "awq":
|
elif quant_method == "awq":
|
||||||
g_idx = None
|
g_idx = None
|
||||||
from text_generation_server.utils.awq.conversion_utils import (
|
from text_generation_server.layers.awq.conversion_utils import (
|
||||||
fast_awq_to_gptq,
|
fast_awq_to_gptq,
|
||||||
)
|
)
|
||||||
|
|
||||||
qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
|
qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
|
||||||
|
|
||||||
from text_generation_server.utils.layers import HAS_EXLLAMA
|
from text_generation_server.layers.gptq import HAS_EXLLAMA
|
||||||
|
|
||||||
use_exllama = HAS_EXLLAMA
|
use_exllama = HAS_EXLLAMA
|
||||||
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
|
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
|
||||||
|
@@ -27,15 +27,19 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
-    FastRMSNorm,
+)
+from text_generation_server.layers.layernorm import (
     FastLayerNorm,
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
 )
 
 
@@ -29,7 +29,7 @@ from text_generation_server.models.custom_modeling.vlm import (
 )
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
@@ -683,9 +683,9 @@ class Idefics2ForConditionalGeneration(nn.Module):
     def __init__(self, prefix, config, weights):
         super().__init__()
         config.vision_config.quantize = config.quantize
-        config.vision_config.use_medusa = config.use_medusa
+        config.vision_config.speculator = config.speculator
         config.text_config.quantize = config.quantize
-        config.text_config.use_medusa = config.use_medusa
+        config.text_config.speculator = config.speculator
 
         vision_config = config.vision_config
         self.text_model = load_text_model(
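The last hunk is part of the broader `use_medusa` to `speculator` rename: the parent config's `speculator` and `quantize` settings are copied onto the vision and text sub-configs before the sub-models are loaded. A sketch of that propagation pattern with dummy config objects (the real objects are `transformers` config classes):

```python
# Dummy configs; only the attribute propagation pattern is illustrated.
from types import SimpleNamespace

config = SimpleNamespace(
    quantize=None,
    speculator=None,
    vision_config=SimpleNamespace(),
    text_config=SimpleNamespace(),
)

config.vision_config.quantize = config.quantize
config.vision_config.speculator = config.speculator
config.text_config.quantize = config.quantize
config.text_config.speculator = config.speculator
```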
Some files were not shown because too many files have changed in this diff.