diff --git a/.gitignore b/.gitignore index 2ac2f6b4..e9ad1808 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp data/ +load_tests/*.json diff --git a/Cargo.lock b/Cargo.lock index 671d615c..2e75fe8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" dependencies = [ "windows-sys 0.52.0", ] @@ -3552,7 +3552,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "2.0.1" +version = "2.0.4" dependencies = [ "average", "clap", @@ -3573,7 +3573,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "2.0.1" +version = "2.0.4" dependencies = [ "futures", "grpc-metadata", @@ -3590,7 +3590,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "2.0.1" +version = "2.0.4" dependencies = [ "clap", "ctrlc", @@ -3601,6 +3601,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "thiserror", "tracing", "tracing-subscriber", "vergen", @@ -3608,7 +3609,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "2.0.1" +version = "2.0.4" dependencies = [ "async-stream", "axum", diff --git a/Cargo.toml b/Cargo.toml index 34e55652..aafc8435 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ resolver = "2" [workspace.package] -version = "2.0.2" +version = "2.0.4" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/Dockerfile b/Dockerfile index 94d10bc1..73a274dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src FROM chef as planner @@ -50,6 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins ca-certificates \ make \ curl \ + git \ && rm -rf /var/lib/apt/lists/* # Install server diff --git a/Dockerfile_amd b/Dockerfile_amd index fb820116..92dd0ea8 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse @@ -36,7 +36,7 @@ COPY launcher launcher RUN cargo build --release # Text Generation Inference base image for RoCm -FROM rocm/dev-ubuntu-22.04:5.7 as base +FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update as base RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ @@ -50,13 +50,24 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins # Needed to build VLLM & flash. 
rocthrust-dev \ hipsparse-dev \ - hipblas-dev && \ + hipblas-dev \ + hipblaslt-dev \ + rocblas-dev \ + hiprand-dev \ + rocrand-dev \ + miopen-hip-dev \ + hipfft-dev \ + hipcub-dev \ + hipsolver-dev \ + rccl-dev \ + cmake \ + python3-dev && \ rm -rf /var/lib/apt/lists/* # Keep in sync with `server/pyproject.toml ARG MAMBA_VERSION=23.1.0-1 -ARG PYTORCH_VERSION='2.2.0.dev0' -ARG ROCM_VERSION='5.7' +ARG PYTORCH_VERSION='2.3.0' +ARG ROCM_VERSION='6.0.2' ARG PYTHON_VERSION='3.10.10' # Automatically set by buildx ARG TARGETPLATFORM @@ -75,12 +86,44 @@ RUN chmod +x ~/mambaforge.sh && \ mamba init && \ rm ~/mambaforge.sh -# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6. -RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/ +# Install flash-attention, torch dependencies +RUN pip install numpy einops ninja --no-cache-dir + +RUN conda install intel::mkl-static intel::mkl-include +RUN pip uninstall -y triton && \ + git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \ + cd triton/python && \ + pip install . + +RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir + +ARG _GLIBCXX_USE_CXX11_ABI="1" +ARG CMAKE_PREFIX_PATH="/opt/conda" +ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942" +ARG BUILD_CAFFE2="0" \ + BUILD_CAFFE2_OPS="0" \ + USE_CUDA="0" \ + USE_ROCM="1" \ + BUILD_TEST="0" \ + USE_FBGEMM="0" \ + USE_NNPACK="0" \ + USE_QNNPACK="0" \ + USE_XNNPACK="0" \ + USE_FLASH_ATTENTION="1" \ + USE_MEM_EFF_ATTENTION="0" + +RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install + +# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm +ENV HIP_FORCE_DEV_KERNARG=1 + +# On MI250 and MI300, performance for flash with Triton FA is slightly better than with CK. +# However, Triton requires tuning for each prompt length, which is prohibitive. +ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0 FROM base AS kernel-builder -# Build vllm kernels +# # Build vllm kernels FROM kernel-builder AS vllm-builder WORKDIR /usr/src @@ -102,21 +145,21 @@ RUN make build-flash-attention-v2-rocm FROM kernel-builder as custom-kernels-builder WORKDIR /usr/src COPY server/custom_kernels/ . -RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build +RUN python setup.py build # Build exllama kernels FROM kernel-builder as exllama-kernels-builder WORKDIR /usr/src COPY server/exllama_kernels/ . -RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build +RUN python setup.py build # Build exllama v2 kernels FROM kernel-builder as exllamav2-kernels-builder WORKDIR /usr/src COPY server/exllamav2_kernels/ .
-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build +RUN python setup.py build FROM base as base-copy @@ -140,9 +183,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 # Copy build artifacts from exllamav2 kernels builder COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages -# Install flash-attention dependencies -RUN pip install einops --no-cache-dir - # Install server COPY proto proto COPY server server @@ -160,7 +200,8 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher # AWS Sagemaker compatible image -FROM base-copy as sagemaker +FROM base as sagemaker + COPY sagemaker-entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh @@ -169,5 +210,8 @@ ENTRYPOINT ["./entrypoint.sh"] # Final image FROM base-copy -ENTRYPOINT ["text-generation-launcher"] +COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh +RUN chmod +x /tgi-entrypoint.sh + +ENTRYPOINT ["/tgi-entrypoint.sh"] CMD ["--json-output"] diff --git a/Dockerfile_intel b/Dockerfile_intel index d0791cac..809992e1 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,4 +1,4 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse @@ -36,18 +36,19 @@ RUN cargo build --release # Text Generation Inference base image for Intel -FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base +FROM intel/intel-extension-for-pytorch:2.1.30-xpu as base USER root # libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \ dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list -RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build +RUN apt-get update && apt install -y intel-basekit xpu-smi # Text Generation Inference base env ENV HUGGINGFACE_HUB_CACHE=/data \ @@ -56,9 +57,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ WORKDIR /usr/src -# Build pytorch and ipex -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch +RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +RUN pip install intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl # Install server COPY proto proto @@ -72,25 +72,11 @@ RUN cd server && \ ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric -ENV 
DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh -ENV CCL_CONFIGURATION=cpu_gpu_dpcpp -ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man -ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest -ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib -ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so -ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64: -ENV MKLROOT=/opt/intel/oneapi/mkl/latest -ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include ENV CCL_ZE_IPC_EXCHANGE=sockets - -RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install -RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install - # Install benchmarker COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router diff --git a/assets/tgi_grafana.json b/assets/tgi_grafana.json new file mode 100644 index 00000000..5f5a74ad --- /dev/null +++ b/assets/tgi_grafana.json @@ -0,0 +1,3999 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS_EKS API INFERENCE PROD", + "label": "Prometheus EKS API Inference Prod", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.2" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 2, + "id": 551, + "links": [], + "liveNow": false, + "panels": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 49, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) > 0", + "hide": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))) * 1000) > 0", + "hide": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "$B + $C", + "hide": false, + "refId": "D", + "type": "math" + } + ], + "title": "Time to first token", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 9, + "y": 0 + }, + "id": 44, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m]))) * 1000)>0", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Decode per-token latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 45, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + 
"calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum((rate(tgi_request_generated_tokens_sum{container=\"$service\"}[10m]) / rate(tgi_request_generated_tokens_count{container=\"$service\"}[10m]))>0)", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Throughput (generated tok/s)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 48, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Number of tokens per prompt", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Number of generated tokens per request", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 20, + "panels": [], + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 4, + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(increase(tgi_request_success{container=\"$service\"}[1m]))", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(increase(tgi_request_failure{container=\"$service\"}[1m])) by (err)", + "hide": false, + "legendFormat": "Error: {{err}}", + "range": true, + "refId": "B" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 9, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) 
(rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Mean Time Per Token quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 9, + "x": 15, + "y": 16 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 13, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Mean Time Per Token", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + 
{ + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 24 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "count(tgi_request_count{container=\"$service\"})", + "legendFormat": "Replicas", + "range": true, + "refId": "A" + } + ], + "title": "Number of replicas", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 24 + }, + "id": 32, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(tgi_queue_size{container=\"$service\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Queue Size", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 26, + "panels": [], + "title": "Batching", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 29, + "maxDataPoints": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "avg(tgi_batch_current_max_tokens{container=\"$service\"})", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Max tokens per batch", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 6, + "y": 30 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Speculated Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + 
"group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 5, + "x": 10, + "y": 30 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Prompt Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 15, + "y": 30 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Latency quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 35 + }, + "id": 27, + "maxDataPoints": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "avg(tgi_batch_current_size{container=\"$service\"})", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Batch Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 39 + }, + "id": 28, + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(increase(tgi_batch_concat{container=\"$service\"}[1m])) by (reason)", + "hide": false, + "legendFormat": "Reason: {{ reason }}", + "range": true, + "refId": "B" + } + ], + "title": "Concatenates", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 6, + "y": 39 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Queue quantiles", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 22, + "panels": [], + "title": "Prefill", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Prefill Quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + 
"min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 49 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 14, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Prefill Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 24, + "panels": [], + "title": "Decode", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + 
"mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Decode quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 61 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 15, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Decode Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + 
"x": 0, + "y": 72 + }, + "id": 43, + "panels": [], + "title": "Debug", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 0, + "y": 73 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Forward quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 6, + "y": 73 + }, + 
"heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 35, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Forward Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 12, + "y": 73 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", 
container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Token Decode quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 18, + "y": 73 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 40, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Token Decode Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 0, + "y": 84 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Filter Batch quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 6, + "y": 84 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 39, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + 
"axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Filter Batch Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 12, + "y": 84 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + 
"legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Batch Concat quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 18, + "y": 84 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 41, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Batch Concat latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1", + "value": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "definition": "label_values(tgi_request_count, container)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "service", + "options": [], + "query": { + "query": "label_values(tgi_request_count, container)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now-30s" + }, + "timepicker": { + "nowDelay": "30s" + }, + "timezone": "", + "title": "Text Generation Inference", + "uid": "RHSk7EL4kdqsd", + "version": 12, + "weekStart": "" +} diff --git a/benchmark/src/event.rs b/benchmark/src/event.rs index 91ce8400..07482aed 100644 --- a/benchmark/src/event.rs +++ b/benchmark/src/event.rs @@ -11,7 +11,7 @@ pub(crate) enum Event { /// Key press. Key(event::KeyEvent), /// Terminal resize. 
- Resize(u16, u16), + Resize, } pub(crate) async fn terminal_event_task( @@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender) { if event::poll(Duration::from_secs(0)).expect("no events available") { match event::read().expect("unable to read event") { event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()), - event::Event::Resize(w, h) => { - event_sender.send(Event::Resize(w, h)).await.unwrap_or(()) + event::Event::Resize(_w, _h) => { + event_sender.send(Event::Resize).await.unwrap_or(()) } _ => (), } diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py index 5ab10fdb..a8e67071 100644 --- a/clients/python/text_generation/__init__.py +++ b/clients/python/text_generation/__init__.py @@ -14,5 +14,10 @@ __version__ = "0.6.0" +DEPRECATION_WARNING = ( + "`text_generation` clients are deprecated and will be removed in the near future. " + "Please use the `InferenceClient` from the `huggingface_hub` package instead." +) + from text_generation.client import Client, AsyncClient from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py index 0e86901d..12966747 100644 --- a/clients/python/text_generation/client.py +++ b/clients/python/text_generation/client.py @@ -1,16 +1,21 @@ import json import requests +import warnings from aiohttp import ClientSession, ClientTimeout from pydantic import ValidationError from typing import Dict, Optional, List, AsyncIterator, Iterator, Union +from text_generation import DEPRECATION_WARNING from text_generation.types import ( StreamResponse, Response, Request, Parameters, Grammar, + CompletionRequest, + Completion, + CompletionComplete, ChatRequest, ChatCompletionChunk, ChatComplete, @@ -19,6 +24,9 @@ from text_generation.types import ( ) from text_generation.errors import parse_error +# emit deprecation warnings +warnings.simplefilter("always", DeprecationWarning) + class Client: """Client to make calls to a text-generation-inference instance @@ -59,11 +67,100 @@ class Client: timeout (`int`): Timeout in seconds """ + warnings.warn(DEPRECATION_WARNING, DeprecationWarning) self.base_url = base_url self.headers = headers self.cookies = cookies self.timeout = timeout + def completion( + self, + prompt: str, + frequency_penalty: Optional[float] = None, + max_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + stream: bool = False, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + stop: Optional[List[str]] = None, + ): + """ + Given a prompt, generate a response synchronously + + Args: + prompt (`str`): + Prompt + frequency_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + max_tokens (`int`): + Maximum number of generated tokens + repetition_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + seed (`int`): + Random sampling seed + stream (`bool`): + Stream the response + temperature (`float`): + The value used to module the logits distribution. 
+ top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated + """ + request = CompletionRequest( + model="tgi", + prompt=prompt, + frequency_penalty=frequency_penalty, + max_tokens=max_tokens, + repetition_penalty=repetition_penalty, + seed=seed, + stream=stream, + temperature=temperature, + top_p=top_p, + stop=stop, + ) + if not stream: + resp = requests.post( + f"{self.base_url}/v1/completions", + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + ) + payload = resp.json() + if resp.status_code != 200: + raise parse_error(resp.status_code, payload) + return Completion(**payload) + else: + return self._completion_stream_response(request) + + def _completion_stream_response(self, request): + resp = requests.post( + f"{self.base_url}/v1/completions", + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + stream=True, + ) + # iterate and print stream + for byte_payload in resp.iter_lines(): + if byte_payload == b"\n": + continue + payload = byte_payload.decode("utf-8") + if payload.startswith("data:"): + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + try: + response = CompletionComplete(**json_payload) + yield response + except ValidationError: + raise parse_error(resp.status, json_payload) + def chat( self, messages: List[Message], @@ -82,6 +179,7 @@ class Client: tools: Optional[List[Tool]] = None, tool_prompt: Optional[str] = None, tool_choice: Optional[str] = None, + stop: Optional[List[str]] = None, ): """ Given a list of messages, generate a response asynchronously @@ -124,6 +222,8 @@ class Client: A prompt to be appended before the tools tool_choice (`str`): The tool to use + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated """ request = ChatRequest( @@ -144,6 +244,7 @@ class Client: tools=tools, tool_prompt=tool_prompt, tool_choice=tool_choice, + stop=stop, ) if not stream: resp = requests.post( @@ -449,11 +550,99 @@ class AsyncClient: timeout (`int`): Timeout in seconds """ + warnings.warn(DEPRECATION_WARNING, DeprecationWarning) self.base_url = base_url self.headers = headers self.cookies = cookies self.timeout = ClientTimeout(timeout) + async def completion( + self, + prompt: str, + frequency_penalty: Optional[float] = None, + max_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + stream: bool = False, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + stop: Optional[List[str]] = None, + ) -> Union[Completion, AsyncIterator[CompletionComplete]]: + """ + Given a prompt, generate a response asynchronously + + Args: + prompt (`str`): + Prompt + frequency_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + max_tokens (`int`): + Maximum number of generated tokens + repetition_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + seed (`int`): + Random sampling seed + stream (`bool`): + Stream the response + temperature (`float`): + The value used to module the logits distribution. 
+ top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated + """ + request = CompletionRequest( + model="tgi", + prompt=prompt, + frequency_penalty=frequency_penalty, + max_tokens=max_tokens, + repetition_penalty=repetition_penalty, + seed=seed, + stream=stream, + temperature=temperature, + top_p=top_p, + stop=stop, + ) + if not stream: + return await self._completion_single_response(request) + else: + return self._completion_stream_response(request) + + async def _completion_single_response(self, request): + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post( + f"{self.base_url}/v1/completions", json=request.dict() + ) as resp: + payload = await resp.json() + if resp.status != 200: + raise parse_error(resp.status, payload) + return Completion(**payload) + + async def _completion_stream_response(self, request): + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post( + f"{self.base_url}/v1/completions", json=request.dict() + ) as resp: + async for byte_payload in resp.content: + if byte_payload == b"\n": + continue + payload = byte_payload.decode("utf-8") + if payload.startswith("data:"): + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + try: + response = CompletionComplete(**json_payload) + yield response + except ValidationError: + raise parse_error(resp.status, json_payload) + async def chat( self, messages: List[Message], @@ -472,6 +661,7 @@ class AsyncClient: tools: Optional[List[Tool]] = None, tool_prompt: Optional[str] = None, tool_choice: Optional[str] = None, + stop: Optional[List[str]] = None, ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]: """ Given a list of messages, generate a response asynchronously @@ -514,6 +704,8 @@ class AsyncClient: A prompt to be appended before the tools tool_choice (`str`): The tool to use + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated """ request = ChatRequest( @@ -534,6 +726,7 @@ class AsyncClient: tools=tools, tool_prompt=tool_prompt, tool_choice=tool_choice, + stop=stop, ) if not stream: return await self._chat_single_response(request) diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index 5e32bc6f..eb872ee6 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -46,30 +46,6 @@ class Tool(BaseModel): function: dict -class ChatCompletionComplete(BaseModel): - # Index of the chat completion - index: int - # Message associated with the chat completion - message: Message - # Log probabilities for the chat completion - logprobs: Optional[Any] - # Reason for completion - finish_reason: str - # Usage details of the chat completion - usage: Optional[Any] = None - - -class CompletionComplete(BaseModel): - # Index of the chat completion - index: int - # Message associated with the chat completion - text: str - # Log probabilities for the chat completion - logprobs: Optional[Any] - # Reason for completion - finish_reason: str - - class Function(BaseModel): name: Optional[str] arguments: str @@ -95,24 +71,41 @@ class Choice(BaseModel): finish_reason: Optional[str] = None -class ChatCompletionChunk(BaseModel): - id: str - object: str - created: int +class 
CompletionRequest(BaseModel): + # Model identifier model: str - system_fingerprint: str - choices: List[Choice] + # Prompt + prompt: str + # The parameter for repetition penalty. 1.0 means no penalty. + # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + repetition_penalty: Optional[float] = None + # The parameter for frequency penalty. 1.0 means no penalty + # Penalize new tokens based on their existing frequency in the text so far, + # decreasing the model's likelihood to repeat the same line verbatim. + frequency_penalty: Optional[float] = None + # Maximum number of tokens to generate + max_tokens: Optional[int] = None + # Flag to indicate streaming response + stream: bool = False + # Random sampling seed + seed: Optional[int] = None + # Sampling temperature + temperature: Optional[float] = None + # Top-p value for nucleus sampling + top_p: Optional[float] = None + # Stop generating tokens if a member of `stop` is generated + stop: Optional[List[str]] = None -class ChatComplete(BaseModel): - # Chat completion details - id: str - object: str - created: int - model: str - system_fingerprint: str - choices: List[ChatCompletionComplete] - usage: Any +class CompletionComplete(BaseModel): + # Index of the chat completion + index: int + # Message associated with the chat completion + text: str + # Log probabilities for the chat completion + logprobs: Optional[Any] + # Reason for completion + finish_reason: str class Completion(BaseModel): @@ -163,6 +156,41 @@ class ChatRequest(BaseModel): tool_prompt: Optional[str] = None # Choice of tool to be used tool_choice: Optional[str] = None + # Stop generating tokens if a member of `stop` is generated + stop: Optional[List[str]] = None + + +class ChatCompletionComplete(BaseModel): + # Index of the chat completion + index: int + # Message associated with the chat completion + message: Message + # Log probabilities for the chat completion + logprobs: Optional[Any] + # Reason for completion + finish_reason: str + # Usage details of the chat completion + usage: Optional[Any] = None + + +class ChatComplete(BaseModel): + # Chat completion details + id: str + object: str + created: int + model: str + system_fingerprint: str + choices: List[ChatCompletionComplete] + usage: Any + + +class ChatCompletionChunk(BaseModel): + id: str + object: str + created: int + model: str + system_fingerprint: str + choices: List[Choice] class Parameters(BaseModel): diff --git a/docs/openapi.json b/docs/openapi.json index 2a387c2f..79c3b80f 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1121,6 +1121,15 @@ "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. 
So 0.1 means only the tokens comprising the top 10% probability mass are considered.", "example": 0.95, "nullable": true + }, + "stop": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Up to 4 sequences where the API will stop generating further tokens.", + "example": "null", + "nullable": true } } }, diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index c815b535..a7351a33 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -3,8 +3,16 @@ title: Text Generation Inference - local: quicktour title: Quick Tour + - local: installation_nvidia + title: Using TGI with Nvidia GPUs + - local: installation_amd + title: Using TGI with AMD GPUs + - local: installation_gaudi + title: Using TGI with Intel Gaudi + - local: installation_inferentia + title: Using TGI with AWS Inferentia - local: installation - title: Installation + title: Installation from source - local: supported_models title: Supported Models and Hardware - local: messages_api @@ -20,7 +28,7 @@ - local: basic_tutorials/using_cli title: Using TGI CLI - local: basic_tutorials/launcher - title: All TGI CLI options + title: All TGI CLI options - local: basic_tutorials/non_core_models title: Non-core Model Serving - local: basic_tutorials/safety @@ -29,6 +37,10 @@ title: Using Guidance, JSON, tools - local: basic_tutorials/visual_language_models title: Visual Language Models + - local: basic_tutorials/monitoring + title: Monitoring TGI with Prometheus and Grafana + - local: basic_tutorials/train_medusa + title: Train Medusa title: Tutorials - sections: - local: conceptual/streaming diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 060d177d..970afa0e 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HUGGING_FACE_HUB_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \ --model-id $model ``` diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md new file mode 100644 index 00000000..509b0aff --- /dev/null +++ b/docs/source/basic_tutorials/monitoring.md @@ -0,0 +1,75 @@ +# Monitoring TGI server with Prometheus and Grafana dashboard + +TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Example of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc. + +In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage. + +![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png) + +## Setup on the server machine + +First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by Prometheus monitoring server. + +In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`. + +On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). 
For example, at the time of writing on a Linux machine: + +``` +wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz +tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz +cd prometheus +``` + +Prometheus needs to be configured to listen on TGI's port. To do so, in the Prometheus configuration file `prometheus.yml`, one needs to edit the lines: +``` + static_configs: + - targets: ["0.0.0.0:80"] +``` +to use the correct IP address and port. + +We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the correct IP and port are configured. + +Once Prometheus is configured, the Prometheus server can be launched on the same machine where TGI is launched: +``` +./prometheus --config.file="prometheus.yml" +``` + +In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (by default 9090) to the local computer. To do so, we can, for example: +* Use SSH [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example) +* Use Ngrok port tunneling + +For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world. + +For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use: +```bash +ngrok http http://0.0.0.0:9090 +``` + +As a sanity check, one can make sure that the Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine. + +## Setup on the monitoring machine + +Monitoring is typically done on a machine other than the server itself. We use a Grafana dashboard to monitor the TGI server's usage. + +Two options are available: +* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/). +* Self-host a Grafana dashboard. + +In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example: + +```bash +wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz +tar -zxvf grafana-11.0.0.linux-amd64.tar.gz +cd grafana-11.0.0 +./bin/grafana-server +``` + +Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password. + +Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, which exposes the Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app). + +Once the Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a ready-to-use dashboard, but you may configure your own dashboard as you like.
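As an optional sanity check that the data source is wired up correctly, Prometheus can also be queried directly over its HTTP API and asked for a TGI metric such as `tgi_request_count`. The sketch below is only an illustration: the base URL is the placeholder Ngrok address used in this guide and should be replaced with your own tunnel (or `http://localhost:9090` on the server itself).

```python
import requests

# Minimal sanity check: ask Prometheus for a TGI metric before building panels.
# The base URL is the placeholder Ngrok address from this guide; replace it
# with your own tunnel, or use http://localhost:9090 on the server itself.
PROMETHEUS_URL = "https://d661-4-223-164-145.ngrok-free.app"

resp = requests.get(
    f"{PROMETHEUS_URL}/api/v1/query",
    params={"query": "sum(tgi_request_count)"},
    timeout=10,
)
resp.raise_for_status()
result = resp.json()["data"]["result"]
print(result if result else "No TGI metrics yet; check the scrape target in prometheus.yml")
```

If the query comes back empty, the `targets` entry in `prometheus.yml` most likely does not point at the TGI port.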
+ +Community-contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/). + +Load your dashboard configuration, and your TGI dashboard should be ready to go! diff --git a/docs/source/basic_tutorials/train_medusa.md b/docs/source/basic_tutorials/train_medusa.md new file mode 100644 index 00000000..76cb6bed --- /dev/null +++ b/docs/source/basic_tutorials/train_medusa.md @@ -0,0 +1,208 @@ +# Train Medusa + +This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation.md) for more information on how Medusa works and speculation in general. + +## What are the benefits of training a Medusa model? + +Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training. + +One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain. + +If you train Medusa on a dataset that is very different from the one you will use in production, then the model will not be able to predict the future tokens accurately, and consequently the speedup will be minimal or non-existent. + +## Self-distillation (Generating data for training) + +There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model. + +Essentially, you prompt the model with a similar input to what you will use in production, and the model will generate the output. + +We'll use this output to help train the Medusa heads to predict the `n+1`, `n+2`, `n+3`, etc. tokens in the sequence. + +## Training + +The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described in the original repository. + +### Getting Started + +There are two methods for training the model: + +- `torchrun`, which is a wrapper around `torch.distributed.launch` +- a forked version of `axolotl` that supports Medusa + +In this tutorial we'll use `torchrun` to train the model, as it is the most straightforward approach, but similar steps can be followed using `axolotl` if you prefer. + +### Training with `torchrun` + +```bash +mkdir medusa-training +cd medusa-training + +pyenv install 3.10 +pyenv local 3.10 + +uv venv -p 3.10 +source .venv/bin/activate +``` + +Now let's clone the original `Medusa` repository and install the library. + +```bash +git clone https://github.com/FasterDecoding/Medusa.git +cd Medusa +pip install -e . +``` + +Next we'll need some data to train on; we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub. + +```bash +apt install git-lfs +git lfs install +git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered +``` + +Currently our directory structure looks like this: + +```bash +.
+├── assets +├── CITATION.cff +├── create_data.py +├── data_generation +├── deepspeed.json +├── last_run_prepared +├── LICENSE +├── llm_judge +├── medusa +├── medusa_llm.egg-info +├── mistral.json +├── notebooks +├── pyproject.toml +├── README.md +├── ROADMAP.md +├── scripts +├── ShareGPT_Vicuna_unfiltered +│   ├── README.md +│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json +│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json +├── simple_gradio_interface.py +├── tiny-llama.json +└── vicuna_7b_qlora_stage1 +``` + +## Start Training + +Now let's generate the data and start training the model. This process will take a while since we are generating data from the model. + +First, make sure you have an instance of TGI running with the model you want to use for self-distillation. + +```bash +model=HuggingFaceH4/zephyr-7b-beta +volume=/home/ubuntu/.cache/huggingface/hub/ + +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model +``` + +Now we can generate the data using the `create_data.py` script. + +```bash +python create_data.py \ + --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \ + --output-filename zephyr_self_distill.json +``` + +At this point our terminal should look like this: + +
+ +
+ +> Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process; you should have a much larger dataset for training. + +Now we can finally get to the fun part and start training the model! + +Using `torchrun`, we can easily launch the `medusa` training script with the `zephyr_self_distill.json` data file. + +> NOTE: If you just self-distilled, the model may still be running; make sure to stop it before starting the training so that all of the resources can be used for training. + +```bash +WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \ + --model_name_or_path HuggingFaceH4/zephyr-7b-beta \ + --data_path zephyr_self_distill.json \ + --bf16 True \ + --output_dir zephyr_out \ + --num_train_epochs 5 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --learning_rate 1e-3 \ + --weight_decay 0.0 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --lazy_preprocess True \ + --medusa_num_heads 3 \ + --medusa_num_layers 1 \ + --deepspeed deepspeed.json +``` + +
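Before launching the run above, it can also be worth sanity-checking the self-distilled dataset. The exact schema written by `create_data.py` is not spelled out here, so the sketch below only inspects the file rather than assuming any field names.

```python
import json

# Hedged sanity check of the self-distilled data, assuming only that the file
# is a JSON list of dict-like examples; we report its size and top-level keys
# instead of relying on a particular schema.
with open("zephyr_self_distill.json") as f:
    data = json.load(f)

print(f"{len(data)} training examples")
print("Keys of the first example:", list(data[0].keys()))
```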
+ +
+ +If successful, you should see the similar output to the one below: + +```bash +wandb: Run history: +wandb: train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███ +wandb: train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███ +wandb: train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁ +wandb: train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁ +wandb: train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁ +wandb: train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇ +wandb: train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁ +wandb: train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇ +wandb: train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁ +wandb: train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇ +wandb: train/total_flos ▁ +wandb: train/train_loss ▁ +wandb: train/train_runtime ▁ +wandb: train/train_samples_per_second ▁ +wandb: train/train_steps_per_second ▁ +wandb: +wandb: Run summary: +wandb: train/epoch 2.0 +wandb: train/global_step 16 +wandb: train/learning_rate 0.0 +wandb: train/loss 14.8906 +wandb: train/medusa0_loss 4.25 +wandb: train/medusa0_top1 0.28809 +wandb: train/medusa1_loss 4.8125 +wandb: train/medusa1_top1 0.22727 +wandb: train/medusa2_loss 5.5 +wandb: train/medusa2_top1 0.17293 +wandb: train/total_flos 0.0 +wandb: train/train_loss 23.98242 +wandb: train/train_runtime 396.9266 +wandb: train/train_samples_per_second 2.519 +wandb: train/train_steps_per_second 0.04 +``` + +Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects. + +```bash +python -m medusa.hf_utils \ + --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \ + --repo drbh/zephyr_medusa_demo +``` + +Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉 diff --git a/docs/source/basic_tutorials/using_guidance.md b/docs/source/basic_tutorials/using_guidance.md index 606f2453..d0008fdb 100644 --- a/docs/source/basic_tutorials/using_guidance.md +++ b/docs/source/basic_tutorials/using_guidance.md @@ -2,7 +2,7 @@ Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs. -These feature are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them! +These feature are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them! _note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._ @@ -74,6 +74,45 @@ curl localhost:3000/generate \ ``` +### Hugging Face Hub Python Library + +The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter. 
+ +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://localhost:3000") + +schema = { + "properties": { + "location": {"title": "Location", "type": "string"}, + "activity": {"title": "Activity", "type": "string"}, + "animals_seen": { + "maximum": 5, + "minimum": 1, + "title": "Animals Seen", + "type": "integer", + }, + "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"}, + }, + "required": ["location", "activity", "animals_seen", "animals"], + "title": "Animals", + "type": "object", +} + +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": schema}, +) + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + +``` + A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar. > Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is a computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster. @@ -83,134 +122,55 @@ A grammar can be defined using Pydantic models, JSON schemas, or regular express Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way. ```python -import requests +from huggingface_hub import InferenceClient from pydantic import BaseModel, conint from typing import List + class Animals(BaseModel): location: str activity: str animals_seen: conint(ge=1, le=5) # Constrained integer type animals: List[str] -prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park" -data = { - "inputs": prompt, - "parameters": { - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": Animals.schema() - } - } -} +client = InferenceClient("http://localhost:3000") -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": Animals.schema()}, ) -print(response.json()) -# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'} + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + ``` -### JSON Schema Integration - -If Pydantic's not your style, go raw with direct JSON Schema integration. This is similar to the first example but with programmatic control. 
+defining a grammar as regular expressions ```python -import requests +from huggingface_hub import InferenceClient -json_schema = { - "properties": { - "location": { - "type": "string" - }, - "activity": { - "type": "string" - }, - "animals_seen": { - "type": "integer", - "minimum": 1, - "maximum": 5 - }, - "animals": { - "type": "array", - "items": { - "type": "string" - } - } +client = InferenceClient("http://localhost:3000") + +regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)" + +resp = client.text_generation( + f"Whats Googles DNS? Please use the following regex: {regexp}", + seed=42, + grammar={ + "type": "regex", + "value": regexp, }, - "required": ["location", "activity", "animals_seen", "animals"] -} - -data = { - "inputs": "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park", - "parameters": { - "max_new_tokens": 200, - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": json_schema - } - } -} - -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data ) -print(response.json()) -# {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n , "animals_seen": 3,\n "location":"park"}'} -``` -### Using the client - -TGI provides a client library to that make it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter. - -```python -from text_generation import AsyncClient -from text_generation.types import GrammarType - -# NOTE: tools defined above and removed for brevity - -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.generate( - "Whats Googles DNS", - max_new_tokens=10, - decoder_input_details=True, - seed=1, - grammar={ - "type": GrammarType.Regex, - "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)", - }, - ) - - # Once the response is received, you can process it - print(response.generated_text) - -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# 118.8.0.84 +print(resp) +# 7.1.1.1 ``` @@ -265,107 +225,87 @@ curl localhost:3000/v1/chat/completions \ // {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}} ``` -### Text Generation Inference Client +### Chat Completion with Tools -TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions. +Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter. 
```python -from text_generation import AsyncClient +from huggingface_hub import InferenceClient -# NOTE: tools defined above and removed for brevity +client = InferenceClient("http://localhost:3000") -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.chat( - max_tokens=100, - seed=1, - tools=tools, - presence_penalty=-1.1, - messages=[ - { - "role": "system", - "content": "You're a helpful assistant! Answer the users question best you can.", +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + }, + "required": ["location", "format"], }, - { - "role": "user", - "content": "What is the weather like in Brooklyn, New York?", + }, + }, + { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast", + }, + }, + "required": ["location", "format", "num_days"], }, - ], - ) + }, + }, +] - # Once the response is received, you can process it - print(response.choices[0].message.tool_calls) +chat = client.chat_completion( + messages=[ + { + "role": "system", + "content": "You're a helpful assistant! Answer the users question best you can.", + }, + { + "role": "user", + "content": "What is the weather like in Brooklyn, New York?", + }, + ], + tools=tools, + seed=42, + max_tokens=100, +) -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}} +print(chat.choices[0].message.tool_calls) +# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')] ``` -
- Tools used in example above - -```python - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - }, - "required": ["location", "format"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_n_day_weather_forecast", - "description": "Get an N-day weather forecast", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - "num_days": { - "type": "integer", - "description": "The number of days to forecast", - }, - }, - "required": ["location", "format", "num_days"], - }, - }, - } - ] -``` - -
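Once a tool call comes back, its arguments still have to be executed by your own code. The sketch below is illustrative only: the two weather functions are hypothetical stand-ins for real implementations, and it assumes the `ChatCompletionOutputToolCall` shape printed in the example above, where `function.name` is a string and `function.arguments` is already a dict.

```python
# Hypothetical local implementations matching the tool schemas declared above.
def get_current_weather(location: str, format: str) -> str:
    return f"Current weather in {location}, reported in {format}."


def get_n_day_weather_forecast(location: str, format: str, num_days: int) -> str:
    return f"{num_days}-day forecast for {location}, reported in {format}."


# Map the tool names advertised to the model onto local callables.
DISPATCH = {
    "get_current_weather": get_current_weather,
    "get_n_day_weather_forecast": get_n_day_weather_forecast,
}


def run_tool_calls(tool_calls):
    """Execute each tool call returned by the model and collect the results."""
    results = []
    for call in tool_calls or []:
        handler = DISPATCH.get(call.function.name)
        if handler is None:
            # The model referenced a tool we did not register; skip it.
            continue
        results.append(handler(**call.function.arguments))
    return results


# Usage with the `chat` response from the example above:
# print(run_tool_calls(chat.choices[0].message.tool_calls))
```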
- ### OpenAI integration TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions. diff --git a/docs/source/basic_tutorials/visual_language_models.md b/docs/source/basic_tutorials/visual_language_models.md index e804ef09..3770db0b 100644 --- a/docs/source/basic_tutorials/visual_language_models.md +++ b/docs/source/basic_tutorials/visual_language_models.md @@ -53,7 +53,67 @@ for token in client.text_generation(prompt, max_new_tokens=10, stream=True): # This is a picture of an anthropomorphic rabbit in a space suit. ``` -If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text. +or via the `chat_completion` endpoint: + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://127.0.0.1:3000") + +chat = client.chat_completion( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + seed=42, + max_tokens=100, +) + +print(chat) +# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043)) + +``` + +or with OpenAi's library: + +```python +from openai import OpenAI + +# init the client but point it to TGI +client = OpenAI(base_url="http://localhost:3000/v1", api_key="-") + +chat_completion = client.chat.completions.create( + model="tgi", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + stream=False, +) + +print(chat_completion) +# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. 
', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
+```

 ### Inference Through Sending `cURL` Requests

diff --git a/docs/source/conceptual/guidance.md b/docs/source/conceptual/guidance.md
index 0ce34f2f..a566c4a6 100644
--- a/docs/source/conceptual/guidance.md
+++ b/docs/source/conceptual/guidance.md
@@ -76,7 +76,7 @@ There are two main ways to use guidance; you can either use the `/generate` endp

 Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.

-Please refer to [using guidance](../basic_tutorial/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
+Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.

 ### Getting the most out of guidance

diff --git a/docs/source/conceptual/speculation.md b/docs/source/conceptual/speculation.md
index 79b1c82e..45618ae3 100644
--- a/docs/source/conceptual/speculation.md
+++ b/docs/source/conceptual/speculation.md
@@ -27,7 +27,7 @@ You can check a few existing fine-tunes for popular models:

 - [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)

-In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa)
+In order to create your own medusa heads for your own finetune, you should check out the [Train Medusa tutorial](../basic_tutorials/train_medusa.md).

 In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.

diff --git a/docs/source/installation.md b/docs/source/installation.md
index 3e62102d..b6c24d55 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,6 +1,10 @@
-# Installation
+# Installation from source

-This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.**
+
+
+Installing TGI from source is not the recommended usage. We strongly recommend using TGI through Docker; check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) guides to learn how to use TGI with Docker.
+
+

 ## Install CLI

diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md
new file mode 100644
index 00000000..636d301c
--- /dev/null
+++ b/docs/source/installation_amd.md
@@ -0,0 +1,38 @@
+# Using TGI with AMD GPUs
+
+TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker.
Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.
+
+On a server powered by AMD GPUs, TGI can be launched with the following command:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    --ipc=host --shm-size 256g --net host -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.0.3-rocm \
+    --model-id $model
+```
+
+The launched TGI server can then be queried from clients; make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
+
+## TunableOp
+
+TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which performs an additional warmup to select the best-performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.
+
+Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.
+
+TunableOp is enabled by default, and the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launching TGI's docker container.
+
+## Flash attention implementation
+
+Two implementations of Flash Attention are available for ROCm: the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention), based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/flash_attn_triton.py).
+
+By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, the FA Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="1"` when launching TGI's docker container.
+
+## Unsupported features
+
+The following features are currently not supported in the ROCm version of TGI, and the support may be extended in the future:
+* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
+* Kernel for sliding window attention (Mistral)
diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md
new file mode 100644
index 00000000..1ddf2b47
--- /dev/null
+++ b/docs/source/installation_gaudi.md
@@ -0,0 +1,3 @@
+# Using TGI with Intel Gaudi
+
+Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).
diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md
new file mode 100644
index 00000000..0394e6de
--- /dev/null
+++ b/docs/source/installation_inferentia.md
@@ -0,0 +1,3 @@
+# Using TGI with Inferentia
+
+Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
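The hardware guides in this section all end the same way: the launched TGI server is queried from a client. As a minimal sketch, the snippet below streams tokens from the OpenHermes deployment started in the AMD example above. The base URL is an assumption; adjust it to wherever your container is actually exposed (for instance `http://localhost:8080` with the `-p 8080:80` mapping used elsewhere in these docs).

```python
from huggingface_hub import InferenceClient

# Assumed address of the TGI container launched above; change it to match your setup.
client = InferenceClient("http://localhost:8080")

# Stream a short completion from the deployed model.
for token in client.text_generation(
    "What is ROCm used for?",
    max_new_tokens=64,
    stream=True,
):
    print(token, end="")
print()
```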
diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md new file mode 100644 index 00000000..62e1a3d6 --- /dev/null +++ b/docs/source/installation_nvidia.md @@ -0,0 +1,18 @@ +# Using TGI with Nvidia GPUs + +TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. + +For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. + +TGI can be used on NVIDIA GPUs through its official docker image: + +```bash +model=teknium/OpenHermes-2.5-Mistral-7B +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:2.0.3 \ + --model-id $model +``` + +The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide. diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 70cf575c..6137c6f6 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -2,30 +2,27 @@ The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/). -Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI. Here is an example on how to do that: +## Launching TGI + +Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example on how to do that: ```bash model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:2.0.3 \ + --model-id $model ``` - +### Supported hardware -To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. +TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on. - - -TGI also supports ROCm-enabled AMD GPUs (only MI210 and MI250 are tested), details are available in the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html). 
To launch TGI on ROCm GPUs, please use instead: - -```bash -docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model -``` +## Consuming TGI Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint. - @@ -91,7 +88,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:1.4 --help +docker run ghcr.io/huggingface/text-generation-inference:2.0.3 --help ``` diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index fa1f9f61..4b6cf731 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -1,29 +1,36 @@ + # Supported Models and Hardware Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported. ## Supported Models -The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them. - -- [BLOOM](https://huggingface.co/bigscience/bloom) -- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl) -- [Galactica](https://huggingface.co/facebook/galactica-120b) -- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b) -- [Llama](https://github.com/facebookresearch/llama) -- [OPT](https://huggingface.co/facebook/opt-66b) -- [SantaCoder](https://huggingface.co/bigcode/santacoder) -- [Starcoder](https://huggingface.co/bigcode/starcoder) -- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b) -- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b) -- [MPT](https://huggingface.co/mosaicml/mpt-30b) -- [Llama V2](https://huggingface.co/meta-llama) -- [Code Llama](https://huggingface.co/codellama) +- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal) +- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) +- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) +- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +- [Gemma](https://huggingface.co/google/gemma-7b) +- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) +- [Dbrx](https://huggingface.co/databricks/dbrx-instruct) +- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) - [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) -- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) -- [Phi](https://huggingface.co/microsoft/phi-2) -- [Idefics](HuggingFaceM4/idefics-9b-instruct) (Multimodal) -- [Llava-next](llava-hf/llava-v1.6-mistral-7b-hf) (Multimodal) +- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) +- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder) +- [Phi](https://huggingface.co/microsoft/phi-1_5) +- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) +- 
[Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
+- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
+- [Qwen 2](https://huggingface.co/Qwen/Qwen1.5-7B-Chat)
+- [Opt](https://huggingface.co/facebook/opt-6.7b)
+- [T5](https://huggingface.co/google/flan-t5-xxl)
+- [Galactica](https://huggingface.co/facebook/galactica-120b)
+- [SantaCoder](https://huggingface.co/bigcode/santacoder)
+- [Bloom](https://huggingface.co/bigscience/bloom-560m)
+- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
+- [Gpt2](https://huggingface.co/openai-community/gpt2)
+- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
+

 If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:

@@ -38,18 +45,4 @@ If you wish to serve a supported model that already exists on a local folder, ju

 ```bash
 text-generation-launcher --model-id 
-``````
-
-
-## Supported Hardware
-
-TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
-
-TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention, GPTQ quantization, flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future:
-* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
-* Flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm)
-* Kernel for sliding window attention (Mistral)
-
-TGI is also supported on the following AI hardware accelerators:
-- *Habana first-gen Gaudi and Gaudi2:* check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-* *AWS Inferentia2:* check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
+``` diff --git a/integration-tests/images/cow_beach.png b/integration-tests/images/cow_beach.png new file mode 100644 index 00000000..d67f8a1b Binary files /dev/null and b/integration-tests/images/cow_beach.png differ diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json new file mode 100644 index 00000000..ca7393a3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json @@ -0,0 +1,99 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1835938, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.171875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6425781, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.7314453, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.68603516, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.005393982, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.31079102, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08300781, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.58984375, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.953125, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0957031, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8095703, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9375, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" +} diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json new file mode 100644 index 00000000..7bd15b90 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json @@ -0,0 +1,398 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1835938, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.171875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6425781, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.7314453, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.68603516, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.005672455, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3251953, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08294678, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5854492, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9423828, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0800781, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8369141, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0683594, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9711914, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json new file mode 100644 index 00000000..037e0b16 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json @@ -0,0 +1,25 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 2, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 54901, + "logprob": -0.72753906, + "special": false, + "text": "beach" + }, + { + "id": 1, + "logprob": -0.011009216, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": "beach" +} diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py new file mode 100644 index 00000000..0c7977d0 --- /dev/null +++ b/integration-tests/models/test_flash_gpt2.py @@ -0,0 +1,44 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_gpt2_handle(launcher): + with launcher("openai-community/gpt2", num_shard=2) as handle: + yield 
handle + + +@pytest.fixture(scope="module") +async def flash_gpt2(flash_gpt2_handle): + await flash_gpt2_handle.health(300) + return flash_gpt2_handle.client + + +@pytest.mark.asyncio +async def test_flash_gpt2(flash_gpt2, response_snapshot): + response = await flash_gpt2.generate( + "What is deep learning?", + max_new_tokens=10, + decoder_input_details=True, + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot): + responses = await generate_load( + flash_gpt2, + "What is deep learning?", + max_new_tokens=10, + n=4, + ) + + generated_texts = [r.generated_text for r in responses] + + assert len(generated_texts) == 4 + assert all( + [text == generated_texts[0] for text in generated_texts] + ), generated_texts + + assert responses == response_snapshot diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py new file mode 100644 index 00000000..d4e83c9f --- /dev/null +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -0,0 +1,39 @@ +import pytest +import requests +import io +import base64 + + +@pytest.fixture(scope="module") +def flash_pali_gemma_handle(launcher): + with launcher( + "google/paligemma-3b-pt-224", + num_shard=1, + revision="float16", + max_input_length=4000, + max_total_tokens=4096, + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_pali_gemma(flash_pali_gemma_handle): + await flash_pali_gemma_handle.health(300) + return flash_pali_gemma_handle.client + + +def get_cow_beach(): + with open("integration-tests/images/cow_beach.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): + cow = get_cow_beach() + inputs = f"![]({cow})Where is the cow standing?\n" + response = await flash_pali_gemma.generate(inputs, max_new_tokens=20) + + assert response.generated_text == "beach" + assert response == response_snapshot diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index 6b6fd58e..eb219423 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] } once_cell = "1.19.0" serde = { version = "1.0.188", features = ["derive"] } serde_json = "1.0.107" +thiserror = "1.0.59" tracing = "0.1.37" tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] } diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 01e34a6f..88b0db57 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -18,14 +18,33 @@ use std::thread; use std::thread::sleep; use std::time::{Duration, Instant}; use std::{fs, io}; -use tracing_subscriber::EnvFilter; +use thiserror::Error; +use tracing_subscriber::{filter::LevelFilter, EnvFilter}; mod env_runtime; +#[derive(Deserialize)] +struct RawConfig { + max_position_embeddings: Option, + n_positions: Option, + max_seq_len: Option, +} + #[derive(Deserialize)] struct Config { max_position_embeddings: Option, - max_seq_len: Option, +} + +impl From for Config { + fn from(other: RawConfig) -> Self { + let max_position_embeddings = other + .max_position_embeddings + .or(other.max_seq_len) + .or(other.n_positions); + Config { + max_position_embeddings, + } + } } #[derive(Clone, Copy, Debug, ValueEnum)] @@ -453,6 +472,7 @@ fn 
shard_manager( max_total_tokens: usize, max_batch_size: Option, otlp_endpoint: Option, + log_level: LevelFilter, status_sender: mpsc::Sender, shutdown: Arc, _shutdown_sender: mpsc::Sender<()>, @@ -475,7 +495,7 @@ fn shard_manager( "--uds-path".to_string(), uds_path, "--logger-level".to_string(), - "INFO".to_string(), + log_level.to_string().to_uppercase(), "--json-output".to_string(), ]; @@ -755,13 +775,13 @@ struct PythonLogMessage { impl PythonLogMessage { fn trace(&self) { match self.record.level.name { - PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text), - PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text), - PythonLogLevelEnum::Info => tracing::info!("{}", self.text), - PythonLogLevelEnum::Success => tracing::info!("{}", self.text), - PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text), - PythonLogLevelEnum::Error => tracing::error!("{}", self.text), - PythonLogLevelEnum::Critical => tracing::error!("{}", self.text), + PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()), + PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()), + PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()), + PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()), + PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()), + PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()), + PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()), } } } @@ -823,26 +843,26 @@ fn find_num_shards( Ok(num_shard) } -#[derive(Debug)] +#[derive(Debug, Error)] enum LauncherError { + #[error("Invalid argument: {0}")] ArgumentValidation(String), + #[error("not enough cuda devices: {0}")] NotEnoughCUDADevices(String), + #[error("Download error")] DownloadError, + #[error("Shard cannot start")] ShardCannotStart, + #[error("Shard disconnected")] ShardDisconnected, + #[error("Shard failed")] ShardFailed, + #[error("Webserver failed")] WebserverFailed, + #[error("Webserver cannot start")] WebserverCannotStart, } -impl core::fmt::Display for LauncherError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "{self:?}") - } -} - -impl std::error::Error for LauncherError {} - fn download_convert_model(args: &Args, running: Arc) -> Result<(), LauncherError> { // Enter download tracing span let _span = tracing::span!(tracing::Level::INFO, "download").entered(); @@ -981,6 +1001,7 @@ fn spawn_shards( args: &Args, cuda_graphs: Vec, max_total_tokens: usize, + max_log_level: LevelFilter, shutdown: Arc, shutdown_receiver: &mpsc::Receiver<()>, shutdown_sender: mpsc::Sender<()>, @@ -1038,6 +1059,7 @@ fn spawn_shards( max_total_tokens, max_batch_size, otlp_endpoint, + max_log_level, status_sender, shutdown, shutdown_sender, @@ -1278,8 +1300,22 @@ fn main() -> Result<(), LauncherError> { let args: Args = Args::parse(); // Filter events with LOG_LEVEL - let env_filter = - EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info")); + let varname = "LOG_LEVEL"; + let env_filter = if let Ok(log_level) = std::env::var(varname) { + // Override to avoid simple logs to be spammed with tokio level informations + let log_level = match &log_level[..] 
{ + "warn" => "text_generation_launcher=warn,text_generation_router=warn", + "info" => "text_generation_launcher=info,text_generation_router=info", + "debug" => "text_generation_launcher=debug,text_generation_router=debug", + log_level => log_level, + }; + EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .parse_lossy(log_level) + } else { + EnvFilter::new("info") + }; + let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO); if args.json_output { tracing_subscriber::fmt() @@ -1322,33 +1358,30 @@ fn main() -> Result<(), LauncherError> { }; let content = std::fs::read_to_string(filename)?; - let config: Config = serde_json::from_str(&content)?; + let config: RawConfig = serde_json::from_str(&content)?; + let config: Config = config.into(); // Quantization usually means you're even more RAM constrained. let max_default = 4096; - let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) { - (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => { - if max_position_embeddings > max_default { - let max = max_position_embeddings; - if args.max_input_tokens.is_none() - && args.max_total_tokens.is_none() - && args.max_batch_prefill_tokens.is_none() - { - tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1); - } - max_default - } else { - max_position_embeddings + if let Some(max_position_embeddings) = config.max_position_embeddings { + if max_position_embeddings > max_default { + let max = max_position_embeddings; + if args.max_input_tokens.is_none() + && args.max_total_tokens.is_none() + && args.max_batch_prefill_tokens.is_none() + { + tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. 
You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1); } + Ok(max_default) + } else { + Ok(max_position_embeddings) } - _ => { - return Err(Box::new(LauncherError::ArgumentValidation( - "no max defined".to_string(), - ))); - } - }; - Ok(max_position_embeddings) + } else { + Err(Box::new(LauncherError::ArgumentValidation( + "no max defined".to_string(), + ))) + } }; let max_position_embeddings: usize = get_max_position_embeddings().unwrap_or(4096); @@ -1504,6 +1537,7 @@ fn main() -> Result<(), LauncherError> { &args, cuda_graphs, max_total_tokens, + max_log_level, shutdown.clone(), &shutdown_receiver, shutdown_sender, diff --git a/load_tests/Makefile b/load_tests/Makefile new file mode 100644 index 00000000..9199aa3b --- /dev/null +++ b/load_tests/Makefile @@ -0,0 +1,9 @@ + +ShareGPT_V3_unfiltered_cleaned_split.json: + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json + python filter.py + +prepare_orca: + python orca.py diff --git a/load_tests/filter.py b/load_tests/filter.py new file mode 100644 index 00000000..a00226ed --- /dev/null +++ b/load_tests/filter.py @@ -0,0 +1,26 @@ +import json + + +def main(): + with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f: + data = json.load(f) + + # Select only the first 2k conversations that start with a human. + max = 2000 + conversations = [] + for conversation in data: + conv = conversation.get("conversations") + if conv and conv[0]["from"] == "human": + # Trim the rest of the output + conversation["conversations"] = conversation["conversations"][:1] + conversations.append(conversation) + + if len(conversation) >= max: + break + + with open("./small.json", "w") as f: + data = json.dump(conversations, f, indent=4) + + +if __name__ == "__main__": + main() diff --git a/load_tests/orca.py b/load_tests/orca.py new file mode 100644 index 00000000..e607d27c --- /dev/null +++ b/load_tests/orca.py @@ -0,0 +1,27 @@ +import json +import datasets +import tqdm + + +def main(): + dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train") + # Select only the first 2k conversations that start with a human. + max = min(2000, len(dataset)) + conversations = [] + for item in tqdm.tqdm(dataset, total=max): + conversation = { + "conversations": [ + {"from": "human", "value": item["question"]}, + ], + "id": item["id"], + } + conversations.append(conversation) + if len(conversations) >= max: + break + + with open("./small.json", "w") as f: + data = json.dump(conversations, f, indent=4) + + +if __name__ == "__main__": + main() diff --git a/load_tests/starcoder_load.js b/load_tests/starcoder_load.js deleted file mode 100644 index 2f6cb3d6..00000000 --- a/load_tests/starcoder_load.js +++ /dev/null @@ -1,63 +0,0 @@ -import {check} from 'k6'; -import http from 'k6/http'; -import {Trend} from 'k6/metrics'; - -const host = __ENV.HOST || '127.0.0.1:3000'; - -const totalTime = new Trend('total_time', true); -const validationTime = new Trend('validation_time', true); -const queueTime = new Trend('queue_time', true); -const inferenceTime = new Trend('inference_time', true); -const timePerToken = new Trend('time_per_token', true); - -const example = { - payload: JSON.stringify({ - inputs: '# This is a fibonacci function written in the Python programming language.' 
+ - 'def fibonacci', - parameters: { - details: true, - max_new_tokens: 60, - temperature: 0.2, - top_p: 0.95, - seed: 0, - }, - }), - generated_tokens: 60 -}; - -export const options = { - thresholds: { - http_req_failed: ['rate==0'], - time_per_token: ['p(95)<90'], - queue_time: ['p(95)<1500'], - }, - scenarios: { - load_test: { - executor: 'constant-arrival-rate', - duration: '60s', - preAllocatedVUs: 100, - rate: 10, - timeUnit: '1s', - }, - }, -}; - -export default function () { - const headers = {'Content-Type': 'application/json'}; - const res = http.post(`http://${host}/generate`, example.payload, { - headers, - }); - - check(res, { - 'Post status is 200': (r) => res.status === 200, - 'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens, - }); - - if (res.status === 200) { - totalTime.add(res.headers["X-Total-Time"]); - validationTime.add(res.headers["X-Validation-Time"]); - queueTime.add(res.headers["X-Queue-Time"]); - inferenceTime.add(res.headers["X-Inference-Time"]); - timePerToken.add(res.headers["X-Time-Per-Token"]); - } -} diff --git a/router/client/src/client.rs b/router/client/src/client.rs index fc49e2a8..4a0d44ee 100644 --- a/router/client/src/client.rs +++ b/router/client/src/client.rs @@ -110,6 +110,7 @@ impl Client { max_prefill_tokens: u32, max_total_tokens: u32, max_batch_size: Option, + model_id: &str ) -> Result> { let warmup_enabled: bool = env::var("WARMUP_ENABLED").ok().map_or(true, |value| value.to_lowercase() == "true"); if !warmup_enabled { @@ -152,25 +153,76 @@ impl Client { let mut batch_counter: u64 = 0; let mut request_counter: u64 = 0; - for shape in shapes.iter() { - let (batch_size, seq_length) = shape; - let mut batches: Vec = vec![ - self.create_warmup_batch( - *shape, - &mut batch_counter, - &mut request_counter, - max_input_length, - max_total_tokens, - seq_bucket_size, - false, - None, - ) - ]; - // if possible, create second batch in order to trigger concatenate operation - if *batch_size < max_decode_batch_size { - batches.push( + if model_id.contains("llava") { + let mut n_tokens = 0; + let mut requests = Vec::new(); + // Create requests + while n_tokens < max_prefill_tokens { + let truncate = cmp::min(max_input_length, max_prefill_tokens - n_tokens); + + let mut inputs = String::new(); + inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)"); + inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize)); + + requests.push(Request { + id: 0, + // We truncate the input on the server side to be sure that it has the correct size + inputs, + truncate, + // Set sampling parameters to also take these ops into account in the max memory + parameters: Some(NextTokenChooserParameters { + temperature: 0.9, + top_k: 10, + top_p: 0.9, + typical_p: 0.9, + 
do_sample: false, + seed: 0, + repetition_penalty: 1.2, + frequency_penalty: 0.1, + watermark: true, + grammar: String::new(), + grammar_type: GrammarType::None as i32, + }), + stopping_parameters: Some(StoppingCriteriaParameters { + max_new_tokens: max_total_tokens - truncate, + stop_sequences: vec![], + ignore_eos_token: true, + }), + prefill_logprobs: true, + top_n_tokens: 20, + }); + n_tokens += max_input_length; + + // Check max_batch_size + if Some(requests.len()) == max_batch_size { + break; + } + } + + let mut batches = Vec::new(); + batches.push(Batch { + id: 0, + size: requests.len() as u32, + requests, + max_tokens: 0, + }); + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }) + .inject_context(); + let response = self.stub.warmup(request).await?.into_inner(); + Ok(response.max_supported_total_tokens) + } + else { + for shape in shapes.iter() { + let (batch_size, seq_length) = shape; + let mut batches: Vec = vec![ self.create_warmup_batch( - (1, *seq_length), + *shape, &mut batch_counter, &mut request_counter, max_input_length, @@ -179,56 +231,45 @@ impl Client { false, None, ) - ); + ]; + // if possible, create second batch in order to trigger concatenate operation + if *batch_size < max_decode_batch_size { + batches.push( + self.create_warmup_batch( + (1, *seq_length), + &mut batch_counter, + &mut request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + false, + None, + ) + ); + } + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }).inject_context(); + let _response = self.stub.warmup(request).await?.into_inner(); } - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); - } + // send batches to warmup all possible decode shapes + if decode_batch_sizes.len() > 1 { + let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size { + decode_bucket_size + } else { + decode_bucket_size.div_ceil(max_prefill_batch_size) + }; + let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket; - // send batches to warmup all possible decode shapes - if decode_batch_sizes.len() > 1 { - let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size { - decode_bucket_size - } else { - decode_bucket_size.div_ceil(max_prefill_batch_size) - }; - let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket; - - let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size); - let mut batches: Vec = vec![ - self.create_warmup_batch( - (requests_send, seq_bucket_size), - &mut batch_counter, - &mut request_counter, - max_input_length, - max_total_tokens, - seq_bucket_size, - false, - Some(max_new_tokens), - ) - ]; - - let get_current_decode_batch_size = |num: u32| -> u32 { - decode_batch_sizes.iter() - .filter(|&&x| x >= num) - .min() - .copied() - .unwrap() - }; - - let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send); - while current_decode_batch_size < max_decode_batch_size { - let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send; - let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size); - batches.push( + let mut requests_send: u32 = cmp::min(max_prefill_batch_size, 
decode_bucket_size); + let mut batches: Vec = vec![ self.create_warmup_batch( - (num_requests, seq_bucket_size), + (requests_send, seq_bucket_size), &mut batch_counter, &mut request_counter, max_input_length, @@ -237,48 +278,74 @@ impl Client { false, Some(max_new_tokens), ) - ); + ]; - requests_send += num_requests; - current_decode_batch_size = get_current_decode_batch_size(requests_send); + let get_current_decode_batch_size = |num: u32| -> u32 { + decode_batch_sizes.iter() + .filter(|&&x| x >= num) + .min() + .copied() + .unwrap() + }; + + let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send); + while current_decode_batch_size < max_decode_batch_size { + let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send; + let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size); + batches.push( + self.create_warmup_batch( + (num_requests, seq_bucket_size), + &mut batch_counter, + &mut request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + false, + Some(max_new_tokens), + ) + ); + + requests_send += num_requests; + current_decode_batch_size = get_current_decode_batch_size(requests_send); + } + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }).inject_context(); + let _response = self.stub.warmup(request).await?.into_inner(); } - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); - } - - // send batches with default params to warm up Greedy search - let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len()); - for batch_size in &prefill_batch_sizes { - greedy_shapes.push((*batch_size, seq_bucket_size.clone())); - } - for greedy_shape in greedy_shapes.iter() { - let batches: Vec = vec![ - self.create_warmup_batch( - *greedy_shape, - &mut batch_counter, - &mut request_counter, + // send batches with default params to warm up Greedy search + let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len()); + for batch_size in &prefill_batch_sizes { + greedy_shapes.push((*batch_size, seq_bucket_size.clone())); + } + for greedy_shape in greedy_shapes.iter() { + let batches: Vec = vec![ + self.create_warmup_batch( + *greedy_shape, + &mut batch_counter, + &mut request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + true, + None, + ) + ]; + let request = tonic::Request::new(WarmupRequest { + batches, max_input_length, + max_prefill_tokens, max_total_tokens, - seq_bucket_size, - true, - None, - ) - ]; - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); + }).inject_context(); + let _response = self.stub.warmup(request).await?.into_inner(); + } + Ok(None) // No support for maximum total tokens } - Ok(None) // No support for maximum total tokens } #[instrument(skip_all)] diff --git a/router/client/src/sharded_client.rs b/router/client/src/sharded_client.rs index e2c800dd..fdd84035 100644 --- a/router/client/src/sharded_client.rs +++ b/router/client/src/sharded_client.rs @@ -100,6 +100,7 @@ impl ShardedClient { max_prefill_tokens: u32, max_total_tokens: u32, max_batch_size: Option, + model_id: &str, ) -> Result> { let futures: Vec<_> 
= self .clients @@ -110,6 +111,7 @@ impl ShardedClient { max_prefill_tokens, max_total_tokens, max_batch_size, + model_id )) }) .collect(); diff --git a/router/grpc-metadata/src/lib.rs b/router/grpc-metadata/src/lib.rs index 7ba353fa..3068a61c 100644 --- a/router/grpc-metadata/src/lib.rs +++ b/router/grpc-metadata/src/lib.rs @@ -2,30 +2,9 @@ //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples use opentelemetry::global; -use opentelemetry::propagation::{Extractor, Injector}; +use opentelemetry::propagation::Injector; use tracing_opentelemetry::OpenTelemetrySpanExt; -/// Extract context metadata from a gRPC request's metadata -struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap); - -impl<'a> Extractor for MetadataExtractor<'a> { - /// Get a value for a key from the MetadataMap. If the value can't be converted to &str, returns None - fn get(&self, key: &str) -> Option<&str> { - self.0.get(key).and_then(|metadata| metadata.to_str().ok()) - } - - /// Collect all the keys from the MetadataMap. - fn keys(&self) -> Vec<&str> { - self.0 - .keys() - .map(|key| match key { - tonic::metadata::KeyRef::Ascii(v) => v.as_str(), - tonic::metadata::KeyRef::Binary(v) => v.as_str(), - }) - .collect::>() - } -} - /// Inject context in the metadata of a gRPC request. struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap); diff --git a/router/src/config.rs b/router/src/config.rs index 88cde69a..d27b1136 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -100,7 +100,6 @@ impl LlavaNext { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] pub struct ClipVisionModel { image_size: usize, @@ -108,7 +107,6 @@ pub struct ClipVisionModel { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] pub struct Idefics2 {} @@ -118,6 +116,24 @@ impl Idefics2 { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct PaliTextConfig { + num_image_tokens: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Paligemma { + text_config: PaliTextConfig, +} + +impl Paligemma { + pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize { + self.text_config.num_image_tokens + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] @@ -132,12 +148,15 @@ pub enum Config { Santacoder, Bloom, Mpt, + Gpt2, GptNeox, Phi, #[serde(rename = "phi-msft")] PhiMsft, + Phi3, Llama, Baichuan, + Paligemma(Paligemma), Gemma, Cohere, Drbx, diff --git a/router/src/infer.rs b/router/src/infer.rs index d48b47f6..04328246 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -4,7 +4,7 @@ use crate::validation::{Validation, ValidationError}; use crate::{ ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse, - HubTokenizerConfig, Message, PrefillToken, Queue, Token, + HubTokenizerConfig, Message, MessageChunk, PrefillToken, Queue, Text, TextMessage, Token, }; use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools}; use futures::future::try_join_all; @@ -373,16 +373,15 @@ impl ChatTemplate { if self.use_default_tool_template { if let Some(last_message) = messages.last_mut() { if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt { - last_message.content = Some(format!( - "{}\n---\n{}\n{}", 
- last_message.content.as_deref().unwrap_or_default(), - tool_prompt, - tools - )); + last_message.content.push(MessageChunk::Text(Text { + text: format!("\n---\n{}\n{}", tool_prompt, tools), + })); } } } + let messages: Vec = messages.into_iter().map(|c| c.into()).collect(); + self.template .render(ChatTemplateInputs { messages, @@ -950,8 +949,7 @@ impl InferError { #[cfg(test)] mod tests { use crate::infer::raise_exception; - use crate::ChatTemplateInputs; - use crate::Message; + use crate::{ChatTemplateInputs, TextMessage}; use minijinja::Environment; #[test] @@ -985,29 +983,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1055,35 +1045,25 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi again!".to_string()), - name: None, - tool_calls: None, + content: "Hi again!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1136,29 +1116,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1195,29 +1167,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, + content: "Hi!".to_string(), }, - Message { + 
TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1240,34 +1204,24 @@ mod tests { #[test] fn test_many_chat_templates() { let example_chat = vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hello, how are you?".to_string()), - name: None, - tool_calls: None, + content: "Hello, how are you?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("I'm doing great. How can I help you today?".to_string()), - name: None, - tool_calls: None, + content: "I'm doing great. How can I help you today?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("I'd like to show off how chat templating works!".to_string()), - name: None, - tool_calls: None, + content: "I'd like to show off how chat templating works!".to_string(), }, ]; - let example_chat_with_system = vec![Message { + let example_chat_with_system = [TextMessage { role: "system".to_string(), - content: Some( - "You are a friendly chatbot who always responds in the style of a pirate" - .to_string(), - ), - name: None, - tool_calls: None, + content: "You are a friendly chatbot who always responds in the style of a pirate" + .to_string(), }] .iter() .chain(&example_chat) @@ -1384,7 +1338,7 @@ mod tests { { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); - let tmpl = env.template_from_str(&chat_template); + let tmpl = env.template_from_str(chat_template); let result = tmpl.unwrap().render(input).unwrap(); assert_eq!(result, target); } @@ -1407,17 +1361,13 @@ mod tests { chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", input: ChatTemplateInputs { messages: vec![ - Message { + TextMessage{ role: "system".to_string(), - content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()), - name: None, - tool_calls: None, + content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(), }, - Message { + TextMessage{ role: "user".to_string(), - content: Some("How many helicopters can a human eat in one sitting?".to_string()), - name: None, - tool_calls: None, + content: "How many helicopters can a human eat in one sitting?".to_string(), }, ], add_generation_prompt: true, diff --git a/router/src/lib.rs b/router/src/lib.rs index fac4c14e..ba1d9acc 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -11,6 +11,7 @@ use queue::{Entry, Queue}; use serde::{Deserialize, Serialize}; use tokio::sync::OwnedSemaphorePermit; use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::warn; use utoipa::ToSchema; use validation::Validation; @@ 
-159,6 +160,8 @@ pub struct Info { #[schema(example = "32")] pub max_client_batch_size: usize, /// Router Info + #[schema(example = "text-generation-router")] + pub router: &'static str, #[schema(example = "0.5.0")] pub version: &'static str, #[schema(nullable = true, example = "null")] @@ -399,6 +402,11 @@ pub struct CompletionRequest { #[serde(default)] #[schema(example = "1.0")] pub frequency_penalty: Option, + + /// Up to 4 sequences where the API will stop generating further tokens. + #[serde(default)] + #[schema(nullable = true, example = "null")] + pub stop: Option>, } #[derive(Clone, Deserialize, Serialize, ToSchema, Default)] @@ -438,7 +446,7 @@ pub(crate) struct ChatCompletion { #[derive(Clone, Deserialize, Serialize, ToSchema)] pub(crate) struct ChatCompletionComplete { pub index: u32, - pub message: Message, + pub message: OutputMessage, pub logprobs: Option, pub finish_reason: String, } @@ -531,6 +539,30 @@ impl ChatCompletion { return_logprobs: bool, tool_calls: Option>, ) -> Self { + let message = match (output, tool_calls) { + (Some(content), None) => OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content, + }), + (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage { + role: "assistant".to_string(), + tool_calls, + }), + (Some(output), Some(_)) => { + warn!("Received both chat and tool call"); + OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content: output, + }) + } + (None, None) => { + warn!("Didn't receive an answer"); + OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content: "".to_string(), + }) + } + }; Self { id: String::new(), object: "text_completion".into(), @@ -539,12 +571,7 @@ impl ChatCompletion { system_fingerprint, choices: vec![ChatCompletionComplete { index: 0, - message: Message { - role: "assistant".into(), - content: output, - name: None, - tool_calls, - }, + message, logprobs: return_logprobs .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))), finish_reason: details.finish_reason.to_string(), @@ -566,7 +593,8 @@ pub(crate) struct CompletionCompleteChunk { pub model: String, pub system_fingerprint: String, } -#[derive(Clone, Deserialize, Serialize, ToSchema)] + +#[derive(Clone, Serialize, ToSchema)] pub(crate) struct ChatCompletionChunk { pub id: String, pub object: String, @@ -578,7 +606,7 @@ pub(crate) struct ChatCompletionChunk { pub choices: Vec, } -#[derive(Clone, Deserialize, Serialize, ToSchema)] +#[derive(Clone, Serialize, ToSchema)] pub(crate) struct ChatCompletionChoice { pub index: u32, pub delta: ChatCompletionDelta, @@ -586,21 +614,21 @@ pub(crate) struct ChatCompletionChoice { pub finish_reason: Option, } -#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)] -pub(crate) struct ChatCompletionDelta { - #[schema(example = "user")] - // TODO Modify this to a true enum. 
- #[serde(default, skip_serializing_if = "Option::is_none")] - pub role: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - #[schema(example = "What is Deep Learning?")] - pub content: Option, - // default to None - #[serde(default, skip_serializing_if = "Option::is_none")] - pub tool_calls: Option, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct ToolCallDelta { + #[schema(example = "assistant")] + role: String, + tool_calls: DeltaToolCall, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)] +#[derive(Clone, Debug, Serialize, ToSchema)] +#[serde(untagged)] +enum ChatCompletionDelta { + Chat(TextMessage), + Tool(ToolCallDelta), +} + +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] pub(crate) struct DeltaToolCall { pub index: u32, pub id: String, @@ -608,7 +636,7 @@ pub(crate) struct DeltaToolCall { pub function: Function, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)] +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] pub(crate) struct Function { pub name: Option, pub arguments: String, @@ -626,15 +654,13 @@ impl ChatCompletionChunk { finish_reason: Option, ) -> Self { let delta = match (delta, tool_calls) { - (Some(delta), _) => ChatCompletionDelta { - role: Some("assistant".to_string()), - content: Some(delta), - tool_calls: None, - }, - (None, Some(tool_calls)) => ChatCompletionDelta { - role: Some("assistant".to_string()), - content: None, - tool_calls: Some(DeltaToolCall { + (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: delta, + }), + (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { + role: "assistant".to_string(), + tool_calls: DeltaToolCall { index: 0, id: String::new(), r#type: "function".to_string(), @@ -642,13 +668,12 @@ impl ChatCompletionChunk { name: None, arguments: tool_calls[0].to_string(), }, - }), - }, - (None, None) => ChatCompletionDelta { - role: None, - content: None, - tool_calls: None, - }, + }, + }), + (None, None) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "".to_string(), + }), }; Self { id: String::new(), @@ -849,7 +874,7 @@ where state.end() } -#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default)] +#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)] pub(crate) struct FunctionDefinition { #[serde(default)] pub description: Option, @@ -869,7 +894,7 @@ pub(crate) struct Tool { #[derive(Clone, Serialize, Deserialize, Default)] pub(crate) struct ChatTemplateInputs<'a> { - messages: Vec, + messages: Vec, bos_token: Option<&'a str>, eos_token: Option<&'a str>, add_generation_prompt: bool, @@ -877,88 +902,113 @@ pub(crate) struct ChatTemplateInputs<'a> { tools_prompt: Option<&'a str>, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)] pub(crate) struct ToolCall { - pub id: u32, + pub id: String, pub r#type: String, pub function: FunctionDefinition, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct Text { - #[serde(default)] - pub text: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct Url { + url: String, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct ImageUrl { - #[serde(default)] - pub url: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct ImageUrl 
{ + image_url: Url, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct Content { - pub r#type: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct Text { + text: String, +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[serde(tag = "type")] +#[serde(rename_all = "snake_case")] +enum MessageChunk { + Text(Text), + ImageUrl(ImageUrl), +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct Message { + #[schema(example = "user")] + role: String, + #[schema(example = "My name is David and I")] + #[serde(deserialize_with = "message_content_serde::deserialize")] + content: Vec, #[serde(default, skip_serializing_if = "Option::is_none")] - pub text: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub image_url: Option, + #[schema(example = "\"David\"")] + name: Option, } mod message_content_serde { use super::*; - use serde::de; - use serde::Deserializer; - use serde_json::Value; + use serde::{Deserialize, Deserializer}; - pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> + pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { - let value = Value::deserialize(deserializer)?; - match value { - Value::String(s) => Ok(Some(s)), - Value::Array(arr) => { - let results: Result, _> = arr - .into_iter() - .map(|v| { - let content: Content = - serde_json::from_value(v).map_err(de::Error::custom)?; - match content.r#type.as_str() { - "text" => Ok(content.text.unwrap_or_default()), - "image_url" => { - if let Some(url) = content.image_url { - Ok(format!("![]({})", url.url)) - } else { - Ok(String::new()) - } - } - _ => Err(de::Error::custom("invalid content type")), - } - }) - .collect(); - - results.map(|strings| Some(strings.join(""))) + #[derive(Deserialize)] + #[serde(untagged)] + enum Message { + Text(String), + Chunks(Vec), + } + let message: Message = Deserialize::deserialize(deserializer)?; + let chunks = match message { + Message::Text(text) => { + vec![MessageChunk::Text(Text { text })] } - Value::Null => Ok(None), - _ => Err(de::Error::custom("invalid token format")), + Message::Chunks(s) => s, + }; + Ok(chunks) + } +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct TextMessage { + #[schema(example = "user")] + pub role: String, + #[schema(example = "My name is David and I")] + pub content: String, +} + +impl From for TextMessage { + fn from(value: Message) -> Self { + TextMessage { + role: value.role, + content: value + .content + .into_iter() + .map(|c| match c { + MessageChunk::Text(Text { text }) => text, + MessageChunk::ImageUrl(image) => { + let url = image.image_url.url; + format!("![]({url})") + } + }) + .collect::>() + .join(""), } } } -#[derive(Clone, Deserialize, ToSchema, Serialize, Debug)] -pub(crate) struct Message { - #[schema(example = "user")] - pub role: String, - #[serde(skip_serializing_if = "Option::is_none")] - #[schema(example = "My name is David and I")] - #[serde(deserialize_with = "message_content_serde::deserialize")] - pub content: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - #[schema(example = "\"David\"")] - pub name: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub tool_calls: Option>, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct ToolCallMessage { + #[schema(example = "assistant")] + role: String, + tool_calls: Vec, +} + +#[derive(Clone, 
Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[serde(untagged)] +pub(crate) enum OutputMessage { + ChatMessage(TextMessage), + ToolCall(ToolCallMessage), } #[derive(Clone, Debug, Deserialize, ToSchema)] @@ -1121,7 +1171,7 @@ pub(crate) struct ErrorResponse { #[cfg(test)] mod tests { use super::*; - + use serde_json::json; use tokenizers::Tokenizer; pub(crate) async fn get_tokenizer() -> Tokenizer { @@ -1189,4 +1239,100 @@ mod tests { ); assert_eq!(config.eos_token, Some("<|end▁of▁sentence|>".to_string())); } + + #[test] + fn test_chat_simple_string() { + let json = json!({ + "model": "", + "messages": [{ + "role": "user", + "content": "What is Deep Learning?" + }] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert_eq!( + request.messages[0], + Message { + role: "user".to_string(), + content: vec![MessageChunk::Text(Text { + text: "What is Deep Learning?".to_string() + }),], + name: None + } + ); + } + + #[test] + fn test_chat_request() { + let json = json!({ + "model": "", + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}}, + ] + }] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert_eq!( + request.messages[0], + Message{ + role: "user".to_string(), + content: vec![ + MessageChunk::Text(Text { text: "Whats in this image?".to_string() }), + MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }) + ], + name: None + } + ); + } + + #[test] + fn text_message_convert() { + let message = Message{ + role: "user".to_string(), + content: vec![ + MessageChunk::Text(Text { text: "Whats in this image?".to_string() }), + MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }) + ], + name: None + }; + let textmsg: TextMessage = message.into(); + assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)"); + } + #[test] + fn openai_output() { + let message = OutputMessage::ChatMessage(TextMessage { + role: "assistant".to_string(), + content: "This is the answer".to_string(), + }); + let serialized = serde_json::to_string(&message).unwrap(); + assert_eq!( + serialized, + r#"{"role":"assistant","content":"This is the answer"}"# + ); + + let message = OutputMessage::ToolCall(ToolCallMessage { + role: "assistant".to_string(), + tool_calls: vec![ToolCall { + id: "0".to_string(), + r#type: "function".to_string(), + function: FunctionDefinition { + description: None, + name: "myfn".to_string(), + arguments: json!({ + "format": "csv" + }), + }, + }], + }); + let serialized = serde_json::to_string(&message).unwrap(); + assert_eq!( + serialized, + r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"# + ); + } } diff --git a/router/src/main.rs b/router/src/main.rs index ae7666a0..4f9f0f73 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -23,7 +23,7 @@ use tokenizers::Tokenizer; use tower_http::cors::AllowOrigin; use tracing_subscriber::layer::SubscriberExt; use 
tracing_subscriber::util::SubscriberInitExt; -use tracing_subscriber::{EnvFilter, Layer}; +use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer}; /// App Configuration #[derive(Parser, Debug)] @@ -349,6 +349,7 @@ async fn main() -> Result<(), RouterError> { max_batch_prefill_tokens, max_total_tokens as u32, max_batch_size, + &model_info.model_id ) .await .map_err(RouterError::Warmup)? @@ -482,8 +483,21 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) { } // Filter events with LOG_LEVEL - let env_filter = - EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info")); + let varname = "LOG_LEVEL"; + let env_filter = if let Ok(log_level) = std::env::var(varname) { + // Override to avoid simple logs to be spammed with tokio level informations + let log_level = match &log_level[..] { + "warn" => "text_generation_launcher=warn,text_generation_router=warn", + "info" => "text_generation_launcher=info,text_generation_router=info", + "debug" => "text_generation_launcher=debug,text_generation_router=debug", + log_level => log_level, + }; + EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .parse_lossy(log_level) + } else { + EnvFilter::new("info") + }; tracing_subscriber::registry() .with(env_filter) diff --git a/router/src/server.rs b/router/src/server.rs index 7343c852..1edcc472 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -599,9 +599,22 @@ async fn completions( let span = tracing::Span::current(); metrics::increment_counter!("tgi_request_count"); - let stream = req.stream; - let max_new_tokens = req.max_tokens.or(Some(100)); - let seed = req.seed; + let CompletionRequest { + max_tokens, + seed, + stop, + stream, + temperature, + .. + } = req; + + let max_new_tokens = max_tokens.or(Some(100)); + let stop = stop.unwrap_or_default(); + // enable greedy only when temperature is 0 + let (do_sample, temperature) = match temperature { + Some(temperature) if temperature == 0.0 => (false, None), + other => (true, other), + }; // if suffix is present throw an error if req.suffix.is_some() { @@ -637,16 +650,16 @@ async fn completions( inputs: prompt.to_string(), parameters: GenerateParameters { best_of: None, - temperature: req.temperature, + temperature, repetition_penalty: req.repetition_penalty, frequency_penalty: req.frequency_penalty, top_k: None, top_p: req.top_p, typical_p: None, - do_sample: true, + do_sample, max_new_tokens, return_full_text: None, - stop: Vec::new(), + stop: stop.clone(), truncate: None, watermark: false, details: true, @@ -698,7 +711,7 @@ async fn completions( model: model_id.clone(), system_fingerprint: system_fingerprint.clone(), }) - .map_or_else(|_e| Event::default(), |data| data) + .unwrap_or_else(|_e| Event::default()) }; let (header_tx, header_rx) = oneshot::channel(); @@ -990,7 +1003,6 @@ async fn chat_completions( ) -> Result)> { let span = tracing::Span::current(); metrics::increment_counter!("tgi_request_count"); - let ChatRequest { logprobs, max_tokens, @@ -1124,13 +1136,10 @@ async fn chat_completions( logprobs, stream_token.details.map(|d| d.finish_reason.to_string()), )) - .map_or_else( - |e| { - println!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - }, - |data| data, - ) + .unwrap_or_else(|e| { + println!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + }) }; let (headers, response_stream) = generate_stream_internal( @@ -1165,7 +1174,7 @@ async fn chat_completions( ) })?; let tool_calls = vec![ToolCall { - id: 0, + id: 
"0".to_string(), r#type: "function".to_string(), function: FunctionDefinition { description: None, @@ -1568,6 +1577,7 @@ pub async fn run( max_batch_size, validation_workers, max_client_batch_size, + router: env!("CARGO_PKG_NAME"), version: env!("CARGO_PKG_VERSION"), sha: option_env!("VERGEN_GIT_SHA"), docker_label: option_env!("DOCKER_LABEL"), diff --git a/router/src/validation.rs b/router/src/validation.rs index db832042..ee48f705 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -565,6 +565,30 @@ fn prepare_input( inputs = modified_inputs; tokenizer_query } + Some(Config::Paligemma(config)) => { + let mut modified_inputs = String::with_capacity(inputs.len()); + let mut tokenizer_query = String::with_capacity(inputs.len()); + let mut start = 0; + for chunk in RE.find_iter(&inputs) { + let chunk_start = chunk.start(); + let chunk_end = chunk.end(); + if chunk_start != start { + modified_inputs.push_str(&inputs[start..chunk_start]); + tokenizer_query.push_str(&inputs[start..chunk_start]); + } + let (image_uri, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?; + let slots = config.get_number_of_features(height, width); + tokenizer_query.push_str(&"".repeat(slots)); + modified_inputs.push_str(&image_uri); + start = chunk_end; + } + if start != inputs.len() - 1 { + modified_inputs.push_str(&inputs[start..]); + tokenizer_query.push_str(&inputs[start..]); + } + inputs = modified_inputs; + tokenizer_query + } Some(Config::Idefics2(config)) => { let mut modified_inputs = String::with_capacity(inputs.len()); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 67982433..507ee859 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,6 +1,5 @@ [toolchain] -# Released on: 28 December, 2023 -# Branched from master on: 10 November, 2023 -# https://releases.rs/docs/1.75.0/ -channel = "1.75.0" +# Released on: 02 May, 2024 +# https://releases.rs/docs/1.78.0/ +channel = "1.78.0" components = ["rustfmt", "clippy"] diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2 index 803b3d1f..36ef576a 100644 --- a/server/Makefile-flash-att-v2 +++ b/server/Makefile-flash-att-v2 @@ -1,5 +1,5 @@ flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9 -flash_att_v2_commit_rocm := 8736558c287ff2ef28b24878e42828c595ac3e69 +flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6 flash-attention-v2-cuda: @@ -18,12 +18,12 @@ install-flash-attention-v2-cuda: build-flash-attention-v2-cuda flash-attention-v2-rocm: # Clone flash attention pip install -U packaging ninja --no-cache-dir - git clone https://github.com/fxmarty/flash-attention-rocm flash-attention-v2 + git clone https://github.com/ROCm/flash-attention.git flash-attention-v2 build-flash-attention-v2-rocm: flash-attention-v2-rocm cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) cd flash-attention-v2 && git submodule update --init --recursive - cd flash-attention-v2 && PYTORCH_ROCM_ARCH=gfx90a python setup.py build + cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build install-flash-attention-v2-rocm: build-flash-attention-v2-rocm cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 6f36c679..62fa413f 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -14,11 +14,11 @@ install-vllm-cuda: build-vllm-cuda vllm-rocm: # 
Clone vllm pip install -U ninja packaging --no-cache-dir - git clone https://github.com/fxmarty/vllm-public.git vllm + git clone https://github.com/fxmarty/rocm-vllm.git vllm build-vllm-rocm: vllm-rocm - cd vllm && git fetch && git checkout ad9b7c4095ef54419a0533d254f2ad84bd2dfcae - cd vllm && python setup.py build + cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479 + cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install install-vllm-rocm: build-vllm-rocm pip uninstall vllm -y || true diff --git a/server/exllama_kernels/exllama_kernels/hip_compat.cuh b/server/exllama_kernels/exllama_kernels/hip_compat.cuh index 5e698b1a..f2a3dcad 100644 --- a/server/exllama_kernels/exllama_kernels/hip_compat.cuh +++ b/server/exllama_kernels/exllama_kernels/hip_compat.cuh @@ -10,8 +10,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) { } __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { - return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), - static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))}; + return _Float16_2{ + _Float16_2{static_cast<_Float16>(1.0f), + static_cast<_Float16>(1.0f)} / x.data}; } #define hrcp __compat_hrcp diff --git a/server/poetry.lock b/server/poetry.lock index cdbbd581..f59225a7 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "accelerate" @@ -194,13 +194,13 @@ files = [ [[package]] name = "certifi" -version = "2024.6.2" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"}, - {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -474,13 +474,13 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.2.1" +version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] @@ -628,17 +628,17 @@ tqdm = ["tqdm"] [[package]] name = "googleapis-common-protos" -version = "1.63.1" +version = "1.63.2" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"}, - {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = 
"sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"}, + {file = "googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"}, + {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"}, ] [package.dependencies] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] @@ -942,13 +942,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.2.1" +version = "8.0.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.2.1-py3-none-any.whl", hash = "sha256:ffef94b0b66046dd8ea2d619b701fe978d9264d38f3998bc4c27ec3b146a87c8"}, - {file = "importlib_metadata-7.2.1.tar.gz", hash = "sha256:509ecb2ab77071db5137c655e24ceb3eee66e7bbc6574165d0d114d9fc4bbe68"}, + {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"}, + {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"}, ] [package.dependencies] @@ -1025,13 +1025,13 @@ files = [ [[package]] name = "jsonschema" -version = "4.22.0" +version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = true python-versions = ">=3.8" files = [ - {file = "jsonschema-4.22.0-py3-none-any.whl", hash = "sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"}, - {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"}, + {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, + {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, ] [package.dependencies] @@ -1042,7 +1042,7 @@ rpds-py = ">=0.7.1" [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] [[package]] name = "jsonschema-specifications" @@ -1580,13 +1580,13 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.5.40" +version = "12.5.82" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, + {file = 
"nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] [[package]] @@ -1756,24 +1756,24 @@ files = [ [[package]] name = "optimum" -version = "1.20.0" +version = "1.21.2" description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." optional = false python-versions = ">=3.7.0" files = [ - {file = "optimum-1.20.0-py3-none-any.whl", hash = "sha256:0c0d0746043c95e22cf3586946d7408d353f10c0486f1c7d2d11084a5cfc0ede"}, - {file = "optimum-1.20.0.tar.gz", hash = "sha256:b64c7536fe738db9b56605105efe72006401ad2aa00cb499ae407f2e06f3043b"}, + {file = "optimum-1.21.2-py3-none-any.whl", hash = "sha256:8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8"}, + {file = "optimum-1.21.2.tar.gz", hash = "sha256:037e65d265237809fac69e9003215c60cf6de56e97c62ff7565abab4a94a64ce"}, ] [package.dependencies] coloredlogs = "*" datasets = "*" huggingface-hub = ">=0.8.0" -numpy = "*" +numpy = "<2.0" packaging = "*" sympy = "*" torch = ">=1.11" -transformers = {version = ">=4.26.0,<4.42.0", extras = ["sentencepiece"]} +transformers = {version = ">=4.26.0,<4.43.0", extras = ["sentencepiece"]} [package.extras] amd = ["optimum-amd"] @@ -1786,15 +1786,16 @@ exporters-gpu = ["onnx", "onnxruntime-gpu", "timm"] exporters-tf = ["h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26.0,<4.38.0)"] furiosa = ["optimum-furiosa"] graphcore = ["optimum-graphcore"] -habana = ["optimum-habana", "transformers (>=4.38.0,<4.39.0)"] -intel = ["optimum-intel (>=1.16.0)"] -neural-compressor = ["optimum-intel[neural-compressor] (>=1.16.0)"] +habana = ["optimum-habana", "transformers (>=4.40.0,<4.41.0)"] +intel = ["optimum-intel (>=1.18.0)"] +ipex = ["optimum-intel[ipex] (>=1.18.0)"] +neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"] neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] -nncf = ["optimum-intel[nncf] (>=1.16.0)"] +nncf = ["optimum-intel[nncf] (>=1.18.0)"] onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)"] onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)"] -openvino = ["optimum-intel[openvino] (>=1.16.0)"] +openvino = ["optimum-intel[openvino] (>=1.18.0)"] quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"] tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] @@ -1970,84 +1971,95 @@ test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameter [[package]] name = "pillow" -version = "10.3.0" +version = "10.4.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.8" files = [ - {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"}, - {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"}, - {file = 
"pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"}, - {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"}, - {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"}, - {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"}, - {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"}, - {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"}, - {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"}, - {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"}, - {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"}, - {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"}, - {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"}, - {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"}, - {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"}, - {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"}, - {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"}, - {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"}, - {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"}, - {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"}, + {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"}, + {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"}, + {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = 
"sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = 
"sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = 
"pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, ] [package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] fpx = ["olefile"] mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] @@ -2156,52 +2168,42 @@ files = [ [[package]] name = "pyarrow" -version = "16.1.0" +version = "17.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, - {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, - {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, - {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, - {file = 
"pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, - {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, - {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, - {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, - {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, ] [package.dependencies] numpy = ">=1.16.6" +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyarrow-hotfix" version = "0.6" @@ -2215,109 +2217,119 @@ files = [ [[package]] name = "pydantic" -version = "2.7.4" +version = 
"2.8.2" description = "Data validation using Python type hints" optional = true python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, - {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, + {file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"}, + {file = "pydantic-2.8.2.tar.gz", hash = "sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.4" -typing-extensions = ">=4.6.1" +pydantic-core = "2.20.1" +typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} [package.extras] email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.4" +version = "2.20.1" description = "Core functionality for Pydantic validation and serialization" optional = true python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, - {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, - {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, - {file = 
"pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, - {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, - {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, - {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, - {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, - {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, - {file = 
"pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, - {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, - {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, - {file = 
"pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, - {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, - {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, - {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, + {file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"}, + {file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"}, + {file = 
"pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"}, + {file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = "sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"}, + {file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"}, + {file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"}, + {file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"}, + {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"}, + {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"}, + {file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"}, + {file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = 
"sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"}, + {file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"}, + {file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"}, + {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"}, + {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"}, + {file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"}, + {file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"}, + {file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"}, + {file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"}, + {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"}, + {file = 
"pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"}, + {file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"}, + {file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"}, + {file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"}, + {file = "pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"}, + {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"}, + {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"}, + {file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"}, + {file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"}, + {file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"}, + {file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"}, + {file = 
"pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"}, + {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"}, + {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"}, + {file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"}, + {file = "pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"}, + {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"}, ] [package.dependencies] @@ -2567,110 
+2579,110 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rpds-py" -version = "0.18.1" +version = "0.19.0" description = "Python bindings to Rust's persistent data structures (rpds)" optional = true python-versions = ">=3.8" files = [ - {file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"}, - {file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"}, - {file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"}, - {file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"}, - {file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"}, - {file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"}, - {file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = "sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"}, - {file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"}, - {file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"}, - {file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"}, - {file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"}, - {file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"}, - {file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"}, - {file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"}, - {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"}, - {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"}, - {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"}, - {file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"}, - {file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"}, - {file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"}, - {file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"}, - {file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"}, - {file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"}, - 
{file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"}, - {file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"}, + {file = "rpds_py-0.19.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:fb37bd599f031f1a6fb9e58ec62864ccf3ad549cf14bac527dbfa97123edcca4"}, + {file = "rpds_py-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3384d278df99ec2c6acf701d067147320b864ef6727405d6470838476e44d9e8"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e54548e0be3ac117595408fd4ca0ac9278fde89829b0b518be92863b17ff67a2"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8eb488ef928cdbc05a27245e52de73c0d7c72a34240ef4d9893fdf65a8c1a955"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5da93debdfe27b2bfc69eefb592e1831d957b9535e0943a0ee8b97996de21b5"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79e205c70afddd41f6ee79a8656aec738492a550247a7af697d5bd1aee14f766"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:959179efb3e4a27610e8d54d667c02a9feaa86bbabaf63efa7faa4dfa780d4f1"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a6e605bb9edcf010f54f8b6a590dd23a4b40a8cb141255eec2a03db249bc915b"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9133d75dc119a61d1a0ded38fb9ba40a00ef41697cc07adb6ae098c875195a3f"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd36b712d35e757e28bf2f40a71e8f8a2d43c8b026d881aa0c617b450d6865c9"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354f3a91718489912f2e0fc331c24eaaf6a4565c080e00fbedb6015857c00582"}, + {file = 
"rpds_py-0.19.0-cp310-none-win32.whl", hash = "sha256:ebcbf356bf5c51afc3290e491d3722b26aaf5b6af3c1c7f6a1b757828a46e336"}, + {file = "rpds_py-0.19.0-cp310-none-win_amd64.whl", hash = "sha256:75a6076289b2df6c8ecb9d13ff79ae0cad1d5fb40af377a5021016d58cd691ec"}, + {file = "rpds_py-0.19.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6d45080095e585f8c5097897313def60caa2046da202cdb17a01f147fb263b81"}, + {file = "rpds_py-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5c9581019c96f865483d031691a5ff1cc455feb4d84fc6920a5ffc48a794d8a"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1540d807364c84516417115c38f0119dfec5ea5c0dd9a25332dea60b1d26fc4d"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e65489222b410f79711dc3d2d5003d2757e30874096b2008d50329ea4d0f88c"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da6f400eeb8c36f72ef6646ea530d6d175a4f77ff2ed8dfd6352842274c1d8b"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37f46bb11858717e0efa7893c0f7055c43b44c103e40e69442db5061cb26ed34"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:071d4adc734de562bd11d43bd134330fb6249769b2f66b9310dab7460f4bf714"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9625367c8955e4319049113ea4f8fee0c6c1145192d57946c6ffcd8fe8bf48dd"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e19509145275d46bc4d1e16af0b57a12d227c8253655a46bbd5ec317e941279d"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d438e4c020d8c39961deaf58f6913b1bf8832d9b6f62ec35bd93e97807e9cbc"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90bf55d9d139e5d127193170f38c584ed3c79e16638890d2e36f23aa1630b952"}, + {file = "rpds_py-0.19.0-cp311-none-win32.whl", hash = "sha256:8d6ad132b1bc13d05ffe5b85e7a01a3998bf3a6302ba594b28d61b8c2cf13aaf"}, + {file = "rpds_py-0.19.0-cp311-none-win_amd64.whl", hash = "sha256:7ec72df7354e6b7f6eb2a17fa6901350018c3a9ad78e48d7b2b54d0412539a67"}, + {file = "rpds_py-0.19.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5095a7c838a8647c32aa37c3a460d2c48debff7fc26e1136aee60100a8cd8f68"}, + {file = "rpds_py-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f2f78ef14077e08856e788fa482107aa602636c16c25bdf59c22ea525a785e9"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7cc6cb44f8636fbf4a934ca72f3e786ba3c9f9ba4f4d74611e7da80684e48d2"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf902878b4af334a09de7a45badbff0389e7cf8dc2e4dcf5f07125d0b7c2656d"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:688aa6b8aa724db1596514751ffb767766e02e5c4a87486ab36b8e1ebc1aedac"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57dbc9167d48e355e2569346b5aa4077f29bf86389c924df25c0a8b9124461fb"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4cf5a9497874822341c2ebe0d5850fed392034caadc0bad134ab6822c0925b"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:8a790d235b9d39c70a466200d506bb33a98e2ee374a9b4eec7a8ac64c2c261fa"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d16089dfa58719c98a1c06f2daceba6d8e3fb9b5d7931af4a990a3c486241cb"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bc9128e74fe94650367fe23f37074f121b9f796cabbd2f928f13e9661837296d"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8f77e661ffd96ff104bebf7d0f3255b02aa5d5b28326f5408d6284c4a8b3248"}, + {file = "rpds_py-0.19.0-cp312-none-win32.whl", hash = "sha256:5f83689a38e76969327e9b682be5521d87a0c9e5a2e187d2bc6be4765f0d4600"}, + {file = "rpds_py-0.19.0-cp312-none-win_amd64.whl", hash = "sha256:06925c50f86da0596b9c3c64c3837b2481337b83ef3519e5db2701df695453a4"}, + {file = "rpds_py-0.19.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:52e466bea6f8f3a44b1234570244b1cff45150f59a4acae3fcc5fd700c2993ca"}, + {file = "rpds_py-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e21cc693045fda7f745c790cb687958161ce172ffe3c5719ca1764e752237d16"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b31f059878eb1f5da8b2fd82480cc18bed8dcd7fb8fe68370e2e6285fa86da6"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dd46f309e953927dd018567d6a9e2fb84783963650171f6c5fe7e5c41fd5666"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34a01a4490e170376cd79258b7f755fa13b1a6c3667e872c8e35051ae857a92b"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcf426a8c38eb57f7bf28932e68425ba86def6e756a5b8cb4731d8e62e4e0223"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68eea5df6347d3f1378ce992d86b2af16ad7ff4dcb4a19ccdc23dea901b87fb"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dab8d921b55a28287733263c0e4c7db11b3ee22aee158a4de09f13c93283c62d"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6fe87efd7f47266dfc42fe76dae89060038f1d9cb911f89ae7e5084148d1cc08"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:535d4b52524a961d220875688159277f0e9eeeda0ac45e766092bfb54437543f"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8b1a94b8afc154fbe36978a511a1f155f9bd97664e4f1f7a374d72e180ceb0ae"}, + {file = "rpds_py-0.19.0-cp38-none-win32.whl", hash = "sha256:7c98298a15d6b90c8f6e3caa6457f4f022423caa5fa1a1ca7a5e9e512bdb77a4"}, + {file = "rpds_py-0.19.0-cp38-none-win_amd64.whl", hash = "sha256:b0da31853ab6e58a11db3205729133ce0df26e6804e93079dee095be3d681dc1"}, + {file = "rpds_py-0.19.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5039e3cef7b3e7a060de468a4a60a60a1f31786da94c6cb054e7a3c75906111c"}, + {file = "rpds_py-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab1932ca6cb8c7499a4d87cb21ccc0d3326f172cfb6a64021a889b591bb3045c"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2afd2164a1e85226fcb6a1da77a5c8896c18bfe08e82e8ceced5181c42d2179"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1c30841f5040de47a0046c243fc1b44ddc87d1b12435a43b8edff7e7cb1e0d0"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:f757f359f30ec7dcebca662a6bd46d1098f8b9fb1fcd661a9e13f2e8ce343ba1"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15e65395a59d2e0e96caf8ee5389ffb4604e980479c32742936ddd7ade914b22"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb0f6eb3a320f24b94d177e62f4074ff438f2ad9d27e75a46221904ef21a7b05"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b228e693a2559888790936e20f5f88b6e9f8162c681830eda303bad7517b4d5a"}, + {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2575efaa5d949c9f4e2cdbe7d805d02122c16065bfb8d95c129372d65a291a0b"}, + {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:5c872814b77a4e84afa293a1bee08c14daed1068b2bb1cc312edbf020bbbca2b"}, + {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:850720e1b383df199b8433a20e02b25b72f0fded28bc03c5bd79e2ce7ef050be"}, + {file = "rpds_py-0.19.0-cp39-none-win32.whl", hash = "sha256:ce84a7efa5af9f54c0aa7692c45861c1667080814286cacb9958c07fc50294fb"}, + {file = "rpds_py-0.19.0-cp39-none-win_amd64.whl", hash = "sha256:1c26da90b8d06227d7769f34915913911222d24ce08c0ab2d60b354e2d9c7aff"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:75969cf900d7be665ccb1622a9aba225cf386bbc9c3bcfeeab9f62b5048f4a07"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8445f23f13339da640d1be8e44e5baf4af97e396882ebbf1692aecd67f67c479"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5a7c1062ef8aea3eda149f08120f10795835fc1c8bc6ad948fb9652a113ca55"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:462b0c18fbb48fdbf980914a02ee38c423a25fcc4cf40f66bacc95a2d2d73bc8"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3208f9aea18991ac7f2b39721e947bbd752a1abbe79ad90d9b6a84a74d44409b"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3444fe52b82f122d8a99bf66777aed6b858d392b12f4c317da19f8234db4533"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb4bac7185a9f0168d38c01d7a00addece9822a52870eee26b8d5b61409213"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b130bd4163c93798a6b9bb96be64a7c43e1cec81126ffa7ffaa106e1fc5cef5"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:a707b158b4410aefb6b054715545bbb21aaa5d5d0080217290131c49c2124a6e"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dc9ac4659456bde7c567107556ab065801622396b435a3ff213daef27b495388"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:81ea573aa46d3b6b3d890cd3c0ad82105985e6058a4baed03cf92518081eec8c"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f148c3f47f7f29a79c38cc5d020edcb5ca780020fab94dbc21f9af95c463581"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0906357f90784a66e89ae3eadc2654f36c580a7d65cf63e6a616e4aec3a81be"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:f629ecc2db6a4736b5ba95a8347b0089240d69ad14ac364f557d52ad68cf94b0"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6feacd1d178c30e5bc37184526e56740342fd2aa6371a28367bad7908d454fc"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b6068ee374fdfab63689be0963333aa83b0815ead5d8648389a8ded593378"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d57546bad81e0da13263e4c9ce30e96dcbe720dbff5ada08d2600a3502e526"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b6683a37338818646af718c9ca2a07f89787551057fae57c4ec0446dc6224b"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e8481b946792415adc07410420d6fc65a352b45d347b78fec45d8f8f0d7496f0"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bec35eb20792ea64c3c57891bc3ca0bedb2884fbac2c8249d9b731447ecde4fa"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:aa5476c3e3a402c37779e95f7b4048db2cb5b0ed0b9d006983965e93f40fe05a"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:19d02c45f2507b489fd4df7b827940f1420480b3e2e471e952af4d44a1ea8e34"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3e2fd14c5d49ee1da322672375963f19f32b3d5953f0615b175ff7b9d38daed"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:93a91c2640645303e874eada51f4f33351b84b351a689d470f8108d0e0694210"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b9fc03bf76a94065299d4a2ecd8dfbae4ae8e2e8098bbfa6ab6413ca267709"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a4b07cdf3f84310c08c1de2c12ddadbb7a77568bcb16e95489f9c81074322ed"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba0ed0dc6763d8bd6e5de5cf0d746d28e706a10b615ea382ac0ab17bb7388633"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:474bc83233abdcf2124ed3f66230a1c8435896046caa4b0b5ab6013c640803cc"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329c719d31362355a96b435f4653e3b4b061fcc9eba9f91dd40804ca637d914e"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef9101f3f7b59043a34f1dccbb385ca760467590951952d6701df0da9893ca0c"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0121803b0f424ee2109d6e1f27db45b166ebaa4b32ff47d6aa225642636cd834"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8344127403dea42f5970adccf6c5957a71a47f522171fafaf4c6ddb41b61703a"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:443cec402ddd650bb2b885113e1dcedb22b1175c6be223b14246a714b61cd521"}, + {file = "rpds_py-0.19.0.tar.gz", hash = "sha256:4fdc9afadbeb393b4bbbad75481e0ea78e4469f2e1d713a90811700830b553a9"}, ] [[package]] @@ -2893,18 +2905,18 @@ files = [ [[package]] name = "setuptools" -version = "70.1.0" +version = "70.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false 
python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"}, - {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"}, + {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"}, + {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -2919,17 +2931,20 @@ files = [ [[package]] name = "sympy" -version = "1.12.1" +version = "1.13.0" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" files = [ - {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"}, - {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"}, + {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"}, + {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"}, ] [package.dependencies] -mpmath = ">=1.1.0,<1.4.0" +mpmath = ">=1.1.0,<1.4" + +[package.extras] +dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] [[package]] name = "tbb" diff --git a/server/pyproject.toml b/server/pyproject.toml index c91e8849..5b6f720a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "2.0.2" +version = "2.0.4" description = "Text Generation Inference Python gRPC Server" authors 
= ["Olivier Dehaene "] diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index c2714764..88fcc4f3 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13" googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13" grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" -requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" -setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" +setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index c2714764..88fcc4f3 100644 --- a/server/requirements_rocm.txt +++ 
b/server/requirements_rocm.txt @@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13" googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13" grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" -requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" -setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" +setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/tests/utils/test_layers.py b/server/tests/utils/test_layers.py index 93a0e982..9a8da0d6 100644 --- a/server/tests/utils/test_layers.py +++ b/server/tests/utils/test_layers.py @@ -1,5 +1,5 @@ import torch -from text_generation_server.utils.layers import ( +from text_generation_server.layers 
import ( TensorParallelEmbedding, ) diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 990c31be..74b87024 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -200,31 +200,27 @@ def download_weights( try: import json - medusa_head = hf_hub_download( - model_id, revision=revision, filename="medusa_lm_head.safetensors" - ) - medusa_config = hf_hub_download( + config = hf_hub_download( model_id, revision=revision, filename="config.json" ) - with open(medusa_config, "r") as f: + with open(config, "r") as f: config = json.load(f) - model_id = config["base_model_name_or_path"] - revision = "main" - try: - utils.weight_files(model_id, revision, extension) - logger.info( - f"Files for parent {model_id} are already present on the host. " - "Skipping download." - ) - return - # Local files not found - except ( - utils.LocalEntryNotFoundError, - FileNotFoundError, - utils.EntryNotFoundError, - ): - pass + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id and base_model_id != model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): pass @@ -241,31 +237,6 @@ def download_weights( if not extension == ".safetensors" or not auto_convert: raise e - elif (Path(model_id) / "medusa_lm_head.safetensors").exists(): - # Try to load as a local Medusa model - try: - import json - - medusa_head = Path(model_id) / "medusa_lm_head.safetensors" - medusa_config = Path(model_id) / "config.json" - with open(medusa_config, "r") as f: - config = json.load(f) - - model_id = config["base_model_name_or_path"] - revision = "main" - try: - utils.weight_files(model_id, revision, extension) - logger.info( - f"Files for parent {model_id} are already present on the host. " - "Skipping download." 
- ) - return - # Local files not found - except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): - pass - except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): - pass - elif (Path(model_id) / "adapter_config.json").exists(): # Try to load as a local PEFT model try: @@ -276,14 +247,43 @@ def download_weights( return except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): pass + elif (Path(model_id) / "config.json").exists(): + # Try to load as a local Medusa model + try: + import json + + config = Path(model_id) / "config.json" + with open(config, "r") as f: + config = json.load(f) + + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass # Try to see if there are local pytorch weights try: # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE - local_pt_files = utils.weight_files(model_id, revision, ".bin") + try: + local_pt_files = utils.weight_files(model_id, revision, ".bin") + except Exception: + local_pt_files = utils.weight_files(model_id, revision, ".pt") # No local pytorch weights - except utils.LocalEntryNotFoundError: + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): if extension == ".safetensors": logger.warning( f"No safetensors weights found for model {model_id} at revision {revision}. " diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py new file mode 100644 index 00000000..c29dd092 --- /dev/null +++ b/server/text_generation_server/layers/__init__.py @@ -0,0 +1,14 @@ +from text_generation_server.layers.tensor_parallel import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, +) +from text_generation_server.layers.linear import ( + get_linear, + FastLinear, +) +from text_generation_server.layers.speculative import SpeculativeHead + +# Just to add the `load` methods. 
+from text_generation_server.layers.layernorm import load_layer_norm +from text_generation_server.layers.conv import load_conv2d diff --git a/server/text_generation_server/utils/awq/conversion_utils.py b/server/text_generation_server/layers/awq/conversion_utils.py similarity index 100% rename from server/text_generation_server/utils/awq/conversion_utils.py rename to server/text_generation_server/layers/awq/conversion_utils.py diff --git a/server/text_generation_server/utils/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/qmodule.py similarity index 100% rename from server/text_generation_server/utils/awq/quantize/qmodule.py rename to server/text_generation_server/layers/awq/quantize/qmodule.py diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py new file mode 100644 index 00000000..ca39919c --- /dev/null +++ b/server/text_generation_server/layers/bnb.py @@ -0,0 +1,106 @@ +import torch +from loguru import logger +from functools import lru_cache +import bitsandbytes as bnb +from bitsandbytes.nn import Int8Params, Params4bit + + +@lru_cache(1) +def warn_deprecate_bnb(): + logger.warning( + "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance" + ) + + +class Linear8bitLt(torch.nn.Module): + def __init__( + self, + weight, + bias, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__() + assert ( + not memory_efficient_backward + ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" + self.state = bnb.MatmulLtState() + self.index = index + + # Necessary for stacked layers + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + self.state.memory_efficient_backward = memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + weight.data, + has_fp16_weights=has_fp16_weights, + requires_grad=has_fp16_weights, + ) + self.weight.cuda(weight.device) + self.bias = bias + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x: torch.Tensor): + self.state.is_training = self.training + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) + + if not self.state.has_fp16_weights: + if self.state.CB is not None and self.state.CxB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + return out + + +class Linear4bit(torch.nn.Module): + def __init__(self, weight, bias, quant_type): + super().__init__() + self.weight = Params4bit( + weight.data, + requires_grad=False, + compress_statistics=True, + quant_type=quant_type, + ) + self.compute_dtype = None + self.weight.cuda(weight.device) + self.bias = bias + + def forward(self, x: torch.Tensor): + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + 
if getattr(self.weight, "quant_state", None) is None: + print( + "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." + ) + inp_dtype = x.dtype + if self.compute_dtype is not None: + x = x.to(self.compute_dtype) + + bias = None if self.bias is None else self.bias.to(self.compute_dtype) + out = bnb.matmul_4bit( + x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state + ) + + out = out.to(inp_dtype) + + return out diff --git a/server/text_generation_server/layers/conv.py b/server/text_generation_server/layers/conv.py new file mode 100644 index 00000000..7fb18ab3 --- /dev/null +++ b/server/text_generation_server/layers/conv.py @@ -0,0 +1,41 @@ +from accelerate import init_empty_weights +import torch + + +@classmethod +def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = torch.nn.Parameter(bias) + return conv2d + + +@classmethod +def load_conv2d_no_bias( + cls, prefix, weights, in_channels, out_channels, kernel_size, stride +): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = None + return conv2d + + +torch.nn.Conv2d.load = load_conv2d +torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py new file mode 100644 index 00000000..fd22b5c6 --- /dev/null +++ b/server/text_generation_server/layers/eetq.py @@ -0,0 +1,25 @@ +import torch +from EETQ import quant_weights, w8_a16_gemm + + +class EETQLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + device = weight.device + if weight.dtype != torch.float16: + weight = weight.to(dtype=torch.float16) + weight = torch.t(weight).contiguous().cpu() + weight, scale = quant_weights(weight, torch.int8, False) + + self.weight = weight.cuda(device) + self.scale = scale.cuda(device) + self.bias = bias.cuda(device) if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output = w8_a16_gemm(input, self.weight, self.scale) + output = output + self.bias if self.bias is not None else output + return output diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py new file mode 100644 index 00000000..dd61d081 --- /dev/null +++ b/server/text_generation_server/layers/fp8.py @@ -0,0 +1,43 @@ +import torch + + +def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): + device = weight.device + # weight, scale = quant_weights(weight, torch.int8, False) + finfo = torch.finfo(qdtype) + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + qweight = qweight.to(qdtype) + scale = 
scale.float().reciprocal() + return qweight, scale + + +class Fp8Linear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.dtype = weight.dtype + self.qweight, self.scale = fp8_quantize(weight) + + self.bias = bias if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + qinput, scale = fp8_quantize(input) + output, _ = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + return output diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py new file mode 100644 index 00000000..1c46f493 --- /dev/null +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -0,0 +1,39 @@ +import os +import torch +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + +try: + major, _minor = torch.cuda.get_device_capability() +except Exception: + major = 1 + +HAS_EXLLAMA = False +CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm" +V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" +if os.getenv("DISABLE_EXLLAMA") == "True": + HAS_EXLLAMA = False +elif CAN_EXLLAMA: + try: + if V2: + from text_generation_server.layers.gptq.exllamav2 import ( + QuantLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "2" + else: + from text_generation_server.layers.gptq.exllama import ( + Ex4bitLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "1" + + except ImportError: + pass + +from text_generation_server.layers.gptq.quant_linear import QuantLinear diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/layers/gptq/custom_autotune.py similarity index 100% rename from server/text_generation_server/utils/gptq/custom_autotune.py rename to server/text_generation_server/layers/gptq/custom_autotune.py diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/layers/gptq/exllama.py similarity index 100% rename from server/text_generation_server/utils/gptq/exllama.py rename to server/text_generation_server/layers/gptq/exllama.py diff --git a/server/text_generation_server/utils/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py similarity index 100% rename from server/text_generation_server/utils/gptq/exllamav2.py rename to server/text_generation_server/layers/gptq/exllamav2.py diff --git a/server/text_generation_server/layers/gptq/exllamav2.py.rej b/server/text_generation_server/layers/gptq/exllamav2.py.rej new file mode 100644 index 00000000..cde7b73a --- /dev/null +++ b/server/text_generation_server/layers/gptq/exllamav2.py.rej @@ -0,0 +1,10 @@ +diff a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py (rejected hunks) +@@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): + none_tensor, + temp_dq, + ) ++ else: ++ RuntimeError("Cannot create handle") + + + DEVICE = None diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/quant_linear.py new file mode 100644 index 00000000..b52ceb0f --- /dev/null +++ b/server/text_generation_server/layers/gptq/quant_linear.py @@ -0,0 +1,356 @@ +import math +import numpy as np +import torch +import torch.nn as nn +from torch.cuda.amp import custom_fwd + +import triton +import triton.language as tl +from . 
import custom_autotune + + +# code based https://github.com/fpgaminer/GPTQ-triton +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=4, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. 
+ A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) & maxq # eventually avoid overflow + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) + grid = lambda META: ( + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, 
qweight, scales, qzeros, g_idx, bits, maxq): + output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = QuantLinearFunction.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/utils/gptq/quantize.py 
b/server/text_generation_server/layers/gptq/quantize.py similarity index 100% rename from server/text_generation_server/utils/gptq/quantize.py rename to server/text_generation_server/layers/gptq/quantize.py diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py new file mode 100644 index 00000000..c4aa6c7d --- /dev/null +++ b/server/text_generation_server/layers/layernorm.py @@ -0,0 +1,185 @@ +import torch +from torch import nn +from accelerate import init_empty_weights +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + + +# Monkey patching +@classmethod +def load_layer_norm(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = torch.nn.Parameter(bias) + return ln + + +@classmethod +def load_layer_norm_no_bias(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = None + return ln + + +torch.nn.LayerNorm.load = load_layer_norm +torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias + +if SYSTEM == "cuda": + import dropout_layer_norm + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super(FastLayerNorm, self).forward(hidden_states), residual + else: + ( + normed_hidden_states, + residual, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + self.bias, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + if residual is None: + residual = hidden_states + + return normed_hidden_states, residual + +elif SYSTEM == "rocm": + from vllm._C import ops + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super().forward(hidden_states), residual + +elif SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + res_out = hidden_states + out = ipex.llm.functional.add_layer_norm( + residual, hidden_states, self.weight, self.bias, self.eps, True + ) + if residual is not None: + res_out = residual + return out, res_out + + +class FastRMSNorm(nn.Module): + def __init__(self, weight: torch.Tensor, eps: float): + super().__init__() + + self.weight = nn.Parameter(weight) + self.variance_epsilon = eps + + @classmethod + def load(cls, prefix, weights, eps=1e-6): + weight = weights.get_tensor(f"{prefix}.weight") + return cls(weight, eps) + + def forward(self, hidden_states, residual=None): + if SYSTEM == "xpu": + residual_out = hidden_states + out = ipex.llm.functional.add_rms_norm( + residual, + hidden_states, + self.weight, + None, + self.variance_epsilon, + True, + ) + if residual is not None: + residual_out = residual + return out, residual_out + elif hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt( + variance + 
self.variance_epsilon + ) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states, residual + elif SYSTEM == "cuda": + # faster post attention rms norm + ( + normed_hidden_states, + res, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + None, + None, + None, + None, + None, + 0.0, + self.variance_epsilon, + 1.0, + 0, + None, + False, + True, # Activate RMSNorm + ) + if res is None: + res = hidden_states + + return normed_hidden_states, res + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + hidden_states += residual + residual = hidden_states + + out = torch.empty_like(hidden_states) + ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + return out, residual + else: + raise ValueError( + "Your system does not seem to be supported. Please check your installation or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py new file mode 100644 index 00000000..5bd6aa95 --- /dev/null +++ b/server/text_generation_server/layers/linear.py @@ -0,0 +1,216 @@ +import torch +from torch.nn import functional as F +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + + +class FastLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + if bias is not None: + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight, self.bias) + + +class FastLinearROCm(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight) + if bias is not None: + self.bias = torch.nn.Parameter(bias) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, inp: torch.Tensor) -> torch.Tensor: + weight = self.weight + bias = self.bias + + if SYSTEM == "rocm" and inp.numel() // inp.shape[-1] == 1: + batched = False + inp_shape = inp.shape + + if inp.dim() == 3: + inp = inp.view(-1, inp_shape[-1]) + batched = True + + m, k = weight.shape[0], inp_shape[1] + out = torch.empty( + inp_shape[0], weight.shape[0], dtype=inp.dtype, device="cuda" + ) + if (k == 8192 and (m == 1280 or m == 7168)) or (k == 3584 and m == 8192): + _custom_C.LLMM1(weight, inp, out, 8) + elif k <= 8192 and k % 8 == 0 and m % 4 == 0: + _custom_C.LLMM1(weight, inp, out, 4) + else: + out = F.linear(inp, weight) + + if batched: + out = out.view(*inp_shape[:-1], out.shape[-1]) 
+ + if bias is not None: + out = out + bias + return out + return F.linear(inp, self.weight, self.bias) + + +def get_linear(weight, bias, quantize): + if quantize is None: + if SYSTEM == "rocm": + linear = FastLinearROCm(weight, bias) + else: + linear = FastLinear(weight, bias) + elif quantize == "eetq": + try: + from text_generation_server.layers.eetq import EETQLinear + + linear = EETQLinear(weight, bias) + except ImportError: + raise ImportError( + "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" + ) + elif quantize == "fp8": + from text_generation_server.layers.fp8 import Fp8Linear + + linear = Fp8Linear(weight, bias) + elif quantize == "bitsandbytes": + try: + from text_generation_server.layers.bnb import ( + warn_deprecate_bnb, + Linear8bitLt, + ) + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing, install it with `pip install bitsandbytes`." + ) + warn_deprecate_bnb() + linear = Linear8bitLt( + weight, + bias, + has_fp16_weights=False, + threshold=6.0, + ) + if bias is not None: + linear.bias = torch.nn.Parameter(bias) + elif quantize == "bitsandbytes-fp4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing, install it with `pip install bitsandbytes`." + ) + linear = Linear4bit( + weight, + bias, + quant_type="fp4", + ) + elif quantize == "bitsandbytes-nf4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing, install it with `pip install bitsandbytes`." + ) + linear = Linear4bit( + weight, + bias, + quant_type="nf4", + ) + elif quantize == "gptq": + try: + qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `gptq` compatible, loader needs to be updated." + ) + + if use_exllama: + try: + from text_generation_server.layers.gptq import ( + ExllamaQuantLinear, + ) + except ImportError: + raise NotImplementedError( + f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" + ) + + linear = ExllamaQuantLinear( + qweight, qzeros, scales, g_idx, bias, bits, groupsize + ) + else: + from text_generation_server.layers.gptq.quant_linear import QuantLinear + + linear = QuantLinear( + qweight, + qzeros, + scales, + g_idx, + bias, + bits, + groupsize, + ) + elif quantize == "awq": + try: + qweight, qzeros, scales, _, bits, groupsize, _ = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `awq` compatible, loader needs to be updated." + ) + if SYSTEM == "rocm": + raise NotImplementedError( + "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " + "to use Exllama/GPTQ kernels for AWQ inference." 
+ ) + try: + from text_generation_server.layers.awq.quantize.qmodule import WQLinear + + linear = WQLinear( + w_bit=bits, + group_size=groupsize, + qweight=qweight, + qzeros=qzeros, + scales=scales, + bias=bias is not None, + ) + except ImportError: + raise NotImplementedError( + "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `--quantize gptq`; a conversion AWQ->GPTQ will happen on the fly" + ) + else: + raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") + return linear diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py new file mode 100644 index 00000000..7579ccdb --- /dev/null +++ b/server/text_generation_server/layers/medusa.py @@ -0,0 +1,189 @@ +import torch +from torch import nn +from typing import Tuple, Optional +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.layers.linear import FastLinear +from text_generation_server.layers.tensor_parallel import ( + TensorParallelHead, + TensorParallelColumnLinear, +) + + +class ResBlock(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.linear = FastLinear.load( + config, prefix=f"{prefix}.linear", weights=weights, bias=True + ) + self.act = torch.nn.SiLU() + + def forward(self, x): + return x + self.act(self.linear(x)) + + +class MedusaModel(torch.nn.Module): + def __init__(self, config, medusa_config, weights): + super().__init__() + self.heads = torch.nn.ModuleList( + [ + MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) + for i in range(get_speculate()) + ] + ) + + def forward(self, x): + speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) + return speculative_logits + + +class MedusaHead(torch.nn.Module): + def __init__(self, config, medusa_config, prefix, weights): + super().__init__() + self.blocks = torch.nn.ModuleList( + [ + ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) + for i in range(medusa_config["medusa_num_layers"]) + ] + ) + n = len(self.blocks) + self.out = FastLinear.load( + config, prefix=f"{prefix}.{n}", weights=weights, bias=False + ) + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.out(x) + return x + + +class MedusaHeadV1(nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.lm_head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + import json + + speculator = config.speculator + + path = speculator["path"] + medusa_config = str(Path(path) / "config.json") + + for fname in speculator["model_paths"]: + filename = str(Path(path) / fname) + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + medusa = MedusaModel(config, medusa_config, weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MedusaHeadV1(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + speculative_logits = 
self.medusa(input) + return logits, speculative_logits + + +class MedusaHeadV2(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + from pathlib import Path + from safetensors import safe_open + import json + + speculator_path = config.speculator["path"] + + medusa_config = str(Path(speculator_path) / "config.json") + filename = str(Path(speculator_path) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + self.n_medusa_heads = get_speculate() + + assert medusa_config["medusa_num_layers"] == 1 + self.linear = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], + dim=0, + weights=weights, + bias=True, + ) + self.process_group = weights.process_group + self.world_size = self.process_group.size() + self.rank = self.process_group.rank() + + self.act = torch.nn.SiLU() + + self.lm_head = TensorParallelHead.load(config, prefix, weights) + + def forward(self, x): + # If we have too many tokens, we skip speculative logits + if x.shape[0] > 128: + logits = self.lm_head(x) + return logits, None + + size = x.shape[-1] + block_size = (size + self.world_size - 1) // self.world_size + start = self.rank * block_size + stop = (self.rank + 1) * block_size + + x_block = x[:, start:stop] + + # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 + medusa_res = self.act(self.linear(x)).reshape( + *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] + ) + + # Apply all residual medusa heads + output = x[:, start:stop].unsqueeze(-2) + medusa_res + + # Gather medusa heads + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + + # Stack x and medusa residual x + stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) + + # Compute lm head on x + medusa residual x + logits = self.lm_head(stacked_x) + + # Finally, split logits from speculative logits + logits, speculative_logits = torch.split( + logits, [1, self.n_medusa_heads], dim=-2 + ) + # Squeeze added dimension + logits = logits.squeeze(-2) + + return logits, speculative_logits diff --git a/server/text_generation_server/layers/mlp.py b/server/text_generation_server/layers/mlp.py new file mode 100644 index 00000000..f08cb673 --- /dev/null +++ b/server/text_generation_server/layers/mlp.py @@ -0,0 +1,176 @@ +import torch +import math +from torch import nn +from torch.nn import functional as F +from typing import Optional, Tuple +from text_generation_server.layers import TensorParallelEmbedding, FastLinear +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.utils.speculate import get_speculate + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + elementwise_scale_weight : torch.Tensor + learned scaling term after normalization? + elementwise_shift_bias : torch.Tensor + learned bias term after normalization? 
+ eps : float + Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8). + """ + + def __init__( + self, + prefix, + config, + weights, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = weights.get_tensor(f"{prefix}.weight") + self.bias = weights.get_tensor(f"{prefix}.bias") + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculatorModel(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.config = config + self.n_predict = get_speculate() + self.hidden_size = config.hidden_size + self.emb = nn.ModuleList( + [ + TensorParallelEmbedding(f"{prefix}.emb.{i}", weights) + for i in range(self.n_predict) + ] + ) + self.proj = [ + FastLinear.load( + config, + prefix=f"{prefix}.proj.{i}", + weights=weights, + bias=False, + ) + for i in range(self.n_predict) + ] + self.head = nn.ModuleList( + [ + FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False) + for i in range(self.n_predict) + ] + ) + self.ln = nn.ModuleList( + [ + MLPSpeculatorLayerNorm( + prefix=f"{prefix}.ln.{i}", + config=config, + weights=weights, + ) + for i in range(self.n_predict) + ] + ) + + # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation + self.state_weight = 0.5 ** (0.5 / self.n_predict) + self.emb_weight = math.sqrt(1 - self.state_weight**2) + self.activation = nn.GELU() + # TODO + self.vsize = config.vocab_size + self.inner_dim = config.speculator_config["inner_dim"] + self.top_k_tokens_per_head = [1] * self.n_predict + + def forward( + self, + hidden_states: torch.Tensor, + input_ids: torch.Tensor, + ): + top_k_tokens_per_head = self.top_k_tokens_per_head + + # k indicates # of candidates + # h indicates # of generated tokens + state = hidden_states + b = state.size(0) + ind = input_ids.unsqueeze(0) + all_probs = torch.empty( + b, self.n_predict, self.vsize, device=state.device + ) # b k h v + assert ( + len(top_k_tokens_per_head) == self.n_predict + ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)" + for i in range(self.n_predict): + # Project and predict + z = self.emb[i](ind) + z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2)) # b k d + state = self.proj[i](state) * self.state_weight + z + state = self.activation(self.ln[i](state)) # b k d + probs = F.log_softmax(self.head[i](state), dim=-1) # b k v + _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1) # b k k' + + # Update candidate set with new predictions + + # Update distribution set with new logits + all_probs[:, i] = probs.exp() + + # Update state, log_probs and ind for new predictions + state = state.unsqueeze(2).expand( + -1, -1, top_k_tokens_per_head[i], -1 + ) # b k k' d + state = state.reshape(-1, b, state.size(3)) # b kk' d + ind = preds.view(-1, b) # b kk' + + speculative_logits = all_probs + return speculative_logits + + +class MLPSpeculatorHead(nn.Module): + def __init__(self, lm_head, mlp_speculator): + super().__init__() + self.lm_head = lm_head + self.mlp_speculator = mlp_speculator + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return 
logits, None + + input_ids = logits.argmax(dim=-1) + speculative_logits = self.mlp_speculator(input, input_ids) + return logits, speculative_logits + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + + speculator_path = config.speculator["path"] + + for fname in config.speculator["model_paths"]: + filename = str(Path(speculator_path) / fname) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + mlp_speculator = MLPSpeculatorModel(config, "speculator", weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MLPSpeculatorHead(lm_head, mlp_speculator) diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py new file mode 100644 index 00000000..648d28ab --- /dev/null +++ b/server/text_generation_server/layers/rotary.py @@ -0,0 +1,421 @@ +import os +import torch +from torch import nn + +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "cuda": + from flash_attn.layers.rotary import RotaryEmbedding + import rotary_emb +elif SYSTEM == "rocm": + from vllm._C import ops +elif SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex + + +def _create_inv_freq(dim, base, device): + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) + ) + return inv_freq + + +def _get_rope_config(config): + if os.getenv("ROPE_SCALING", None) is not None: + rope_scaling = { + "type": os.environ["ROPE_SCALING"], + "factor": float(os.environ["ROPE_FACTOR"]), + } + return rope_scaling + return getattr(config, "rope_scaling", None) + + +class PositionRotaryEmbedding(nn.Module): + def __init__(self, inv_freq, scaling_factor): + super().__init__() + self.inv_freq = inv_freq + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.scaling_factor = scaling_factor + self.dynamic_args = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # Such controlflows may add some overhead. + if SYSTEM == "cuda": + rotary_dim = cos.shape[-1] + q1 = query[..., :rotary_dim] + q2 = query[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + + k1 = key[..., :rotary_dim] + k2 = key[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + elif SYSTEM == "rocm": + # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. + # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 + + head_size = query.shape[-1] + + # Inplace operation, updating query and key. + ops.rotary_embedding(query, key, head_size, cos, sin, True) + elif SYSTEM == "xpu": + ipex.llm.functional.rotary_embedding( + query, key, sin, cos, query.size(-1), True + ) + else: + raise ValueError( + "Your system seem to be not supported. 
Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) + + @classmethod + def static(cls, config, dim, base, device): + inv_freq = _create_inv_freq(dim, base, device) + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + scaling_factor = rope_scaling["factor"] + return DynamicPositionRotaryEmbedding( + dim=dim, + max_position_embeddings=config.max_position_embeddings, + base=base, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + scaling_factor = rope_scaling["factor"] + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + elif rope_scaling["type"] == "su": + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = max_position_embeddings / original_max_position_embeddings + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(original_max_position_embeddings) + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + @classmethod + def load(cls, config, prefix, weights): + # XXX: Always load this in float32 ! 
+ dtype = weights.dtype + weights.dtype = torch.float32 + inv_freq = weights.get_tensor(f"{prefix}.inv_freq") + weights.dtype = dtype + + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + scaling_factor = rope_scaling["factor"] + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + return DynamicPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=config.max_position_embeddings, + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): + """ + Return cos and sin for the asked position ids + """ + if SYSTEM == "rocm": + # For RoCm, we always use float cos/sin to avoid a cast. + # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 + # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. + dtype = torch.float32 + + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + + cos = torch.index_select(self._cos_cached, 0, position_ids) + sin = torch.index_select(self._sin_cached, 0, position_ids) + + # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
+ return cos.unsqueeze(1), sin.unsqueeze(1) + + +class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + if seqlen > self.original_max_position_embeddings: + inv_freq = self.long_inv_freq + else: + inv_freq = self.short_inv_freq + t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + newbase = self.base * ( + (self.scaling_factor * seqlen / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + self.inv_freq = _create_inv_freq( + self.dim, newbase, self.inv_freq.device + ) + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +# Inverse dim formula to find dim based on number of rotations +import math + + +def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 +): + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = 
(torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def get_mscale(scale=1): + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + +class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings, + base, + device, + scaling_factor, + *, + extrapolation_factor, + attn_factor, + beta_fast, + beta_slow, + ): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + inv_freq_extrapolation = _create_inv_freq( + self.dim, self.base, self.inv_freq.device + ) + freqs = 1.0 / inv_freq_extrapolation + inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) + low, high = find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.base, + self.max_position_embeddings, + ) + inv_freq_mask = ( + 1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + + self.inv_freq = inv_freq + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py new file mode 100644 index 00000000..4b977a56 --- /dev/null +++ b/server/text_generation_server/layers/speculative.py @@ -0,0 +1,52 @@ +import torch +import json +from typing import Tuple, Optional +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.mlp import MLPSpeculatorHead + + +class SpeculativeHead(torch.nn.Module): + def __init__(self, lm_head, speculator): + super().__init__() + self.head = lm_head + self.speculator = speculator + + @staticmethod + def load(config, prefix: str, weights): + speculator = config.speculator + if speculator: + speculator_path = config.speculator["path"] + speculator_config = str(speculator_path / "config.json") + + with open(speculator_config, "r") as f: + speculator_config = json.load(f) + + config.speculator_config = speculator_config + try: + architecture = speculator_config["architectures"][0] + + 
if architecture == "MLPSpeculatorPreTrainedModel": + speculator = MLPSpeculatorHead.load(config, prefix, weights) + else: + speculator = None + except KeyError: + try: + speculator = MedusaHeadV1.load(config, prefix, weights) + except: + speculator = MedusaHeadV2(config, prefix, weights) + lm_head = None + else: + lm_head = TensorParallelHead.load(config, prefix, weights) + speculator = None + return SpeculativeHead(lm_head, speculator) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.speculator is not None: + return self.speculator(input) + + assert self.head is not None + logits = self.head(input) + return logits, None diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py new file mode 100644 index 00000000..34b9c51e --- /dev/null +++ b/server/text_generation_server/layers/tensor_parallel.py @@ -0,0 +1,188 @@ +import torch +from torch.nn import functional as F +from typing import List +from text_generation_server.layers.linear import get_linear, FastLinear + + +class SuperLayer(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear.forward(x) + + +class TensorParallelHead(SuperLayer): + def __init__(self, linear, process_group, should_gather: bool): + super().__init__(linear) + self.process_group = process_group + self.should_gather = should_gather + + @staticmethod + def load(config, prefix: str, weights): + if weights.process_group.size() > 1: + try: + weight = weights.get_sharded(f"{prefix}.weight", dim=0) + should_gather = True + except AssertionError: + # If the vocab size is not divisible by number of shards + # just load the entire thing. 
+ weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + else: + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + + # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) + if config.quantize in ["gptq", "awq", "eetq"]: + quantize = None + else: + quantize = config.quantize + return TensorParallelHead( + get_linear(weight, bias=None, quantize=quantize), + process_group=weights.process_group, + should_gather=should_gather, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if not self.should_gather: + return super().forward(input) + + world_size = self.process_group.size() + if len(input.shape) == 2 and isinstance(self.linear, FastLinear): + out_dim = self.linear.weight.shape[0] + + if input.shape[0] == 1: + world_out = input.new_empty(1, out_dim * world_size) + local_out = input.new_empty(1, out_dim) + gather_input = local_out + else: + world_out = input.new_empty(out_dim * world_size, input.shape[0]) + gather_input = input.new_empty(out_dim, input.shape[0]) + local_out = gather_input.T + + torch.mm(input, self.linear.weight.T, out=local_out) + + torch.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) + + if input.shape[0] == 1: + return world_out + return world_out.T + + output = super().forward(input) + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + return world_output + + +class TensorParallelColumnLinear(SuperLayer): + @classmethod + def load_gate_up(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_gate_up( + prefix, quantize=config.quantize + ) + if bias: + raise NotImplementedError("packed_gate_up only implemented without bias") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load_qkv(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) + if bias: + raise NotImplementedError("packed_qkv only implemented for baichuan") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + return cls.load_multi(config, [prefix], weights, bias, dim=0) + + @classmethod + def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): + weight = weights.get_multi_weights_col( + prefixes, quantize=config.quantize, dim=dim + ) + + if bias: + b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] + bias = torch.cat(b, dim=dim) + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + +class TensorParallelRowLinear(SuperLayer): + def __init__(self, linear, process_group): + super().__init__(linear) + self.process_group = process_group + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls( + get_linear(weight, bias, config.quantize), + process_group=weights.process_group, + ) + + def 
forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: + out = super().forward(input) + if self.process_group.size() > 1 and reduce: + torch.distributed.all_reduce(out, group=self.process_group) + return out + + +class TensorParallelEmbedding(torch.nn.Module): + def __init__(self, prefix: str, weights, reduce=True): + super().__init__() + weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) + num_embeddings = weights.get_shape(f"{prefix}.weight")[0] + + process_group = weights.process_group + + world_size = process_group.size() + rank = process_group.rank() + + block_size = (num_embeddings + world_size - 1) // world_size + self.min_id = rank * block_size + self.max_id = min(num_embeddings, (rank + 1) * block_size) + self.null_idx = weight.shape[ + 0 + ] # Usually block_size, might be less in non even vocab_size. + self.process_group = weights.process_group + self.reduce = reduce + + """Additional 0 entry used for masking""" + self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1))) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # default all out of bounds values to `self.null_idx` that will then be mapped to 0 + # translate for [0, self.max_id - self.min_id[ + input = torch.where( + (self.min_id > input) | (input >= self.max_id), + self.null_idx, + input - self.min_id, + ) + out = torch.nn.functional.embedding(input, self.weight) + if self.reduce and self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + return out diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 1cd1563f..30bd8f6c 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1,9 +1,10 @@ import torch +import os from loguru import logger from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import modeling_auto -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download, HfApi from typing import Optional from pathlib import Path @@ -15,6 +16,12 @@ from text_generation_server.models.model import Model from text_generation_server.models.causal_lm import CausalLM from text_generation_server.models.bloom import BLOOM from text_generation_server.models.starcoder import StarCoder +from text_generation_server.models.vlm_causal_lm import VlmCausalLM +from text_generation_server.models.custom_modeling.llava_next import ( + LlavaNextForConditionalGeneration, +) + + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi @@ -40,8 +47,9 @@ def get_model( config_dict, _ = PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + model_type = config_dict.get("model_type", None) - use_medusa = None + speculator = None if "medusa_num_heads" in config_dict: medusa_model_id = model_id medusa_revision = revision @@ -61,6 +69,8 @@ def get_model( config_dict, _ = PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + # Reload model type from parent. 
+ model_type = config_dict.get("model_type", None) is_local = Path(medusa_model_id).exists() if not is_local: medusa_config = hf_hub_download( @@ -71,11 +81,70 @@ def get_model( revision=medusa_revision, filename="medusa_lm_head.safetensors", ) - use_medusa = Path(medusa_config).parent + speculator = { + "path": Path(medusa_config).parent, + "model_paths": ["medusa_lm_head.safetensors"], + } else: - use_medusa = Path(medusa_model_id) + speculator = { + "path": Path(medusa_model_id), + "model_paths": ["medusa_lm_head.safetensors"], + } method = "medusa" + elif model_type == "mlp_speculator": + mlp_model_id = model_id + mlp_revision = revision + model_id = config_dict["base_model_name_or_path"] + revision = "main" + speculate_mlp = config_dict["n_predict"] + if speculate is not None: + if speculate > speculate_mlp: + raise RuntimeError( + f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match" + ) + else: + set_speculate(speculate) + else: + set_speculate(speculate_mlp) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) + is_local = Path(mlp_model_id).exists() + extension = ".safetensors" + if not is_local: + mlp_speculator_config = hf_hub_download( + mlp_model_id, revision=mlp_revision, filename="config.json" + ) + api = HfApi() + info = api.model_info(mlp_model_id, revision=mlp_revision) + filenames = [ + s.rfilename + for s in info.siblings + if s.rfilename.endswith(extension) + and len(s.rfilename.split("/")) == 1 + and "arguments" not in s.rfilename + and "args" not in s.rfilename + and "training" not in s.rfilename + ] + for filename in filenames: + hf_hub_download( + mlp_model_id, + revision=mlp_revision, + filename=filename, + ) + speculator = { + "path": Path(mlp_speculator_config).parent, + "model_paths": filenames, + } + else: + speculator = Path(mlp_model_id) + filenames = [p for p in os.listdir(speculator) if p.endswith(extension)] + speculator = {"path": speculator, "model_paths": filenames} + method = "mlp_speculator" else: method = "n-gram" @@ -92,7 +161,18 @@ def get_model( return BLOOM( model_id, revision, - use_medusa=use_medusa, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + if model_type == "llava_next": + return VlmCausalLM( + model_class=LlavaNextForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=None, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -101,7 +181,7 @@ def get_model( return CausalLM( model_id, revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 86cafda2..6fe64374 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -35,14 +35,14 @@ class BLOOM(CausalLM): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): super(BLOOM, self).__init__( model_id=model_id, revision=revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/cache_manager.py 
b/server/text_generation_server/models/cache_manager.py index 4c65e2dd..c7705fe8 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -2,7 +2,7 @@ import math import torch from typing import Optional, List, Tuple -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM BLOCK_SIZE: int = 16 # Will be set in warmup @@ -25,7 +25,7 @@ class CacheManager: self.repeat_slots = repeat_slots element_size = torch.tensor([], dtype=dtype).element_size() - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": x = 1 else: x = self.block_size // element_size diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index bf713582..5eddd5b7 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -367,6 +367,7 @@ class CausalLMBatch(Batch): input_lengths = [b.input_length for b in batches] max_input_length = max(input_lengths) offsets = [max_input_length - b.input_length for b in batches] + cur_padding = [b.right_padding for b in batches] # For prefill there is a space allocated only for first token # Need to add padding to the max total tokens before first decode @@ -596,13 +597,15 @@ class CausalLM(Model): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): + + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") + self.prev_bs = 0 - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") # Create tokenizer tokenizer = AutoTokenizer.from_pretrained( diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py index c8f02bca..0d8a1b59 100644 --- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py +++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py @@ -32,7 +32,7 @@ from transformers.modeling_outputs import ( ) from transformers import BloomConfig, PreTrainedModel -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/clip.py b/server/text_generation_server/models/custom_modeling/clip.py index c4917733..56618bf1 100644 --- a/server/text_generation_server/models/custom_modeling/clip.py +++ b/server/text_generation_server/models/custom_modeling/clip.py @@ -15,7 +15,7 @@ from transformers.modeling_outputs import ( ) from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 56d9a966..bd8b8016 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -26,18 +26,22 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from 
text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM -from text_generation_server.utils.layers import ( +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm else: dropout_layer_norm = None @@ -52,7 +56,7 @@ class CohereRotary(PositionRotaryEmbedding): sin: torch.Tensor, ): # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": import rotary_emb q1 = query[..., ::2] @@ -64,8 +68,8 @@ class CohereRotary(PositionRotaryEmbedding): k2 = key[..., 1::2] rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: - from vllm import pos_encoding_ops + elif SYSTEM == "rocm": + from vllm._C import ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 @@ -73,7 +77,7 @@ class CohereRotary(PositionRotaryEmbedding): head_size = query.shape[-1] # Inplace operation, updating query and key. - pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, False) + ops.rotary_embedding(query, key, head_size, cos, sin, False) else: raise ValueError( "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
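The hunks above collapse the per-platform booleans into a single SYSTEM string and route Cohere's interleaved rotary update either to flash-attn's rotary_emb.apply_rotary kernel (CUDA) or to vLLM's ops.rotary_embedding (RoCm). As a rough illustration of what both kernels compute, here is a plain-PyTorch reference; the shapes are assumptions (x as [num_tokens, num_heads, head_size], cos/sin as [num_tokens, 1, head_size // 2]), not code from the diff.

import torch

def apply_interleaved_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # Even/odd feature positions form the rotated pairs, matching the
    # query[..., ::2] / query[..., 1::2] split used by CohereRotary above.
    x1, x2 = x[..., ::2], x[..., 1::2]
    out = torch.empty_like(x)
    out[..., ::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out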
@@ -90,7 +94,7 @@ class CohereLayerNorm(nn.Module): self.eps = eps def forward(self, hidden_states): - if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: + if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm": hidden_states = hidden_states.reshape( -1, self.weight.shape[0], self.weight.shape[1] ) diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index d0978bef..9d652b67 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -21,21 +21,26 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any from loguru import logger -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe + from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastLayerNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) from text_generation_server.utils.log import log_once @@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights): bits, groupsize, desc_act, quant_method = weights._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act @@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights): log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conveersion_utils import ( fast_awq_to_gptq, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index bd7596db..ac6fd0e6 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) @@ -97,8 +99,13 @@ class GemmaConfig(PretrainedConfig): class GemmaFastRMSNorm(FastRMSNorm): @classmethod def load(cls, prefix, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 weight = weights.get_tensor(f"{prefix}.weight") + 1 - return cls(weight, eps) + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + return new # perform the multiplication in full precision and downcast after def forward(self, hidden_states, residual=None): @@ -109,7 +116,7 @@ class GemmaFastRMSNorm(FastRMSNorm): variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) hidden_states = hidden_states * self.weight - return hidden_states.to(self.weight.dtype), residual + return hidden_states.to(self.dtype), residual def load_attention(config, prefix, weights): @@ -151,15 +158,11 @@ def _load_gqa(config, prefix: str, weights): class FlashGemmaAttention(torch.nn.Module): - def __init__( - self, - prefix: str, - config, - weights, - ): + def __init__(self, prefix: str, config, weights, causal: bool): super().__init__() self.num_heads = config.num_attention_heads self.head_size = config.head_dim + self.causal = causal self.rotary_emb = PositionRotaryEmbedding.static( config=config, @@ -236,6 +239,7 @@ class FlashGemmaAttention(torch.nn.Module): cu_seqlen_prefill, max_s, self.softmax_scale, + causal=self.causal, ) # Decode else: @@ -293,11 +297,10 @@ class GemmaMLP(nn.Module): class FlashGemmaLayer(nn.Module): - def __init__(self, layer_id, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() - prefix = f"model.layers.{layer_id}" self.self_attn = FlashGemmaAttention( - prefix=f"{prefix}.self_attn", config=config, weights=weights + prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal ) self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) @@ -349,30 +352,25 @@ class FlashGemmaLayer(nn.Module): class FlashGemmaModel(torch.nn.Module): - def __init__(self, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() process_group = weights.process_group self.tp_rank = process_group.rank() self.tp_world_size = process_group.size() - embed_norm = config.hidden_size**0.5 - self.embed_tokens = TensorParallelEmbedding( - prefix="model.embed_tokens", weights=weights - ) - self.embed_tokens.weight *= embed_norm - 
self.layers = nn.ModuleList( [ FlashGemmaLayer( - layer_id, - config, - weights, + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + causal=causal, ) for layer_id in range(config.num_hidden_layers) ] ) self.norm = GemmaFastRMSNorm.load( - prefix="model.norm", weights=weights, eps=config.rms_norm_eps + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps ) self.gradient_checkpointing = False @@ -383,7 +381,7 @@ class FlashGemmaModel(torch.nn.Module): def forward( self, - input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], @@ -392,7 +390,7 @@ class FlashGemmaModel(torch.nn.Module): input_lengths: torch.Tensor, max_s: int, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer @@ -421,13 +419,30 @@ class FlashGemmaModel(torch.nn.Module): class FlashGemmaForCausalLM(torch.nn.Module): - def __init__(self, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() - self.model = FlashGemmaModel(config, weights) + embed_norm = config.hidden_size**0.5 + if prefix is None: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemmaModel( + prefix=prefix, config=config, weights=weights, causal=causal + ) self.lm_head = SpeculativeHead.load( - config, - prefix="model.embed_tokens" if config.tie_word_embeddings else "lm_head", + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, weights=weights, ) @@ -443,8 +458,9 @@ class FlashGemmaForCausalLM(torch.nn.Module): max_s: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) hidden_states = self.model( - input_ids, + input_embeds, position_ids, cu_seqlen_prefill, kv_cache, diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py new file mode 100644 index 00000000..d2599f7a --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
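The new flash_gpt2_modeling.py below loads GPT-2's fused c_attn projection, whose weight and bias are packed as [q | k | v] along one axis, and shards it so that every tensor-parallel rank takes the same contiguous block out of each third. A toy sketch of that slicing with made-up sizes (illustrative only, not part of the diff):

import torch

def shard_packed_qkv_bias(bias: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    # bias is laid out as [q | k | v]; take this rank's block from each third.
    total = bias.shape[0]
    assert total % 3 == 0, "packed bias must hold exactly three projections"
    single = total // 3
    assert single % world_size == 0, "each projection must split evenly across ranks"
    block = single // world_size
    start, stop = rank * block, (rank + 1) * block
    return torch.cat(
        [bias[start + i * single : stop + i * single] for i in range(3)], dim=0
    )

# Example: a 24-element packed bias over 2 ranks -> each rank keeps 3 blocks of 4 values.
shard = shard_packed_qkv_bias(torch.arange(24.0), rank=0, world_size=2)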
+ +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from typing import Optional, List, Tuple + +from text_generation_server.utils import paged_attention, flash_attn +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) + + +def load_qkv(config, prefix: str, weights, head_size, num_heads): + if config.quantize == "gptq": + return _load_qkv_gptq( + config, + prefix, + weights, + ) + else: + return _load_qkv(config, prefix, weights, head_size, num_heads) + + +def _load_qkv_gptq(config, prefix: str, weights): + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + # Weights + weight = weights.get_weights_col_packed_qkv(f"{prefix}.c_attn", config.quantize) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + single_size = total_size // 3 + assert single_size % world_size == 0 + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + tensors.append(tensor) + bias = torch.cat(tensors, dim=0) + bias = bias.to(device=weights.device) + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +def _load_qkv(config, prefix: str, weights, head_size, num_heads): + """Load QKV from a single, transposed matrix.""" + + slice_ = weights._get_slice(f"{prefix}.c_attn.weight") + shape = slice_.get_shape() + total_size = shape[1] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + world_size = weights.process_group.size() + single_size = total_size // 3 + assert single_size % world_size == 0 + rank = weights.process_group.rank() + + # Weights + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[:, start + i * single_size : stop + i * single_size] + tensors.append(tensor) + weight = torch.cat(tensors, dim=1).T + weight = weight.to(dtype=weights.dtype) + weight = weight.to(device=weights.device) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + single_size = total_size // 3 + block_size = single_size // world_size + assert single_size % world_size == 0 + start = rank * block_size + stop = (rank + 1) * block_size + b = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + b.append(tensor) + bias = torch.cat(b, dim=0) + bias = bias.to(dtype=weights.dtype) + bias = bias.to(device=weights.device) + assert list(bias.shape) == [ + 3 * num_heads * head_size + ], f"{weight.shape} != {[3 * num_heads * head_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +def load_row(config, prefix: str, weights, bias: bool): + """load_row, but with transposed weight matrices.""" + + if config.quantize == "gptq": + weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) + else: + weight = weights.get_sharded(f"{prefix}.weight", dim=0).T + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + return TensorParallelRowLinear( + 
get_linear(weight, bias, config.quantize), process_group=weights.process_group + ) + + +def load_col(config, prefix: str, weights, bias: bool): + """load_col, but with transposed weight matrices.""" + if config.quantize == "gptq": + weight = weights.get_multi_weights_col( + [prefix], quantize=config.quantize, dim=1 + ) + else: + weight = weights.get_sharded(f"{prefix}.weight", dim=1).T + + if bias: + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + else: + bias = None + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +class FlashGPT2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + + self.head_size = self.hidden_size // self.num_heads + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.query_key_value = load_qkv( + config, + prefix=prefix, + weights=weights, + head_size=self.head_size, + num_heads=self.num_heads, + ) + + self.o_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + self.kv_head_mapping = torch.arange( + 0, self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ): + query, key, value = self.query_key_value(hidden_states).split( + self.head_size * self.num_heads, dim=1 + ) + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_heads, self.head_size) + value = value.view(-1, self.num_heads, self.head_size) + + paged_attention.reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + + # output tensor + attn_output = torch.empty_like(query) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + flash_attn.attention( + query, + key, + value, + attn_output, + cu_seqlen_prefill, + max_s, + self.softmax_scale, + ) + # Decode + else: + paged_attention.attention( + attn_output, + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + input_lengths, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class GPT2MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.activation_function + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.c_fc = load_col( + config, prefix=f"{prefix}.c_fc", weights=weights, bias=True + ) + self.c_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + intermediate_size = ( + config.n_inner if config.n_inner is not None else 4 * config.hidden_size + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + return self.c_proj(hidden_states) + + +class FlashGPT2Layer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.self_attn = FlashGPT2Attention( + 
prefix=f"{prefix}.attn", config=config, weights=weights + ) + self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_2", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + def forward( + self, + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_output = self.self_attn( + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ) + + hidden_states = attn_output + residual + residual = hidden_states + + hidden_states = self.post_attention_layernorm(hidden_states) + + mlp_output = self.mlp(hidden_states) + + return residual + mlp_output, residual + + +class FlashGPT2Model(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + FlashGPT2Layer( + prefix=( + f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}" + ), + config=config, + weights=weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + + self.norm = nn.LayerNorm.load( + prefix="ln_f" if not prefix else f"{prefix}.ln_f", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = inputs_embeds + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + input_lengths, + max_s, + ) + + hidden_states = self.norm(hidden_states) + + return hidden_states + + +class FlashGPT2ForCausalLM(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=("wte" if not prefix else f"{prefix}.wte"), + weights=weights, + ) + self.embed_positions = TensorParallelEmbedding( + prefix=("wpe" if not prefix else f"{prefix}.wpe"), + weights=weights, + ) + + self.model = FlashGPT2Model(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="wte" if not prefix else f"{prefix}.wte", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + token_embeds = self.embed_tokens(input_ids) + position_embeds = self.embed_positions(position_ids) + inputs_embeds = 
token_embeds + position_embeds + hidden_states = self.model( + inputs_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + true_max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 6fa85d4e..6e23aa2b 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -18,43 +18,59 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List, Optional, Tuple + import torch import torch.distributed from torch import nn from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, - get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") + def load_attention(config, prefix, weights): + bias = config.attention_bias if config.num_attention_heads != config.num_key_value_heads: - return _load_gqa(config, prefix, weights) + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=bias, + ) else: if config.model_type == "baichuan": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.W_pack", weights=weights, - bias=False, + bias=bias, ) elif config.model_type == "phi3": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.qkv_proj", weights=weights, - bias=False, + bias=bias, ) else: return TensorParallelColumnLinear.load_multi( @@ -62,36 +78,10 @@ def load_attention(config, prefix, weights): prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], dim=0, weights=weights, - bias=False, + bias=bias, ) -def _load_gqa(config, prefix: str, weights): - assert config.hidden_size % config.num_attention_heads == 0 - assert config.num_attention_heads % weights.process_group.size() == 0 - - weight = weights.get_multi_weights_col( - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], - quantize=config.quantize, - dim=0, - ) - - if config.quantize not in ["gptq", "awq"]: - weight = weight.to(dtype=weights.dtype).to(device=weights.device) - - head_size = config.hidden_size // config.num_attention_heads - num_heads = config.num_attention_heads // weights.process_group.size() - num_key_value_heads = config.num_key_value_heads // weights.process_group.size() - assert list(weight.shape) == [ - (num_heads + 2 * num_key_value_heads) * head_size, - config.hidden_size, - ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) - - class FlashLlamaAttention(torch.nn.Module): def __init__( self, @@ -200,24 +190,27 @@ class FlashLlamaAttention(torch.nn.Module): class LlamaMLP(nn.Module): def __init__(self, prefix, config, weights): super().__init__() - act = config.hidden_act + self.hidden_act = config.hidden_act self.act = ( - ACT2FN[act] - if "gelu" not in act + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act else lambda x: torch.nn.functional.gelu( x, approximate=( - "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" ), ) ) # Fuse gate and up proj + bias = getattr(config, "mlp_bias", False) if config.model_type == "phi3": self.gate_up_proj = TensorParallelColumnLinear.load_gate_up( config, prefix=f"{prefix}.gate_up_proj", weights=weights, - bias=False, + bias=bias, ) else: self.gate_up_proj = TensorParallelColumnLinear.load_multi( @@ -225,22 +218,40 @@ class LlamaMLP(nn.Module): prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], weights=weights, dim=0, - bias=False, + bias=bias, ) self.down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, - bias=False, + bias=bias, ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() ) + # TODO: This is a hotfix to be removed & properly refactored. 
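# [Editorial sketch, not part of the PR diff] The load_attention / load_qkv helpers above
# fuse the q/k/v projections by concatenating their weight matrices along the output
# dimension, so a single matmul produces all three; the attention forward then splits the
# result back (see the `.split(...)` call in FlashGPT2Attention). Toy dimensions and
# tensor names below are illustrative only.
import torch

hidden_size, num_heads = 64, 4
head_size = hidden_size // num_heads

w_q = torch.randn(hidden_size, hidden_size)
w_k = torch.randn(hidden_size, hidden_size)
w_v = torch.randn(hidden_size, hidden_size)

# Fused weight: same layout as load_multi(..., dim=0) builds from the checkpoint shards.
w_qkv = torch.cat([w_q, w_k, w_v], dim=0)

x = torch.randn(2, hidden_size)                      # (num_tokens, hidden_size)
qkv = torch.nn.functional.linear(x, w_qkv)           # one matmul instead of three
query, key, value = qkv.split(num_heads * head_size, dim=1)
query = query.view(-1, num_heads, head_size)         # ready for flash/paged attention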
+ self.quantize = config.quantize + def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.shape[0] == 1 + and not self.quantize + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) class FlashLlamaLayer(nn.Module): @@ -383,9 +394,14 @@ class FlashLlamaForCausalLM(torch.nn.Module): weights=weights, ) self.model = FlashLlamaModel(prefix, config, weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + self.lm_head = SpeculativeHead.load( config, - prefix="lm_head" if not prefix else f"{prefix}.lm_head", + prefix=suffix if not prefix else f"{prefix}.{suffix}", weights=weights, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index c2445cda..ef3777da 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -26,18 +26,28 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + + class MistralConfig(PretrainedConfig): model_type = "mistral" @@ -249,14 +259,16 @@ class MistralAttention(torch.nn.Module): class MistralMLP(nn.Module): def __init__(self, prefix, config, weights): super().__init__() - act = config.hidden_act + self.hidden_act = config.hidden_act self.act = ( - ACT2FN[act] - if "gelu" not in act + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act else lambda x: torch.nn.functional.gelu( x, approximate=( - "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" ), ) ) @@ -278,10 +290,28 @@ class MistralMLP(nn.Module): config.intermediate_size // weights.process_group.size() ) + # TODO: This is a hotfix to be removed & properly refactored.
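# [Editorial sketch, not part of the PR diff] What the ROCm fast path above replaces:
# the fused gate/up projection followed by SiLU(gate) * up and the down projection.
# On ROCm, for a single decode token and no quantization, the diff instead calls the
# fused vLLM kernel `_custom_C.LLMM_Silu`; this reference version is plain PyTorch and
# ignores tensor-parallel sharding. Function and argument names are illustrative.
import torch
import torch.nn.functional as F

def mlp_silu_reference(x, w_gate_up, w_down, intermediate_size):
    # x: (num_tokens, hidden), w_gate_up: (2 * intermediate, hidden), w_down: (hidden, intermediate)
    gate_up = F.linear(x, w_gate_up)
    gate_up = gate_up.view(-1, 2, intermediate_size)
    return F.linear(F.silu(gate_up[:, 0]) * gate_up[:, 1], w_down)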
+ self.quantize = config.quantize + def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.shape[0] == 1 + and not self.quantize + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) class MistralLayer(nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index 3f6c8e03..be2d6c45 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -24,9 +24,9 @@ import torch.distributed import numpy as np from torch import nn -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig @@ -34,16 +34,20 @@ from typing import Optional, List, Tuple from loguru import logger from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastRMSNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class MixtralConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index ee062d3d..d45cab2e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -29,15 +29,19 @@ from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) def load_row(config, prefix: str, weights, bias: bool): diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py new file mode 100644 index 00000000..91c709e4 --- /dev/null +++ 
b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed +from torch import nn +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple + +from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) + + +class PaliGemmaForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = config.quantize + self.vision_tower = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + + self.multi_modal_projector = TensorParallelColumnLinear.load( + config, + prefix="multi_modal_projector.linear", + weights=weights, + bias=True, + ) + + self.vocab_size = config.vocab_size + self.config = config + + text_config = config.text_config + text_config.speculator = config.speculator + text_config.quantize = config.quantize + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused here + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.text_model.embed_tokens(input_ids) + # TODO This is odd but apparently pali gemma position ids start at 1. 
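# [Editorial sketch, not part of the PR diff] The in-place merge performed a few lines
# below: embeddings of the image placeholder tokens are overwritten with the projected
# vision features. Assumes one feature vector per placeholder token; the helper name is
# illustrative.
import torch

def merge_image_features(input_ids, inputs_embeds, image_features, image_token_index):
    mask = input_ids == image_token_index                      # (num_tokens,) bool
    inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
    return inputs_embeds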
+ if cu_seqlen_prefill is not None: + max_s += 1 + position_ids += 1 + + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype) + image_outputs = self.vision_tower(pixel_values) + image_features = self.multi_modal_projector(image_outputs.last_hidden_state) + + # mask where image or padding tokens + mask = input_ids == self.config.image_token_index + + # insert image features into input embeddings + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + input_lengths=input_lengths, + max_s=max_s, + ) + + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index cfe447a7..f2efb538 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -7,15 +7,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class PhiConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 94023b33..3a6d2db5 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -6,13 +6,15 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index a9127d1f..fa463a19 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -1,22 +1,21 @@ +from typing import List, Optional, Tuple + import torch import torch.distributed - from torch import nn -from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig -from typing import Optional, List, Tuple +from transformers.modeling_utils import PreTrainedModel -from 
text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( - TensorParallelRowLinear, +from text_generation_server.layers import ( + SpeculativeHead, TensorParallelColumnLinear, TensorParallelEmbedding, - SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, + TensorParallelRowLinear, get_linear, ) +from text_generation_server.layers.layernorm import FastLayerNorm +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils import flash_attn, paged_attention def load_row(config, prefix: str, weights, bias: bool): @@ -48,6 +47,7 @@ class RWConfig(PretrainedConfig): hidden_size=64, num_hidden_layers=None, num_attention_heads=None, + num_ln_in_prallel_attention=None, layer_norm_epsilon=1e-5, initializer_range=0.02, use_cache=True, @@ -61,6 +61,7 @@ class RWConfig(PretrainedConfig): new_decoder_architecture=None, bias=False, parallel_attn=False, + rope_theta=10_000.0, **kwargs, ): if alibi: @@ -71,6 +72,7 @@ class RWConfig(PretrainedConfig): self.model_type = model_type self.alibi = False self.rotary = True + self.rope_theta = rope_theta self.vocab_size = vocab_size # Backward compatibility with n_embed kwarg @@ -87,6 +89,7 @@ class RWConfig(PretrainedConfig): else kwargs.pop("n_head", 8) ) self.layer_norm_epsilon = layer_norm_epsilon + self.num_ln_in_parallel_attention = num_ln_in_prallel_attention self.initializer_range = initializer_range self.use_cache = use_cache self.hidden_dropout = hidden_dropout @@ -128,9 +131,13 @@ class FlashRWAttention(torch.nn.Module): self.num_heads_kv = config.n_head_kv self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -240,9 +247,13 @@ class FlashRWLargeAttention(torch.nn.Module): self.hidden_size = hidden_size self.head_size = hidden_size // num_heads self.num_groups = num_groups + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -253,7 +264,7 @@ class FlashRWLargeAttention(torch.nn.Module): if process_group.size() > self.num_groups: raise NotImplementedError( - f"Tensor Parallelism is not implemented for world_size > n groups" + "Tensor Parallelism is not implemented for world_size > n groups" ) if self.num_groups % process_group.size() != 0: raise NotImplementedError( @@ -455,29 +466,61 @@ class FlashRWLayer(nn.Module): max_s, ) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual - ) + if self.post_attention_layernorm is not None: + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) mlp_output = self.mlp(hidden_states) return mlp_output, residual +class FlashRWLayerNorm(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.num_ln = config.num_ln_in_parallel_attn + + if self.num_ln == 1: + self.input_ln = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + 
eps=config.layer_norm_epsilon, + ) + elif self.num_ln == 2: + self.ln_attn = FastLayerNorm.load( + prefix=f"{prefix}.ln_attn", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.ln_mlp = FastLayerNorm.load( + prefix=f"{prefix}.ln_mlp", + weights=weights, + eps=config.layer_norm_epsilon, + ) + else: + raise ValueError("Number of layer norms can either be 1 or 2.") + + def forward( + self, + hidden_states, + residual, + ): + if self.num_ln == 1: + ln_hidden_states, residual = self.input_ln(hidden_states, residual) + return ln_hidden_states, ln_hidden_states, residual + elif self.num_ln == 2: + ln_attn, residual = self.ln_attn(hidden_states, residual) + ln_mlp, _ = self.ln_mlp(residual) + return ln_attn, ln_mlp, residual + + class FlashRWLargeLayer(nn.Module): def __init__(self, layer_id, config, weights): super().__init__() prefix = f"transformer.h.{layer_id}" - self.ln_attn = FastLayerNorm.load( - prefix=f"{prefix}.ln_attn", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.ln_mlp = FastLayerNorm.load( - prefix=f"{prefix}.ln_mlp", - weights=weights, - eps=config.layer_norm_epsilon, - ) + + self.ln_layer = FlashRWLayerNorm(config, prefix, weights) self.self_attention = FlashRWLargeAttention( config, @@ -503,8 +546,8 @@ class FlashRWLargeLayer(nn.Module): input_lengths, max_s, ): - ln_attn, residual = self.ln_attn(hidden_states, residual) - ln_mlp, _ = self.ln_mlp(residual) + # Layer norm. + ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual) # Self attention. attn_output = self.self_attention( diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index bbb603a7..d2f6d9af 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -6,14 +6,16 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, SpeculativeHead, TensorParallelEmbedding, - FastLayerNorm, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) def load_multi_mqa( @@ -80,13 +82,13 @@ def _load_multi_mqa_gptq( g_idx = g_idx.to(device=weights.device) elif quant_method == "awq": g_idx = None - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = HAS_EXLLAMA weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index ed77af78..3e2ce4f9 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -27,15 +27,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, 
flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, - FastRMSNorm, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, + FastRMSNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index cb2ee7db..51fd7c02 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -29,7 +29,7 @@ from text_generation_server.models.custom_modeling.vlm import ( ) from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, @@ -683,9 +683,9 @@ class Idefics2ForConditionalGeneration(nn.Module): def __init__(self, prefix, config, weights): super().__init__() config.vision_config.quantize = config.quantize - config.vision_config.use_medusa = config.use_medusa + config.vision_config.speculator = config.speculator config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa + config.text_config.speculator = config.speculator vision_config = config.vision_config self.text_model = load_text_model( diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index ee4cdb08..786ef559 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -47,20 +47,22 @@ from text_generation_server.models.custom_modeling.idefics_vision import ( from text_generation_server.models.custom_modeling.idefics_perceiver import ( IdeficsPerceiverResampler, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, SpeculativeHead, - PositionRotaryEmbedding, FastLinear, ) -from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils.import_utils import SYSTEM -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm -elif IS_ROCM_SYSTEM: - from vllm import layernorm_ops +elif SYSTEM == "rocm": + from vllm._C import ops +else: + dropout_layer_norm = None @dataclass @@ -373,7 +375,7 @@ class IdeficsRMSNorm(nn.Module): hidden_states = hidden_states.to(self.weight.dtype) return self.weight * hidden_states - elif IS_CUDA_SYSTEM: + elif SYSTEM == "cuda": # faster post attention rms norm unwrap = False if len(hidden_states.shape) > 2: @@ -405,7 +407,7 @@ class IdeficsRMSNorm(nn.Module): normed_hidden_states = normed_hidden_states.view(*shape) return normed_hidden_states - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. 
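# [Editorial sketch, not part of the PR diff] Plain-PyTorch equivalent of the fused
# RMSNorm kernels dispatched above (dropout_layer_norm on CUDA, vLLM `ops.rms_norm` on
# ROCm): scale by the reciprocal root-mean-square of the last dimension, then by the
# learned weight. Function name is illustrative.
import torch

def rms_norm_reference(hidden_states, weight, eps=1e-6):
    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states.to(torch.float32) * torch.rsqrt(variance + eps)
    return weight * hidden_states.to(weight.dtype)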
if residual is not None: hidden_states += residual @@ -418,7 +420,7 @@ class IdeficsRMSNorm(nn.Module): hidden_states = hidden_states.reshape(-1, shape[-1]) out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( + ops.rms_norm( out, hidden_states, self.weight.data, diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py index 477d4d70..af44490b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py +++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py @@ -41,7 +41,7 @@ from typing import Optional, Tuple import torch import torch.nn as nn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py index c521dd0a..30c5997f 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_vision.py +++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -28,7 +28,7 @@ from transformers.utils import ( ModelOutput, logging, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 0d93791f..319a6d28 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -21,18 +21,12 @@ import torch.utils.checkpoint from torch import nn from transformers.activations import ACT2FN +from transformers.models.llava_next.modeling_llava_next import ( + unpad_image, +) +from optimum.habana.transformers.models import GaudiLlavaNextForConditionalGeneration from transformers.image_processing_utils import select_best_resolution -from text_generation_server.models.custom_modeling.vlm import ( - load_text_model, - load_vision_model, -) -from text_generation_server.utils.layers import ( - TensorParallelColumnLinear, - TensorParallelRowLinear, -) - - def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): """ Calculate the shape of the image patch grid after the preprocessing for images of any resolution. @@ -56,100 +50,13 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): return height // patch_size, width // patch_size -def unpad_image(tensor, original_size): - """ - Unpads a PyTorch tensor of a padded and resized image. - - Args: - tensor (`torch.Tensor`): - The image tensor, assumed to be of shape (num_channels, height, width). - original_size (`tuple`): - The original size of the image (height, width). - - Returns: - `torch.Tensor`: The unpadded image tensor. 
- """ - original_height, original_width = original_size - current_height, current_width = tensor.shape[1:] - - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height - - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding : current_height - padding, :] - else: - scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding : current_width - padding] - - return unpadded_tensor - - -# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext -class LlavaNextMultiModalProjector(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - - self.linear_1 = TensorParallelColumnLinear.load( - prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True - ) - self.act = ACT2FN[config.projector_hidden_act] - self.linear_2 = TensorParallelRowLinear.load( - prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True - ) - - def forward(self, image_features): - hidden_states = self.linear_1(image_features) - hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) - return hidden_states - - -class LlavaNextForConditionalGeneration(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - config.vision_config.quantize = config.quantize - vision_config = config.vision_config - # Instead of selecting in hidden_states[-2]. - # Instead compute only the n -2 + 1 layers and don't pool - if config.vision_feature_layer < 0: - vision_config.num_hidden_layers += config.vision_feature_layer + 1 - else: - vision_config.num_hidden_layers = config.vision_feature_layer + 1 - self.vision_tower = load_vision_model( - prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", - config=config.vision_config, - weights=weights, - ) - - self.multi_modal_projector = LlavaNextMultiModalProjector( - prefix="multi_modal_projector", config=config, weights=weights - ) - - self.image_newline = weights.get_tensor("image_newline") - - self.vocab_size = config.text_config.vocab_size - self.config = config - config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa - self.language_model = load_text_model( - prefix="language_model" if not prefix else f"{prefix}.language_model", - config=config.text_config, - weights=weights, - ) - self.pad_token_id = ( - config.pad_token_id if config.pad_token_id is not None else -1 - ) - +class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): + def _merge_input_ids_with_image_features( self, - input_ids: torch.Tensor, inputs_embeds: torch.Tensor, image_features: torch.Tensor, + input_ids: torch.Tensor, ): """In place merges in vision_embeddings with inputs_embeds.""" mask = input_ids == self.config.image_token_index @@ -164,120 +71,215 @@ class LlavaNextForConditionalGeneration(nn.Module): def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - cu_seqlen_prefill: Optional[torch.Tensor], - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, - slots: torch.Tensor, - input_lengths: torch.Tensor, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], - lm_head_indices: 
Optional[torch.Tensor] = None, + input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, - # Unused for this model - pixel_attention_mask=None, image_sizes: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, ): - inputs_embeds = self.language_model.embed_tokens(input_ids) - if pixel_values is not None and len(pixel_values) > 0: - # num_special_image_tokens = (input_ids == self.config.image_token_index).sum() - # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid" - # 1. Extract the input embeddings - # 2. Merge text and images - num_images, num_patches, channels, height, width = pixel_values.shape - pixel_values = pixel_values.view( - num_images * num_patches, channels, height, width + if token_idx is not None: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - image_features = self.vision_tower(pixel_values) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) - # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer] - # Already done within the clip model - selected_image_feature = image_features.last_hidden_state + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) - if self.config.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.config.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature + logits = outputs[0] + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return outputs + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + **kwargs, + ): + """ + Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L635 + The only differences are: + - add new args token_idx + - add the process of merging images into inputs_embeds + """ + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, + ) else: - raise RuntimeError( - f"Strategy 
`{self.config.vision_feature_select_strategy}` is not supported/valid." + + position_ids = kwargs.get("position_ids", None) + labels = kwargs.get("labels", None) + if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1: + vision_feature_select_strategy = kwargs.get("vision_feature_select_strategy", None) + vision_feature_layer = kwargs.get("vision_feature_layer", None) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + + # 1. Extract the input embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + # 2. Merge text and images + batch_size, num_patches, num_channels, height, width = pixel_values.shape + reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width) + image_features = self.vision_tower( + reshaped_pixel_values, output_hidden_states=True + ) + + selected_image_feature = image_features.hidden_states[vision_feature_layer] + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + + image_features = self.multi_modal_projector(selected_image_feature) + + # split up image_features for each of the individual images + # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) + # if we assume each image has 5 image features (base image + 4 patches) + split_sizes = [image.shape[0] for image in pixel_values] + image_features = torch.split(image_features, split_sizes, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size + + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + + if height * width != base_image_feature.shape[0]: + raise ValueError("The number of patches is not consistent with the image size.") + + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx].tolist(), + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat( + ( + image_feature, + self.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + image_feature = torch.cat((image_feature, self.image_newline[None]), dim=0) + new_image_features.append(image_feature) + image_features = torch.stack(new_image_features, dim=0) + inputs_embeds = self._merge_input_ids_with_image_features(inputs_embeds, image_features, input_ids) + self.image_offset = image_features.shape[1] - 1 # image_token has occupied 1 token position. 
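# [Editorial sketch, not part of the PR diff] Toy example of the position-id
# construction used below for batched generation with left padding: the cumulative sum
# of the attention mask minus one gives 0-based positions for real tokens, and padded
# slots are filled with a dummy value of 1.
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1   # [[-1, -1, 0, 1, 2]]
position_ids.masked_fill_(attention_mask == 0, 1)     # [[ 1,  1, 0, 1, 2]]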
+ # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of + # generation with cache + elif past_key_values is not None: + seq_len = input_ids.shape[1] + pad_len = seq_len - token_idx.item() + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + past_length = first_layer_past_key_value.shape[-1] + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = extended_attention_mask + attention_mask[:, -pad_len:] = 0 + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + } ) - image_features = self.multi_modal_projector(selected_image_feature) - - # split up image_features for each of the individual images - # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) - # if we assume each image has 5 image features (base image + 4 patches) - split_sizes = [num_patches] * num_images - image_features = torch.split(image_features, split_sizes, dim=0) - - # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - height = width = ( - self.config.vision_config.image_size - // self.config.vision_config.patch_size - ) - - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - - if height * width != base_image_feature.shape[0]: - raise ValueError( - "The number of patches is not consistent with the image size." 
- ) - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_sizes[image_idx], - self.config.image_grid_pinpoints, - self.config.vision_config.image_size, - ) - image_feature = image_feature.view( - num_patch_height, num_patch_width, height, width, -1 - ) - image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image(image_feature, image_sizes[image_idx]) - image_feature = torch.cat( - ( - image_feature, - self.image_newline[:, None, None].expand( - *image_feature.shape[:-1], 1 - ), - ), - dim=-1, - ) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat( - (base_image_feature, image_feature), dim=0 - ) - else: - image_feature = image_feature[0] - image_feature = torch.cat( - (image_feature, self.image_newline[None]), dim=0 - ) - new_image_features.append(image_feature) - image_features = torch.stack(new_image_features, dim=0) - - inputs_embeds = self._merge_input_ids_with_image_features( - input_ids, inputs_embeds, image_features - ) - - hidden_states = self.language_model.model( - inputs_embeds=inputs_embeds, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - input_lengths=input_lengths, - max_s=max_s, - true_max_s=max_s, - prefill_cache_indices=None, - ) - if lm_head_indices is not None: - hidden_states = hidden_states[lm_head_indices] - logits, speculative_logits = self.language_model.lm_head(hidden_states) - return logits, speculative_logits + return model_inputs \ No newline at end of file diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py index c58a617f..293051c2 100644 --- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -8,12 +8,12 @@ from typing import Optional, Tuple, Any from transformers.configuration_utils import PretrainedConfig import torch.nn.functional as F -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( SpeculativeHead, TensorParallelEmbedding, - FastRMSNorm, FastLinear, ) +from text_generation_server.layers.layernorm import FastRMSNorm from einops import rearrange from causal_conv1d import causal_conv1d_fn, causal_conv1d_update diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py index 9b0f8b92..f7981bf5 100644 --- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py @@ -17,7 +17,7 @@ from transformers.modeling_outputs import ( ) from einops import rearrange from packaging import version -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index 1b060060..fcad32fa 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -40,7 +40,7 @@ from transformers.modeling_outputs import ( from transformers.modeling_utils import 
PreTrainedModel from transformers import GPTNeoXConfig from loguru import logger -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, @@ -60,9 +60,6 @@ if ( except ImportError: pass -if not CUSTOM_KERNELS_ENABLED: - logger.warning("We're not using custom kernels.") - def make_causal_mask( input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py index 7a5cf917..83d62dea 100644 --- a/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -27,7 +27,7 @@ from transformers.modeling_outputs import ( ) from transformers.modeling_utils import PreTrainedModel from transformers import OPTConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/phi_modeling.py b/server/text_generation_server/models/custom_modeling/phi_modeling.py index 1571f9fd..04b470eb 100644 --- a/server/text_generation_server/models/custom_modeling/phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/phi_modeling.py @@ -9,7 +9,7 @@ from typing import Optional, List, Tuple, Any from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/siglip.py b/server/text_generation_server/models/custom_modeling/siglip.py new file mode 100644 index 00000000..5fbc6d29 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/siglip.py @@ -0,0 +1,436 @@ +from typing import Optional, Tuple, Union + +import math +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import ( + _create_4d_causal_attention_mask, + _prepare_4d_attention_mask, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, +) +from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig + +from text_generation_server.layers.tensor_parallel import ( + TensorParallelEmbedding, + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 
self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + self.register_buffer( + "position_ids", + torch.arange(self.num_positions, device=weights.device).expand((1, -1)), + persistent=False, + ) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + patch_embeds = self.patch_embedding( + pixel_values + ) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.head_size = self.head_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=True + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=True + ) + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=True + ) + self.out_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + # scale post matmul + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(attn_weights.dtype) + 
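# [Editorial sketch, not part of the PR diff] Reference for the bmm-based attention in
# SiglipAttention: q/k/v have shape (batch * num_heads, seq, head_dim), and the optional
# additive mask must be broadcastable to (batch * num_heads, seq_q, seq_k). Dropout and
# the output projection are omitted; the function name is illustrative.
import torch

def bmm_attention_reference(q, k, v, scale, attention_mask=None):
    scores = torch.bmm(q, k.transpose(1, 2)) * scale   # scale applied post matmul
    if attention_mask is not None:
        scores = scores + attention_mask
    probs = torch.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
    return torch.bmm(probs, v)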
attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class SiglipMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( # config.hidden_size, config.intermediate_size + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( # config.intermediate_size, config.hidden_size + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SiglipEncoderLayer(nn.Module): + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = SiglipAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps + ) + self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.FloatTensor]: + residual = hidden_states + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states, None + + +class SiglipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention( + config.hidden_size, config.num_attention_heads, batch_first=True + ) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(prefix, config, weights) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +import warnings + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on 
https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, + mean: float = 0.0, + std: float = 1.0, + a: float = -2.0, + b: float = 2.0, +) -> torch.Tensor: + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +from torch.nn.init import _calculate_fan_in_and_fan_out + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + + +from transformers import PreTrainedModel + + +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ + Args: + config: SiglipConfig + """ + + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + ): + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + hidden_states, _ = encoder_layer( + hidden_states, + attention_mask, + ) + + return hidden_states + + +class SiglipVisionTransformer(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = SiglipEncoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + r""" + Returns: + + """ + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + # NOTE: up until this point, the hidden states are exactly + # the same as in the transformers code. The values evaluate + # slightly differently in our encoder layer. + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + ) + last_hidden_state = encoder_outputs + post_last_hidden_state = self.post_layernorm(last_hidden_state) + + return BaseModelOutputWithPooling( + last_hidden_state=post_last_hidden_state, + # pooler_output=pooled_output, + # hidden_states=encoder_outputs, + ) diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py index 2773fb15..0b899fba 100644 --- a/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -38,7 +38,7 @@ from transformers.utils import ( is_torch_fx_proxy, ) from transformers import T5Config -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py index 690957d0..b74b43ff 100644 --- a/server/text_generation_server/models/custom_modeling/vlm.py +++ b/server/text_generation_server/models/custom_modeling/vlm.py @@ -11,6 +11,18 @@ def load_text_model(prefix, config, weights, name=None): ) return FlashMistralForCausalLM(prefix, config, weights, name=name) + elif config.model_type == "gemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights, causal=False) + elif config.model_type == "paligemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights) else: raise RuntimeError(f"Unsupported model type {config.model_type}") @@ -24,5 +36,13 @@ def load_vision_model(prefix, config, weights): return CLIPVisionTransformer(
prefix=f"{prefix}.vision_model", config=config, weights=weights ) + if config.model_type == "siglip_vision_model": + from text_generation_server.models.custom_modeling.siglip import ( + SiglipVisionTransformer, + ) + + return SiglipVisionTransformer( + prefix=f"vision_tower.vision_model", config=config, weights=weights + ) else: raise RuntimeError(f"Unsupported model type {config.model_type}") diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index a6d0204f..86d9b4c8 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -13,8 +13,11 @@ from opentelemetry import trace from transformers import PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type, Dict +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.utils.dist import RANK from text_generation_server.utils.speculate import get_speculate from text_generation_server.models.types import ( Batch, @@ -29,16 +32,18 @@ from text_generation_server.models.cache_manager import ( ) from text_generation_server.pb import generate_pb2 from text_generation_server.models.globals import MEM_POOL, CUDA_GRAPHS +import text_generation_server.models.globals as tgi_globals from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION -tracer = trace.get_tracer(__name__) from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, + empty_cache, + synchronize, + get_free_memory, ) +tracer = trace.get_tracer(__name__) + @dataclass class FlashCausalLMBatch(Batch): @@ -133,6 +138,17 @@ class FlashCausalLMBatch(Batch): device: torch.device, ) -> "FlashCausalLMBatch": batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer) + return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + + @classmethod + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + ) -> "FlashCausalLMBatch": position_ids = [] speculative_ids = [] cu_seqlen_prefill = [0] @@ -207,6 +223,7 @@ class FlashCausalLMBatch(Batch): # Paged attention # Remove one as the first token des not have a past speculative_length = get_speculate() + speculative_length = 0 if speculative_length is None else speculative_length total_tokens = input_length + max_new_tokens - 1 + speculative_length needed_blocks = math.ceil(total_tokens / BLOCK_SIZE) blocks += needed_blocks @@ -757,10 +774,8 @@ class FlashCausalLM(Model): def warmup(self, batch: FlashCausalLMBatch): # The warmup batch is the biggest batch we could ever receive - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.empty_cache() - elif IS_XPU_SYSTEM: - torch.xpu.empty_cache() + empty_cache() + try: cache_manager = set_cache_manager( batch.blocks, @@ -773,6 +788,9 @@ class FlashCausalLM(Model): ) max_bt = batch.max_blocks max_s = max_bt * get_cache_manager().block_size + + if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False): + torch.cuda.tunable.tuning_enable(False) _, batch, _ = self.generate_token(batch) except torch.cuda.OutOfMemoryError as e: raise RuntimeError( @@ 
-780,10 +798,7 @@ class FlashCausalLM(Model): f"You need to decrease `--max-batch-prefill-tokens`" ) from e - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.synchronize(self.device) - elif IS_XPU_SYSTEM: - torch.xpu.synchronize(self.device) + synchronize(self.device) # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) # Calculate the number of blocks that can be allocated with the free memory @@ -791,20 +806,7 @@ class FlashCausalLM(Model): cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - total_free_memory, _ = torch.cuda.mem_get_info(self.device) - total_gpu_memory = torch.cuda.get_device_properties( - self.device - ).total_memory - - free_memory = max( - 0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory - ) - elif IS_XPU_SYSTEM: - total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory - free_memory = int(total_gpu_memory * 0.5) - else: - raise NotImplementedError("FlashModel is only available on GPU") + free_memory = get_free_memory(self.device, MEMORY_FRACTION) num_blocks = ( # Leave 5% for some wiggle room @@ -826,6 +828,49 @@ class FlashCausalLM(Model): self.device, ) + if SYSTEM == "rocm": + if ( + os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None + or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1" + ): + if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0": + torch.cuda.tunable.tuning_enable(True) + + if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None: + tuning_sequences = [ + int(val) + for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") + ] + else: + tuning_sequences = CUDA_GRAPHS + + tunableop_filepath = os.path.join( + HUGGINGFACE_HUB_CACHE, + f"tunableop_{tgi_globals.MODEL_ID.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv", + ) + + logger.info( + f"PyTorch TunableOp (https://github.com/fxmarty/pytorch/tree/2.3-patched/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`." + ) + + if os.path.isfile(tunableop_filepath): + logger.info( + f"The file {tunableop_filepath} already exists and will be reused." + ) + torch.cuda.tunable.read_file(tunableop_filepath) + + os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True) + + for seqlen in tuning_sequences: + logger.info(f"Warming up TunableOp for seqlen={seqlen}") + self.tunableop_warmup(seqlen) + torch.cuda.tunable.write_file(tunableop_filepath) + torch.cuda.tunable.tuning_enable(False) + else: + logger.info( + "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp." 
+ ) + if CUDA_GRAPHS: try: logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}") @@ -840,6 +885,30 @@ class FlashCausalLM(Model): return int(num_blocks * BLOCK_SIZE) + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) + kv_cache = get_cache_manager().kv_cache + + # Dummy value, some models (starcoder2) don't accept `None`. + input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. + self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=torch.tensor( + [0, seqlen], device=self.device, dtype=torch.int32 + ), + kv_cache=get_cache_manager().kv_cache, + block_tables=None, + input_lengths=input_lengths, + slots=slots, + max_s=seqlen, + lm_head_indices=None, + ) + def forward( self, batch: FlashCausalLMBatch ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -1119,6 +1188,10 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 + if n_accepted_ids > 1: + if RANK == 0: + logger.debug(f"Speculated ids {n_accepted_ids - 1}") + current_stopped = False for j in range(index, index + n_accepted_ids): # Generated token diff --git a/server/text_generation_server/models/flash_cohere.py b/server/text_generation_server/models/flash_cohere.py index f85c7722..b907ee08 100644 --- a/server/text_generation_server/models/flash_cohere.py +++ b/server/text_generation_server/models/flash_cohere.py @@ -24,7 +24,7 @@ class FlashCohere(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -49,7 +49,7 @@ class FlashCohere(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_dbrx.py b/server/text_generation_server/models/flash_dbrx.py index 367d3db0..d5eb1a6e 100644 --- a/server/text_generation_server/models/flash_dbrx.py +++ b/server/text_generation_server/models/flash_dbrx.py @@ -26,7 +26,7 @@ class FlashDbrx(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -74,7 +74,7 @@ class FlashDbrx(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_gemma.py b/server/text_generation_server/models/flash_gemma.py index 7259b820..53bfd064 100644 --- a/server/text_generation_server/models/flash_gemma.py +++ b/server/text_generation_server/models/flash_gemma.py @@ -3,12 +3,11 @@ import torch.distributed from opentelemetry import trace from typing import Optional -from transformers.models.gemma import GemmaTokenizerFast +from transformers import AutoConfig, AutoTokenizer from text_generation_server.models import 
FlashCausalLM from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( FlashGemmaForCausalLM, - GemmaConfig, ) from text_generation_server.utils import ( initialize_torch_distributed, @@ -25,7 +24,7 @@ class FlashGemma(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -36,21 +35,19 @@ class FlashGemma(FlashCausalLM): else: raise NotImplementedError("FlashGemma is only available on GPU") - tokenizer = GemmaTokenizerFast.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( model_id, revision=revision, padding_side="left", truncation_side="left", trust_remote_code=trust_remote_code, - use_fast=True, - from_slow=False, ) - config = GemmaConfig.from_pretrained( + config = AutoConfig.from_pretrained( model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) @@ -59,7 +56,9 @@ class FlashGemma(FlashCausalLM): if config.quantize in ["gptq", "awq"]: weights._set_gptq_params(model_id, revision) - model = FlashGemmaForCausalLM(config, weights) + # TODO hardcoded + prefix = "language_model" + model = FlashGemmaForCausalLM(prefix, config, weights, causal=True) torch.distributed.barrier(group=self.process_group) super(FlashGemma, self).__init__( diff --git a/server/text_generation_server/models/flash_gpt2.py b/server/text_generation_server/models/flash_gpt2.py new file mode 100644 index 00000000..0067a806 --- /dev/null +++ b/server/text_generation_server/models/flash_gpt2.py @@ -0,0 +1,77 @@ +import torch +import torch.distributed + +from opentelemetry import trace +from transformers import AutoConfig, AutoTokenizer, GenerationConfig +from transformers.models.gpt2 import GPT2Tokenizer +from typing import Optional + +from text_generation_server.models import FlashCausalLM +from text_generation_server.models.custom_modeling.flash_gpt2_modeling import ( + FlashGPT2ForCausalLM, +) +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) +from text_generation_server.utils.import_utils import SYSTEM + +tracer = trace.get_tracer(__name__) + + +class FlashGPT2(FlashCausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.process_group, rank, world_size = initialize_torch_distributed() + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + dtype = torch.float16 if dtype is None else dtype + elif SYSTEM == "xpu": + device = torch.device(f"xpu:{rank}") + dtype = torch.float16 if dtype is None else dtype + else: + raise NotImplementedError("FlashGPT2 is only available on GPU") + + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + config = AutoConfig.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + config.quantize = quantize + config.speculator = speculator + + torch.distributed.barrier(group=self.process_group) + + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights(filenames, device, 
dtype, process_group=self.process_group) + if config.quantize in ["gptq", "awq"]: + weights._set_gptq_params(model_id, revision) + + prefix = "" + model = FlashGPT2ForCausalLM(prefix, config, weights) + torch.distributed.barrier(group=self.process_group) + super(FlashGPT2, self).__init__( + model=model, + tokenizer=tokenizer, + num_layers=len(model.model.layers), + num_kv_heads=model.model.num_heads, + head_size=model.model.head_size, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 609a188d..9a7dfaee 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -3,7 +3,6 @@ import torch.distributed from opentelemetry import trace from transformers import AutoConfig, AutoTokenizer, GenerationConfig -from transformers.models.llama import LlamaTokenizer from typing import Optional from text_generation_server.models import FlashCausalLM @@ -18,7 +17,7 @@ from text_generation_server.utils import ( tracer = trace.get_tracer(__name__) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM class FlashLlama(FlashCausalLM): @@ -27,7 +26,7 @@ class FlashLlama(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -35,28 +34,19 @@ class FlashLlama(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: raise NotImplementedError("FlashLlama is only available on GPU") - try: - tokenizer = LlamaTokenizer.from_pretrained( - model_id, - revision=revision, - padding_side="left", - truncation_side="left", - trust_remote_code=trust_remote_code, - ) - except Exception: - tokenizer = AutoTokenizer.from_pretrained( - model_id, - revision=revision, - padding_side="left", - truncation_side="left", - trust_remote_code=trust_remote_code, - ) + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) try: generation_config = GenerationConfig.from_pretrained( model_id, revision=revision, trust_remote_code=trust_remote_code @@ -71,7 +61,7 @@ class FlashLlama(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 88b9fe63..87e192e0 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -33,7 +33,7 @@ tracer = trace.get_tracer(__name__) # Will be set in init SLIDING_WINDOW: Optional[int] = None SLIDING_WINDOW_BLOCKS: Optional[int] = None -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None @@ -121,6 
+121,11 @@ class FlashMistralBatch(FlashCausalLMBatch): requests_idx_mapping[r.id] = i tokenized_input = tokenized_input[-r.truncate :] + if ( + tokenized_input[0] == tokenizer.bos_token_id + and tokenized_input[1] == tokenizer.bos_token_id + ): + tokenized_input = tokenized_input[1:] input_length = len(tokenized_input) input_lengths.append(input_length) @@ -308,7 +313,7 @@ class BaseFlashMistral(FlashCausalLM): config_cls=AutoConfig, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, tokenizer_class=AutoTokenizer, @@ -317,7 +322,7 @@ class BaseFlashMistral(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: @@ -335,7 +340,7 @@ class BaseFlashMistral(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if getattr(config, "sliding_window", None) is not None: @@ -386,6 +391,31 @@ class BaseFlashMistral(FlashCausalLM): def batch_type(self) -> Type[FlashMistralBatch]: return FlashMistralBatch + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) + kv_cache = get_cache_manager().kv_cache + + # Dummy value, some models (starcoder2) don't accept `None`. + input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. 
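+        # This dummy prefill pass exercises the GEMM shapes used for a prompt of length `seqlen`,
+        # so that the ROCm TunableOp warmup (see `warmup` in flash_causal_lm.py) can record the best kernels for them.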
+ self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=torch.tensor( + [0, seqlen], device=self.device, dtype=torch.int32 + ), + kv_cache=get_cache_manager().kv_cache, + block_tables=None, + input_lengths=input_lengths, + slots=slots, + max_s=seqlen, + lm_head_indices=None, + prefill_cache_indices=None, + ) + def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) @@ -562,7 +592,7 @@ class FlashMistral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -572,7 +602,7 @@ class FlashMistral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_mixtral.py b/server/text_generation_server/models/flash_mixtral.py index 2ee35e82..587d423f 100644 --- a/server/text_generation_server/models/flash_mixtral.py +++ b/server/text_generation_server/models/flash_mixtral.py @@ -15,7 +15,7 @@ class FlashMixtral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -25,7 +25,7 @@ class FlashMixtral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index f82e27db..adefaeb2 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -14,7 +14,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -25,7 +25,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -33,7 +33,7 @@ class FlashNeoXSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: @@ -51,7 +51,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/flash_phi.py b/server/text_generation_server/models/flash_phi.py index cb55f9e6..32b573a9 100644 --- a/server/text_generation_server/models/flash_phi.py +++ b/server/text_generation_server/models/flash_phi.py @@ -25,7 +25,7 
@@ class FlashPhi(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class FlashPhi(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) @@ -58,7 +58,7 @@ class FlashPhi(FlashCausalLM): weights._set_gptq_params(model_id, revision) model = FlashPhiForCausalLM(config, weights) - if use_medusa: + if speculator: from text_generation_server.utils.medusa import MedusaModel from huggingface_hub import hf_hub_download import json @@ -66,19 +66,19 @@ class FlashPhi(FlashCausalLM): from pathlib import Path is_local_model = ( - Path(use_medusa).exists() and Path(use_medusa).is_dir() + Path(speculator).exists() and Path(speculator).is_dir() ) or os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None if not is_local_model: medusa_config = hf_hub_download( - use_medusa, revision=revision, filename="config.json" + speculator, revision=revision, filename="config.json" ) medusa_head = hf_hub_download( - use_medusa, revision=revision, filename="medusa_lm_head.pt" + speculator, revision=revision, filename="medusa_lm_head.pt" ) else: - medusa_config = str(Path(use_medusa) / "config.json") - medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt") + medusa_config = str(Path(speculator) / "config.json") + medusa_head = str(Path(speculator) / "medusa_lm_head.pt") with open(medusa_config, "r") as f: config = json.load(f) diff --git a/server/text_generation_server/models/flash_qwen2.py b/server/text_generation_server/models/flash_qwen2.py index cb3cf6b0..59064b30 100644 --- a/server/text_generation_server/models/flash_qwen2.py +++ b/server/text_generation_server/models/flash_qwen2.py @@ -30,7 +30,7 @@ class FlashQwen2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class FlashQwen2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index ccf38a0c..e6350611 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -15,7 +15,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -26,7 +26,7 @@ class FlashRWSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -34,7 +34,7 @@ class FlashRWSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = 
torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: @@ -66,7 +66,7 @@ class FlashRWSharded(FlashCausalLM): ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator if config.quantize == "gptq": weights._set_gptq_params(model_id, revision) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index e66f1bf8..2ad36b93 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -18,7 +18,7 @@ from text_generation_server.utils import ( Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -29,7 +29,7 @@ class FlashSantacoderSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -37,7 +37,7 @@ class FlashSantacoderSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: @@ -57,7 +57,7 @@ class FlashSantacoderSharded(FlashCausalLM): trust_remote_code=True, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.transpose = config.architectures[0].startswith("GPT2") torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_starcoder2.py b/server/text_generation_server/models/flash_starcoder2.py index 68e726d8..dc5d49be 100644 --- a/server/text_generation_server/models/flash_starcoder2.py +++ b/server/text_generation_server/models/flash_starcoder2.py @@ -29,7 +29,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index a46f86be..4656fd45 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -167,7 +167,7 @@ class GalacticaSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -195,7 +195,7 @@ class GalacticaSharded(CausalLM): ) config.quantize = quantize tokenizer.pad_token_id = config.pad_token_id - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/globals.py 
b/server/text_generation_server/models/globals.py index 6f8d1017..e8a11958 100644 --- a/server/text_generation_server/models/globals.py +++ b/server/text_generation_server/models/globals.py @@ -15,3 +15,12 @@ else: cuda_graphs = None CUDA_GRAPHS = cuda_graphs + +# This is overridden at model loading. +global MODEL_ID +MODEL_ID = None + + +def set_model_id(model_id: str): + global MODEL_ID + MODEL_ID = model_id diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 1c4cfe7d..c0e1adf2 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -24,7 +24,7 @@ class GPTNeoxSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -51,7 +51,7 @@ class GPTNeoxSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py index 30bf4aa6..c1fe03e4 100644 --- a/server/text_generation_server/models/idefics.py +++ b/server/text_generation_server/models/idefics.py @@ -31,7 +31,7 @@ class IDEFICSSharded(IdeficsCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.vision_config.quantize = quantize tokenizer = LlamaTokenizerFast.from_pretrained( diff --git a/server/text_generation_server/models/idefics2.py b/server/text_generation_server/models/idefics2.py index e831af89..314c0500 100644 --- a/server/text_generation_server/models/idefics2.py +++ b/server/text_generation_server/models/idefics2.py @@ -18,7 +18,7 @@ class Idefics2(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -35,7 +35,7 @@ class Idefics2(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/llava_next.py b/server/text_generation_server/models/llava_next.py index 3983bc85..effe8b91 100644 --- a/server/text_generation_server/models/llava_next.py +++ b/server/text_generation_server/models/llava_next.py @@ -18,7 +18,7 @@ class LlavaNext(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -30,7 +30,7 @@ class LlavaNext(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git 
a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py index 0884317e..d9f90590 100644 --- a/server/text_generation_server/models/mamba.py +++ b/server/text_generation_server/models/mamba.py @@ -408,7 +408,7 @@ class Mamba(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -445,7 +445,7 @@ class Mamba(Model): tokenizer.pad_token = tokenizer.eos_token config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) @@ -522,6 +522,30 @@ class Mamba(Model): } self.cuda_graphs[batch_size] = graph_dict + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros((seqlen, 1), dtype=torch.int64, device=self.device) + n_blocks = len(self.model.blocks) + + d_state = self.model.config.d_state + d_conv = self.model.config.d_conv + # d_inner already takes the expand multiplication into account + d_inner = self.model.config.d_inner + + # Important: seqlen_offset > 0 so that the forward pass goes through the state update mechanism + seqlen_offset = 1 + inference_params = new_inference_params( + n_blocks=n_blocks, + batch_size=seqlen, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + seqlen_offset=seqlen_offset, + device=self.device, + dtype=self.dtype, + ) + + self.model.forward(input_ids=input_ids, inference_params=inference_params) + def forward( self, input_ids: torch.Tensor, inference_params: Any ) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py index 6b3f29a6..8d8b4909 100644 --- a/server/text_generation_server/models/mpt.py +++ b/server/text_generation_server/models/mpt.py @@ -43,7 +43,7 @@ class MPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -76,7 +76,7 @@ class MPTSharded(CausalLM): config = json.load(f) config = PretrainedConfig(**config) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 703e5b58..5b84f4ff 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -22,7 +22,7 @@ class OPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class OPTSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator tokenizer.pad_token_id = config.pad_token_id torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/pali_gemma.py b/server/text_generation_server/models/pali_gemma.py new file mode 100644 index 00000000..d94b9526 --- /dev/null +++ b/server/text_generation_server/models/pali_gemma.py
@@ -0,0 +1,123 @@ +import torch +import torch.distributed +from opentelemetry import trace +from typing import Optional, Tuple +from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLM, + VlmCausalLMBatch, + image_text_replacement, + load_data_uri, + split, +) +from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import ( + PaliGemmaForConditionalGeneration, +) +from transformers import AutoProcessor, AutoConfig, AutoImageProcessor + +tracer = trace.get_tracer(__name__) + + +class PaliGemmaBatch(VlmCausalLMBatch): + @classmethod + def batch_tokenized_inputs(cls, requests, tokenizer, processor, config): + batch_inputs = [] + image_inputs = [] + max_truncation = 0 + for r in requests: + chunks = split(r.inputs) + full_text = "" + image_id = 0 + for chunk in chunks: + if chunk["type"] == "text": + full_text += "" + chunk["content"] + "\n" + elif chunk["type"] == "image": + image = chunk["content"] + # Should never receive URLs anymore, processing should be done + # On the rust layer. + # This avoid making n queries per TP + # if image.startswith("https://") or image.startswith("http://"): + # image = processor.image_processor.fetch_images(image) + if image.startswith("data:"): + image = load_data_uri(image) + else: + raise RuntimeError( + "Cannot process input image not starting with data:" + ) + # TODO do_convert_RGB should be on by default ? + image = image.convert("RGB") + image_input = processor.image_processor(image, return_tensors="pt") + full_text += image_text_replacement(image_input, config, image_id) + image_inputs.append(image_input) + else: + raise RuntimeError(f"Invalid chunk type {chunk['type']}") + + batch_inputs.append(full_text) + max_truncation = max(max_truncation, r.truncate) + + batch_tokenized_inputs = tokenizer( + batch_inputs, + truncation=True, + max_length=max_truncation, + add_special_tokens=False, + )["input_ids"] + if image_inputs: + image_input = image_inputs[0] + new_image_inputs = { + "pixel_values": torch.cat( + [img["pixel_values"] for img in image_inputs], dim=0 + ), + } + if "pixel_attention_mask" in image_input: + new_image_inputs["pixel_attention_mask"] = torch.cat( + [img["pixel_attention_mask"] for img in image_inputs], dim=0 + ) + if "image_sizes" in image_input: + new_image_inputs["image_sizes"] = torch.cat( + [img["image_sizes"] for img in image_inputs], dim=0 + ) + image_inputs = new_image_inputs + else: + image_inputs = None + return batch_tokenized_inputs, image_inputs + + +class PaliGemma(VlmCausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.processor = AutoProcessor.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + super().__init__( + config_cls=AutoConfig, + model_cls=PaliGemmaForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + @property + def batch_type(self): + return PaliGemmaBatch + + def get_layer_config(self, model) -> Tuple[int, int, int]: + return ( + len(model.text_model.model.layers), + model.text_model.model.num_key_value_heads, + model.text_model.model.head_size, + ) + + def max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) diff --git a/server/text_generation_server/models/phi.py 
b/server/text_generation_server/models/phi.py index cc4e2505..d68866c1 100644 --- a/server/text_generation_server/models/phi.py +++ b/server/text_generation_server/models/phi.py @@ -22,7 +22,7 @@ class Phi(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class Phi(CausalLM): tokenizer.pad_token = tokenizer.eos_token config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py index 92c93542..d4764ded 100644 --- a/server/text_generation_server/models/rw.py +++ b/server/text_generation_server/models/rw.py @@ -12,11 +12,11 @@ class RW(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: + if speculator: raise RuntimeError("Medusa decoding is not enabled for AutoModel") if torch.cuda.is_available(): diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index a887555a..f07734c2 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -15,14 +15,14 @@ class SantaCoder(CausalLM): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): super().__init__( model_id=model_id, revision=revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index e55a661c..6a0c812f 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -532,12 +532,12 @@ class Seq2SeqLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") if torch.cuda.is_available(): device = torch.device("cuda") diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 3f3cb965..8e0735e5 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -25,7 +25,7 @@ class T5Sharded(Seq2SeqLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -43,7 +43,7 @@ class T5Sharded(Seq2SeqLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = 
speculator tokenizer = AutoTokenizer.from_pretrained( model_id, diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 5394feb5..5c6c90c6 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -1,28 +1,92 @@ import re import torch +import os +import time import math from PIL import Image from io import BytesIO import base64 - +import numpy from opentelemetry import trace +from loguru import logger from typing import Optional, Tuple, List, Type, Dict - +import itertools +import tempfile +import copy +from text_generation_server.models import Model from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import select_best_resolution +from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.pb import generate_pb2 -from text_generation_server.models.flash_mistral import ( - BaseFlashMistral, - FlashMistralBatch, +from text_generation_server.models.causal_lm import ( + CausalLMBatch, + CausalLMRequest, + remove_kv_cache_from_output, + biggest_single_chunk, ) + +from transformers.models.llava_next.modeling_llava_next import ( + get_anyres_image_grid_shape, +) + +from transformers import AutoProcessor +import text_generation_server.habana_quantization_env as hq_env +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi from text_generation_server.models.cache_manager import ( get_cache_manager, ) +from text_generation_server.utils import ( + HeterogeneousNextTokenChooser, + StoppingCriteria, + make_tokenizer_optional, + is_tokenizer_transparent, + pad_next_token_chooser_parameters, +) +import habana_frameworks.torch as htorch +from optimum.habana.utils import HabanaProfile +from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES +from optimum.habana.utils import get_hpu_memory_stats + +from transformers import ( + AutoTokenizer, + AutoModel, + PreTrainedTokenizerBase, + AutoConfig, +) +from optimum.habana.checkpoint_utils import ( + get_repo_root, + model_on_meta, + write_checkpoints_json, +) + +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.utils.debug import dbg_trace tracer = trace.get_tracer(__name__) IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") +BASE_IMAGE_TOKENS = int(os.environ.get('BASE_IMAGE_TOKENS', 2048)) +MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 8192)) +MAX_BATCH_TOTAL_TOKENS = int(os.environ.get('MAX_BATCH_TOTAL_TOKENS', 131072)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 256)) +CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +LAZY_MODE = int(os.environ.get('PT_HPU_LAZY_MODE', 1)) +PREFILL_WARMUP_BATCH_SIZE_LIST = [] +PREFILL_WARMUP_SEQLEN_LIST = [] +DECODE_WARMUP_BATCH_SIZE_LIST = [] +def round_up(warmup_list:list, num) : + i = 0 + for i in warmup_list: + if num <= i : + break + return i def split(string) -> List[Dict[str, str]]: parts = [] @@ -40,30 +104,6 @@ def split(string) -> List[Dict[str, str]]: return parts - -def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): - """ - Calculate the shape of the image patch grid after the preprocessing for images of any resolution. 
- - Args: - image_size (`tuple`): - The size of the input image in the format (width, height). - grid_pinpoints (`List`): - A list containing possible resolutions. Each item in the list should be a tuple or list - of the form `(height, width)`. - patch_size (`int`): - The size of each image patch. - - Returns: - tuple: The shape of the image patch grid in the format (width, height). - """ - if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") - - height, width = select_best_resolution(image_size, grid_pinpoints) - return height // patch_size, width // patch_size - - def image_text_replacement(image_input, config, image_id) -> str: if config.model_type == "idefics2": # TODO technically depends on image splitting which is not implemented. @@ -76,10 +116,11 @@ def image_text_replacement(image_input, config, image_id) -> str: elif config.model_type == "llava_next": height, width = image_input["image_sizes"][image_id] num_features = get_number_of_features(height, width, config) - from loguru import logger - logger.info(f"Found {num_features} in image of resolution {height}x{width}") return "" * num_features + + elif config.model_type == "paligemma": + return "" * config.text_config.num_image_tokens else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -121,6 +162,7 @@ def get_number_of_features(height: int, width: int, config) -> int: image_grid_pinpoints, image_size, ) + unpadded_features, newline_features = get_unpadded_features( height, width, npatches, num_patch_height, num_patch_width ) @@ -136,30 +178,104 @@ def load_data_uri(image_uri: str) -> Image.Image: return image -class VlmCausalLMBatch(FlashMistralBatch): +class VlmCausalLMBatch(CausalLMBatch): pixel_values: Optional[List[torch.Tensor]] pixel_attention_mask: Optional[List[torch.Tensor]] image_sizes: Optional[List[Tuple[int, int]]] @classmethod - @tracer.start_as_current_span("concatenate") - def concatenate(cls, batches): - batch = super(VlmCausalLMBatch, cls).concatenate(batches) - batch.pixel_values = None - batch.pixel_attention_mask = None - batch.image_sizes = None - return batch + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + is_warmup: bool = False, + ) -> "VlmCausalLMBatch": + + dbg_trace('FROM_PB', f'num_reqs:{len(pb.requests)}') + requests = [CausalLMRequest.from_pb(idx, req, tokenizer) for idx, req in enumerate(pb.requests)] - @tracer.start_as_current_span("filter") - def filter(self, request_ids: List[int]): - batch = super().filter(request_ids) - batch.pixel_values = None - batch.pixel_attention_mask = None - batch.image_sizes = None - return batch + max_input_length = max(r.data.truncate for r in requests) + max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests) + # TODO: Add support for sparse batches + top_n_tokens = [r.top_n_tokens for r in pb.requests] + top_n_tokens_tensor = torch.tensor(top_n_tokens, device=device, dtype=torch.int64) + + # TODO: by tokenizing all inputs at once we lose information on actual input lengths + # this means that we cannot shift inputs to the left after a long input sequence + # was filtered out + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) + parameters = [r.parameters for r in pb.requests] + # append dummy parameters for the padding (dummy) requests + parameters = pad_next_token_chooser_parameters(parameters, new_bs) + + next_token_chooser = 
HeterogeneousNextTokenChooser.from_pb( + pb=parameters, + dtype=dtype, + device=device, + tokenizer=tokenizer, + quantization_enabled=hq_env.is_quantization_enabled, + ) + tokenized_inputs = batch_tokenized_inputs + input_len = tokenized_inputs["input_ids"].shape[1] + + bucket_size = max_input_length + left_padding = max_input_length - input_len + if is_warmup is False: + if input_len < max_input_length : + rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) + if rounded_seq_len <= max_input_length: + bucket_size = rounded_seq_len - 1 + else: + bucket_size = max_input_length - 1 + left_padding = bucket_size - input_len + + input_ids = tokenized_inputs["input_ids"] + attention_mask = tokenized_inputs["attention_mask"] + # Allocate space for first token + if left_padding > 0: + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + all_input_ids = torch.nn.functional.pad( + input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id + ).T.split(1, dim=1) + + # New input length after left padding + input_len = bucket_size + for r in requests: + r.input_length = input_len + r.prefix_offset = input_len - 5 + r.read_offset = input_len + r.all_input_ids = all_input_ids[r.idx] + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + + htorch.core.mark_step() + + return cls( + batch_id=pb.id, + requests=requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_len, + ) @classmethod - def batch_tokenized_inputs(cls, requests, tokenizer, processor, config): + def batch_tokenized_inputs(cls, requests, tokenizer, processor, config, is_warmup): batch_inputs = [] image_inputs = [] max_truncation = 0 @@ -188,13 +304,28 @@ class VlmCausalLMBatch(FlashMistralBatch): image_inputs.append(image_input) else: raise RuntimeError(f"Invalid chunk type {chunk['type']}") - batch_inputs.append(full_text) max_truncation = max(max_truncation, r.truncate) + if is_warmup is False: + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) + missing_inputs = new_bs - len(requests) + dummy_images = [] + dummy_inputs = [] + if len(batch_inputs) > 0 and len(image_inputs) > 0: + dummy_inputs = [batch_inputs[0]] * missing_inputs + dummy_images = [image_inputs[0]] * missing_inputs + image_inputs += dummy_images + batch_inputs += dummy_inputs + batch_tokenized_inputs = tokenizer( - batch_inputs, truncation=True, max_length=max_truncation - )["input_ids"] + batch_inputs, + truncation=True, + max_length=max_truncation, + return_tensors="pt", + padding="longest", + return_token_type_ids=False, + ) if image_inputs: image_input = image_inputs[0] new_image_inputs = { @@ -224,9 +355,10 @@ class VlmCausalLMBatch(FlashMistralBatch): config, dtype: torch.dtype, device: torch.device, + is_warmup: bool = False, ) -> "VlmCausalLMBatch": batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( - pb.requests, tokenizer, processor, config + pb.requests, tokenizer, processor, config, is_warmup ) batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) if image_inputs is not None: @@ -247,127 +379,838 @@ 
class VlmCausalLMBatch(FlashMistralBatch): batch.image_sizes = None return batch + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate(cls, batches: List["CausalLMBatch"], pad_token_id: int = 0, is_warmup:bool = False) -> "CausalLMBatch": + return cls.recombine(batches, pad_token_id, is_warmup) + + + + @classmethod + def recombine(cls, batches: List["VlmCausalLMBatch"], pad_token_id: int, is_warmup: bool =False) -> "VlmCausalLMBatch": + if not all(b.past_key_values is not None for b in batches): + raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + + total_requests = sum(len(b) for b in batches) + new_bs = total_requests + if is_warmup is False : + new_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, total_requests) + batch_id = batches[0].batch_id + device = batches[0].input_ids.device + + input_lengths = [b.input_length for b in batches] + max_input_length = max(input_lengths) + offsets = [max_input_length - b.input_length for b in batches] + + cur_padding = [b.right_padding for b in batches] + # For prefill there is a space allocated only for first token + # Need to add padding to the max total tokens before first decode + + moves_needed = [total_requests - len(b) if b.batch_size == new_bs else total_requests for b in batches] + dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] + reshape = (batches[dst_batch_idx].batch_size < new_bs) + + # TODO: Add support for changing max seq len, i.e. due to output length bucketing + # FIXME: max_seq_len for non optimized code + if len(batches) > 1: + scenario = 'CONCAT' + elif reshape: + scenario = 'RESHAPE' + elif cur_padding[dst_batch_idx] <= 0: + scenario = 'SHIFT' + offsets = [biggest_single_chunk(b.max_input_length - max_input_length) for b in batches] + max_input_length = max_input_length + offsets[dst_batch_idx] + else: + # Nothing to do + return batches[0] + + dbg_trace( + scenario, f'bs:{[b.batch_size for b in batches]}->{new_bs}' + f' reqs:{[len(b) for b in batches]}' + f' offsets:{offsets}' + f' input_lengths:{input_lengths}' + f' cur_padding:{cur_padding}' + f' dst_batch:{dst_batch_idx}') + + grouped_requests = [[req for req in batch.requests] for batch in batches] + flat_requests = list(itertools.chain(*grouped_requests)) + + for i in range(len(batches)): + target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size + batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) + batches[i].realign(target_bs, offsets[i], pad_token_id) + batches[i].split_kv_cache_if_needed(i == dst_batch_idx) + batches[dst_batch_idx].expand_bs(new_bs) + batches[dst_batch_idx].move_data([batches[i] for i in range(len(batches)) if i != dst_batch_idx]) + + top_n_tokens = [r.data.top_n_tokens for r in flat_requests] + top_n_tokens_tensor = torch.tensor(top_n_tokens, device=device, dtype=torch.int64) + + parameters = [r.data.parameters for r in flat_requests] + # append the dummy parameters for dummy requests + batch_size = batches[dst_batch_idx].batch_size + parameters = pad_next_token_chooser_parameters(parameters, batch_size) + + # update past grammar states + fsm_grammar_states = [0] * batch_size + for batch in batches: + for i, req in enumerate(batch.requests): + fsm_grammar_states[req.idx] = batch.next_token_chooser.fsm_grammar_states[i] + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + parameters, + batches[dst_batch_idx].next_token_chooser.dtype, + batches[dst_batch_idx].next_token_chooser.device, + batches[dst_batch_idx].next_token_chooser.tokenizer, 
+ fsm_grammar_states, + quantization_enabled=hq_env.is_quantization_enabled, + ) + + input_ids = batches[dst_batch_idx].input_ids + attention_mask = batches[dst_batch_idx].attention_mask + position_ids = batches[dst_batch_idx].position_ids + past_key_values = batches[dst_batch_idx].past_key_values + input_length = max_input_length + + htorch.core.mark_step() + + return cls( + batch_id=batch_id, + requests=flat_requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_length, + ) + +class VlmCausalLM(Model): + def __init__( + self, + model_class, + model_id: str, + *, + processor_class=AutoProcessor, + processor_kwargs=None, + batch_class=VlmCausalLMBatch, + revision, + dtype, + trust_remote_code: bool, + **kwargs, + ): + adapt_transformers_to_gaudi() + if processor_kwargs is None: + processor_kwargs = {} + self.processor = processor_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + **processor_kwargs, + ) + self.batch_class = batch_class + self.prev_bs = 0 + + # Create tokenizer + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + # Create model + world_size = int(os.getenv("WORLD_SIZE", "1")) + rank = int(os.getenv("RANK", "0")) + dtype = torch.bfloat16 if dtype is None else dtype + device = torch.device("hpu") + + if hq_env.is_quantization_enabled: + htorch.core.hpu_set_env() + + if world_size > 1: + model = self.get_deepspeed_model( + model_class, model_id, dtype, revision + ) + model = self.prepare_model_for_quantization(model) + else: + get_repo_root(model_id) + + # Check support for rope scaling + model_kwargs = {} + config = AutoConfig.from_pretrained( + model_id + ) + if hasattr(config, "rope_scaling"): + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + model = model_class.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + trust_remote_code=trust_remote_code, + **model_kwargs + ) + model = self.prepare_model_for_quantization(model) + model = model.eval().to(device) + + self.enable_hpu_graph = os.getenv("ENABLE_HPU_GRAPH", "true").lower() == "true" and LAZY_MODE == 1 + self.limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "false").lower() == "true" + model = remove_kv_cache_from_output(model) + if self.enable_hpu_graph: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + model = wrap_in_hpu_graph(model, disable_tensor_cache=True) + else: + if LAZY_MODE == 0: + # It is said that "keep_input_mutations" is safe for inference to be done + dbg_trace( + "TORCH COMPILE", f'Torch compiling of model') + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + + model = self.setup_quantization(model) + + if model.config.model_type not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: + raise ValueError(f"Model type {model.config.model_type} is not supported!") + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None: + tokenizer.pad_token_id = model.config.eos_token_id + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + 
tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + kwargs = { + "use_cache": True, + "return_dict": True, + } + + if model.config.model_type in ["llama", "mistral"]: + kwargs["attn_softmax_bf16"] = True + kwargs["trim_logits"] = True + + if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true": + kwargs["use_flash_attention"] = True + if os.getenv("FLASH_ATTENTION_RECOMPUTE", "false").lower() == "true": + kwargs["flash_attention_recompute"] = True + + self.speculate = get_speculate() + super(VlmCausalLM, self).__init__( + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + kwargs=kwargs, + ) + + # Create profiler + ranks_to_profile = [int(val) for val in os.getenv("PROF_RANKS", "0").split(',')] + record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true" + output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile") + self.profiling_warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in ranks_to_profile else 0 + self.profiling_steps = int(os.getenv("PROF_STEP", "0")) if rank in ranks_to_profile else 0 + self.profiling_wait_steps = int(os.getenv("PROF_WAITSTEP", "0")) + if self.profiling_steps > 0: + self.hb_profiler = HabanaProfile( + wait=self.profiling_wait_steps, + warmup=self.profiling_warmup_steps, + active=self.profiling_steps, + output_dir=output_dir, + record_shapes=record_shapes + ) + self.hb_profiler.start() + else: + self.hb_profiler = None + self.step = 0 + -class VlmCausalLM(BaseFlashMistral): @property def batch_type(self) -> Type[VlmCausalLMBatch]: - return VlmCausalLMBatch + return self.batch_class + + def max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) + + def get_deepspeed_model( + self, + model_class, + model_id: str, + dtype: torch.dtype, + revision: Optional[str] = None + ) -> torch.nn.Module: + import deepspeed + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + + world_size, rank, local_rank = initialize_distributed_hpu() + model_kwargs = { + "revision": revision + } + + # Initialize process(es) for DeepSpeed + deepspeed.init_distributed(dist_backend="hccl") + logger.info( + "DeepSpeed is enabled. 
world_size {} rank {} local_rank {}".format(world_size, rank, local_rank) + ) + config = AutoConfig.from_pretrained(model_id, **model_kwargs) + load_to_meta = model_on_meta(config) + + # Check support for rope scaling + if hasattr(config, "rope_scaling"): + config.rope_scaling = self.get_rope_scaling() + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=dtype, device="meta"): + model = model_class.from_config(config, torch_dtype=dtype) + else: + get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK")) + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=dtype, device="cpu"): + model = model_class.from_pretrained(model_id, torch_dtype=dtype, **model_kwargs) + model = model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} + ds_inference_kwargs["enable_cuda_graph"] = False + + if load_to_meta: + # model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + write_checkpoints_json(model_id, local_rank, checkpoints_json) + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + model = deepspeed.init_inference(model, **ds_inference_kwargs) + + return model.module + + def get_rope_scaling(self) -> Optional[Dict]: + rope_scaling = os.getenv("ROPE_SCALING", None) + if rope_scaling is None: + return None + + rope_factor = float(os.getenv("ROPE_FACTOR", 1.0)) + return { + 'type': rope_scaling, 'factor': float(rope_factor) + } + + def setup_quantization(self, model): + if hq_env.is_quantization_enabled: + htorch.core.quantization._mark_params_as_const(model) + htorch.core.quantization._check_params_as_const(model) + htorch.core.hpu_initialize(model) + return model + + def prepare_model_for_quantization(self, model): + if hq_env.is_quantization_enabled: + if model.config.model_type == "llama": + self.patch_scoped_linear_all_reduce(model) + import habana_quantization_toolkit + habana_quantization_toolkit.prep_model(model) + return model + + def finish_quantization_measurements(self, model): + if hq_env.is_quantization_enabled: + import habana_quantization_toolkit + habana_quantization_toolkit.finish_measurements(self.model) + return model + + def patch_scoped_linear_all_reduce(self, model): + from deepspeed.module_inject.layers import LinearAllreduce + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + self.patch_scoped_linear_all_reduce(module) + + def decode(self, generated_ids: List[int]) -> str: + return self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: + if is_tokenizer_transparent(self.tokenizer): + new_text = self.tokenizer.decode(all_input_ids[read_offset:], skip_special_tokens=False) + return new_text, read_offset, len(all_input_ids) + else: + return super().decode_token(all_input_ids, prefix_offset, read_offset) def forward( - self, batch: VlmCausalLMBatch - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self, + input_ids, + attention_mask, + 
position_ids, + token_idx, + past_key_values: Optional[List[Tuple]] = None, + pixel_values: Optional[List[torch.Tensor]] = None, + image_sizes: Optional[List[Tuple[int, int]]] = None, + bypass_hpu_graph: Optional[bool] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: # Model Forward - if batch.speculative_ids is not None: - input_ids = batch.input_ids - position_ids = batch.position_ids - cu_seqlen_prefill = batch.cu_seqlen_prefill - kv_cache = get_cache_manager().kv_cache - block_tables = batch.block_tables_tensor - slots = batch.slots[batch.slot_indices] - input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen - lm_head_indices = batch.prefill_head_indices + kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "token_idx": token_idx, + "pixel_values": pixel_values, + "image_sizes": image_sizes + } - speculative_ids = batch.speculative_ids + hpu_kwargs = {} + # Optimum Habana got "lazy_mode" key-val only supported for llama type of models + if self.model.config.model_type == "llama" : + hpu_kwargs["lazy_mode"] = LAZY_MODE == 1 - B, speculative_length = speculative_ids.shape - new_length = speculative_length + 1 - new_input_ids = torch.cat( - [input_ids.unsqueeze(-1), speculative_ids], dim=1 - ).reshape(-1) - arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0) - arange_int = arange.to(dtype=torch.int32) - new_position_ids = ( - position_ids.unsqueeze(-1).expand(B, new_length) + arange - ).view(-1) - slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) - input_lengths = ( - input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int - ).view(-1) + if self.has_position_ids: + kwargs["position_ids"] = position_ids - # Add Copy the block tables for all members - block_tables = ( - block_tables.unsqueeze(1) - .expand(B, new_length, -1) - .reshape(B * new_length, -1) - .contiguous() - ) - max_s = max_s + speculative_length + if bypass_hpu_graph != None: + hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph - input_ids = new_input_ids - position_ids = new_position_ids + kwargs.update(self.kwargs) + model_inputs = self.model.prepare_inputs_for_generation(**kwargs) + if past_key_values is not None: + return self.model.forward(**model_inputs, **hpu_kwargs) else: - input_ids = batch.input_ids - position_ids = batch.position_ids - cu_seqlen_prefill = batch.cu_seqlen_prefill - kv_cache = get_cache_manager().kv_cache - block_tables = batch.block_tables_tensor - slots = batch.slots[batch.slot_indices] - input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen - lm_head_indices = batch.prefill_head_indices + outputs = self.model.forward(**model_inputs, **hpu_kwargs) + return outputs.logits, outputs.past_key_values - if cu_seqlen_prefill is None and self.max_past() is not None: - # In decode, not prefill, we're actually overwriting the KV-cache - # in a circular buffer mode. - # This makes sure the max_s for the decode pass is correct. - max_s = min(self.max_past(), max_s) + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batches: List[VlmCausalLMBatch], is_warmup: bool = False + ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + # Results + generations: List[Generation] = [] + prev_batches = [] + requests_to_generate = [] + # In order to pipeline any actions on CPU we perform the operation in 3 main stages: + # Stage 1. 
Collect next token ids of any previously started generations + for batch_id, batch in enumerate(batches): + if batch.logits is not None: + logits = batch.logits + past = batch.past + prefill = batch.past_key_values is None + if prefill: + # no right padding for prefill + token_idx_scalar = batch.attention_mask.shape[-1] - 1 + token_idx = torch.tensor(token_idx_scalar).to(self.device) + else: + token_idx_scalar = batch.attention_mask.shape[-1] - batch.right_padding + token_idx = torch.tensor(token_idx_scalar).to(self.device) - bs = input_ids.shape[0] - # Try to find an associated cuda graph - bs = input_ids.shape[0] - sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) - if sorted_padded_bs: - # Get associated cuda graph - cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] + # Select next token + input_length = batch.input_length + if logits.shape[-2] > 1: + next_token_ids, next_token_logprobs, logprobs, _, _ = batch.next_token_chooser( + batch.input_ids, logits[:, input_length - 1: input_length, :].squeeze(-2), self.speculate + ) + else: + next_token_ids, next_token_logprobs, logprobs, _, _ = batch.next_token_chooser( + batch.input_ids, logits.squeeze(-2), self.speculate + ) + # Speculation is not active for causal + accepted_ids = torch.ones_like(batch.input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + logprobs, + accepted_ids, + ) + + prev_batches.append({ + 'next_token_ids': next_token_ids, + 'next_token_logprobs': next_token_logprobs, + }) + + for req_idx, req in enumerate(batch.requests): + requests_to_generate.append({ + 'req': req, + 'prev_req_idx': req.idx, + 'batch_id': batch_id, + 'seed': batch.next_token_chooser.seeds[req_idx], + 'do_sample': batch.next_token_chooser.do_sample[req_idx], + 'top_n_tokens': batch.top_n_tokens[req_idx], + 'top_token_ids': batch_top_token_ids[req_idx], + 'top_token_logprobs': batch_top_token_logprobs[req_idx], + 'grammar_state': batch.next_token_chooser.fsm_grammar_states[req.idx], + }) + + htorch.core.mark_step() + + # Add new token into input_ids + batch.input_ids.index_copy_(1, token_idx, next_token_ids.unsqueeze(1)) + + # Update attention_mask as we added a new token to input_ids + batch.attention_mask.index_fill_(1, token_idx, 1) + + # Adjust lengths + batch.input_length += 1 + + # Update position_ids + if prefill: + batch.position_ids = torch.index_select(batch.position_ids, 1, token_idx - 1) + 1 + else: + batch.position_ids += 1 + # Update past key values + if prefill: + batch.past_key_values = past + + htorch.core.mark_step() + + # Stage 2. 
Prepare new batch for speculative scheduling + if len(batches) > 1: + batch = self.batch_type.concatenate(batches, self.tokenizer.pad_token_id, is_warmup) else: - cuda_graph = None - if cu_seqlen_prefill is not None or cuda_graph is None: - logits, speculative_logits = self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - input_lengths=input_lengths, - max_s=max_s, - prefill_cache_indices=batch.prefill_cache_indices, - lm_head_indices=lm_head_indices, - pixel_values=batch.pixel_values, - pixel_attention_mask=batch.pixel_attention_mask, - image_sizes=batch.image_sizes, + batch = batches[0] + + prefill = batch.past_key_values is None + + # Check if we need to do any bookkeeping first + if not prefill: + batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id, is_warmup) + + scenario = 'PREFILL' if prefill else 'GENERATE' + if self.enable_hpu_graph and self.limit_hpu_graph and round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) != self.prev_bs: + self.model.clear_cache() + self.prev_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) + dbg_trace( + scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}') + #assert batch.right_padding > 0, 'No more room for next token!' + + # Execute batch + if prefill: + # no right padding for prefill + token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + batch.logits, batch.past = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + batch.pixel_values, + batch.image_sizes, + bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None, + ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): + # Don't schedule next forward if max_new_tokens for all requests equals 1 + # - we've already generated the first and only needed token in the prefill phase + pass + else: + token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device) + batch.logits = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None, ) - if batch.prefill_cache_indices is not None: - batch.prefill_cache_indices = None - if batch.pixel_values is not None: - batch.pixel_values = None - if batch.pixel_attention_mask is not None: - batch.pixel_attention_mask = None - if batch.image_sizes is not None: - batch.image_sizes = None - return logits, speculative_logits - # Copy inputs to the static inputs of the cuda graph - # Static inputs are potentially padded - cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids - cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - cuda_graph["slots"].fill_(-1) - cuda_graph["slots"][: slots.shape[0]] = slots - cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + htorch.core.mark_step() - # Replay the graph - cuda_graph["graph"].replay() + start_decode = time.time_ns() - # Slice output to the correct shape - speculative_logits = ( - cuda_graph["speculative_logits"][:bs] - if cuda_graph["speculative_logits"] is not None - else None + # Stage 3. 
Finish and return previous generations + stopped = len(requests_to_generate) > 0 + for prev_batch in prev_batches: + prev_batch['next_token_logprobs'] = prev_batch['next_token_logprobs'].tolist() + prev_batch['next_token_ids_cpu'] = prev_batch['next_token_ids'].cpu() + htorch.core.mark_step() + + for req_data in requests_to_generate: + req = req_data['req'] + i = req_data['prev_req_idx'] + prev_batch_id = req_data['batch_id'] + assert len(prev_batches) > prev_batch_id + next_token_ids_cpu = prev_batches[prev_batch_id]['next_token_ids_cpu'] + next_token_logprobs = prev_batches[prev_batch_id]['next_token_logprobs'] + + request = req.data + input_length = req.input_length + prefix_offset = req.prefix_offset + read_offset = req.read_offset + do_sample = req_data['do_sample'] + seed = req_data['seed'] + stopping_criteria = req.stopping_criteria + all_input_ids = req.all_input_ids + next_token_id = next_token_ids_cpu[i] + next_token_logprob = next_token_logprobs[i] + top_n_tokens = req_data['top_n_tokens'] + top_token_ids = req_data['top_token_ids'] + top_token_logprobs = req_data['top_token_logprobs'] + grammar_state = req_data['grammar_state'] + + # Append next token to all tokens + all_input_ids[input_length] = next_token_id + new_input_length = input_length + 1 + + # Generated token + if is_tokenizer_transparent(self.tokenizer) and len(stopping_criteria.stop_sequence_criterias) == 0: + next_token_text = '' + else: + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[0:new_input_length, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + if is_tokenizer_transparent(self.tokenizer): + output_text = None + else: + output_text = self.decode( + all_input_ids[new_input_length - stopping_criteria.current_tokens: new_input_length, 0] + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + next_token_logprobs + prefill_token_ids = all_input_ids[0: new_input_length - 1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id], + [next_token_logprob], + [next_token_text], + [next_token_id in self.all_special_ids], 
+ ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single_with_past_state( + req.idx, next_token_id, grammar_state + ) + ) + + req.all_input_ids = all_input_ids + req.input_length = new_input_length + req.prefix_offset = prefix_offset + req.read_offset = read_offset + + htorch.core.mark_step() + self.step = self.step + 1 + if self.hb_profiler is not None: + if self.step > self.profiling_wait_steps + self.profiling_warmup_steps + self.profiling_steps: + self.hb_profiler.stop() + else: + self.hb_profiler.step() + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch if not stopped else None, (forward_ns, decode_ns) + + def batch_from_pb(self, batch, is_warmup): + return VlmCausalLMBatch.from_pb_processor( + batch, + self.tokenizer, + self.processor, + self.model.config, + self.dtype, + self.device, + is_warmup ) - logits = cuda_graph["logits"][:bs] - return logits, speculative_logits + + def generate_warmup_batch(self, request, seq_len, batch_size, is_warmup): + batch = copy.deepcopy(request.batches[0]) + for req in batch.requests: + req.truncate = seq_len + + for i in range(len(batch.requests) - batch_size): + batch.requests.pop() + + return self.batch_from_pb(batch, is_warmup) + + def warmup(self, request) -> None: + is_warmup = True + batches = [self.batch_from_pb(batch, is_warmup) for batch in request.batches] + + try: + # max prefill batch size warmup + _, prefill_batch, _ = self.generate_token([batches[0]], is_warmup) + except: + raise RuntimeError( + f"Not enough memory to handle {len(batches[0].input_ids)} prefill tokens. " + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + self.model.clear_inputs() + global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + max_input_length = batches[0].input_ids.shape[1] + max_prefill_batch_size = batches[0].input_ids.shape[0] + PREFILL_WARMUP_BATCH_SIZE_LIST = [] + batch_size = 1 + while batch_size <= max_prefill_batch_size: + PREFILL_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + if PREFILL_WARMUP_BATCH_SIZE_LIST[-1] < max_prefill_batch_size : + PREFILL_WARMUP_BATCH_SIZE_LIST.append(max_prefill_batch_size) + + seq_len = BASE_IMAGE_TOKENS + PREFILL_WARMUP_SEQLEN_LIST = [] + i = 0 + while seq_len <= max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(seq_len) + seq_len += PAD_SEQUENCE_TO_MULTIPLE_OF*(2**i) + i += 1 + if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(max_input_length) + + #Prefill and decode warmup + DECODE_WARMUP_BATCH_SIZE_LIST = [] + prefill_batch = None + decode_batch = None + try: + for batch_size in PREFILL_WARMUP_BATCH_SIZE_LIST : + for seq_len in PREFILL_WARMUP_SEQLEN_LIST : + batch = self.generate_warmup_batch(request, seq_len, batch_size, is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + _, decode_batch, _ = self.generate_token([prefill_batch], is_warmup) + + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + + except: + raise RuntimeError( + f"Not enough memory to handle following prefill and decode warmup." 
+ f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing prefill and decode warmup successfully.\n" + f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}\n" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats} " + ) + + self.model.clear_inputs() + max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + batch_size = max_prefill_batch_size * 2 + # Decode warmup with bigger batch_size + try: + if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size and batch_size <= max_decode_batch_size: + batches = [] + for i in range(int(batch_size/max_prefill_batch_size)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], DECODE_WARMUP_BATCH_SIZE_LIST[-1], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + while batch_size <= max_decode_batch_size: + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + batches.clear() + + for i in range(int(batch_size/max_prefill_batch_size)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], DECODE_WARMUP_BATCH_SIZE_LIST[-1], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + + batches.clear() + if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size: + max_decode_batch_size = math.floor( max_decode_batch_size / 2) * 2 + batch_size = max_decode_batch_size + for i in range(int(max_decode_batch_size / 2)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], 2, is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) + max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS + MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens + except : + raise RuntimeError( + f"Not enough memory to handle batch_size({batch_size}) decode warmup." 
+ f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"max_decode_batch_size is {max_decode_batch_size}" + f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing decode warmup successfully.\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats}" + ) + + self.model.clear_inputs() + return MAX_BATCH_TOTAL_TOKENS \ No newline at end of file diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index b8905d71..a4dc4a4c 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -19,6 +19,19 @@ from text_generation_server.interceptor import ExceptionInterceptor from text_generation_server.models import Model, get_model from text_generation_server.pb import generate_pb2_grpc, generate_pb2 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor +from text_generation_server.models.globals import set_model_id + +try: + from text_generation_server.models.pali_gemma import PaliGemmaBatch + from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLMBatch, + ) + from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch + + VLM_BATCH_TYPES = {PaliGemmaBatch, VlmCausalLMBatch, IdeficsCausalLMBatch} +except (ImportError, NotImplementedError): + # These imports can fail on CPU/Non flash. + VLM_BATCH_TYPES = set() from text_generation_server.utils.version import is_driver_compatible, MIN_TGI_GAUDI_SYNAPSE_VERSION @@ -34,9 +47,6 @@ class SignalHandler: self.KEEP_PROCESSING = False -signal_handler = SignalHandler() - - class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): def __init__( self, @@ -88,16 +98,33 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): batch, self.model.tokenizer, self.model.dtype, self.model.device ) - batches = [batch_from_pb(batch) for batch in request.batches] - self.model.warmup(batches) + if self.model.batch_type in VLM_BATCH_TYPES : + max_supported_total_tokens = self.model.warmup(request) + return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) + else: + batches = [batch_from_pb(batch) for batch in request.batches] + self.model.warmup(batches) + return generate_pb2.WarmupResponse() - return generate_pb2.WarmupResponse() async def Prefill(self, request, context): start = time.time_ns() - batch = self.model.batch_type.from_pb( - request.batch, self.model.tokenizer, self.model.dtype, self.model.device - ) + if ( + self.model.batch_type in VLM_BATCH_TYPES + ): # Hack, i would rather use kwargs in the `from_pb` call + batch = self.model.batch_type.from_pb_processor( + request.batch, + self.model.tokenizer, + self.model.processor, + self.model.model.config, + self.model.dtype, + self.model.device, + ) + else: + batch = self.model.batch_type.from_pb( + request.batch, self.model.tokenizer, self.model.dtype, self.model.device + ) + generations, next_batch, timings = self.model.generate_token([batch]) self.cache.set(next_batch) @@ -147,7 +174,7 @@ def serve( uds_path: Path, ): # Remove default handler - logger.remove() + #logger.remove() logger.add( sys.stdout, format="{message}", @@ -221,10 +248,11 @@ def serve( await server.start() logger.info("Server started at {}".format(local_url)) - + signal_handler = SignalHandler() while signal_handler.KEEP_PROCESSING: await asyncio.sleep(0.5) + 
set_model_id(model_id) asyncio.run( serve_inner( model_id, revision, sharded, speculate, dtype, trust_remote_code diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 583a8f91..4f5cf10b 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -4,22 +4,21 @@ import torch from loguru import logger import math -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM != "xpu": + from text_generation_server.utils.flash_attn_triton import triton_attention if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false": raise ImportError("`USE_FLASH_ATTENTION` is false.") -HAS_FLASH_ATTN = True +HAS_FLASH_ATTN = False HAS_FLASH_ATTN_V2_CUDA = False HAS_FLASH_ATTN_V2_ROCM = False +ROCM_USE_FLASH_ATTN_V2_CK = False +ROCM_USE_FLASH_ATTN_V2_TRITON = False -if IS_XPU_SYSTEM: - import intel_extension_for_pytorch as ipex -if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: +if SYSTEM in {"cuda", "rocm"}: if not torch.cuda.is_available(): raise ImportError("CUDA is not available") @@ -27,31 +26,43 @@ if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: is_sm75 = major == 7 and minor == 5 is_sm8x = major == 8 and minor >= 0 is_sm90 = major == 9 and minor == 0 + is_sm94 = major == 9 and minor == 4 + + if SYSTEM == "rocm": + if ( + os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() == "true" + or os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "0") == "1" + ): + ROCM_USE_FLASH_ATTN_V2_TRITON = True + logger.info("ROCm: using Flash Attention 2 Triton implementation.") + else: + ROCM_USE_FLASH_ATTN_V2_CK = True + logger.info( + "ROCm: using Flash Attention 2 Composable Kernel implementation." 
+ ) - HAS_FLASH_ATTN = False - HAS_FLASH_ATTN_V2_CUDA = False - HAS_FLASH_ATTN_V2_ROCM = False try: try: import flash_attn_2_cuda except ImportError: - architecture_suffix = "" - if IS_CUDA_SYSTEM: - architecture_suffix = "-cuda" - elif IS_ROCM_SYSTEM: - architecture_suffix = "-rocm" + architecture_suffix = f"-{SYSTEM}" raise ImportError( "Flash Attention V2 is not installed.\n" "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" ) - if not (is_sm8x or is_sm90): + if SYSTEM == "cuda" and not (is_sm8x or is_sm90): raise ImportError( f"GPU with CUDA capability {major} {minor} is not supported for " "Flash Attention V2" ) - HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM - HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM + elif SYSTEM == "rocm" and not (is_sm8x or is_sm90 or is_sm94): + raise ImportError( + f"AMD GPU with compute capability {major} {minor} is not supported for " + "Flash Attention V2" + ) + HAS_FLASH_ATTN_V2_CUDA = SYSTEM == "cuda" + HAS_FLASH_ATTN_V2_ROCM = SYSTEM == "rocm" except ImportError as e: try: import flash_attn_cuda @@ -62,11 +73,11 @@ if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: "or install flash attention with `cd server && make install install-flash-attention`" ) from e - if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90): + if SYSTEM == "cuda" and not (is_sm75 or is_sm8x or is_sm90): raise ImportError( f"GPU with CUDA capability {major} {minor} is not supported" ) from e - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": for idx in range(torch.cuda.device_count()): if "MI210" not in torch.cuda.get_device_name( idx @@ -78,21 +89,22 @@ if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: logger.warning(f"Unable to use Flash Attention V2: {e}") HAS_FLASH_ATTN = True +if SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex -def attention( - q, - k, - v, - out, - cu_seqlens, - max_s, - softmax_scale, - window_size_left=-1, -): - if window_size_left <= 0 and window_size_left != -1: - raise ValueError("`window_size_left` must be > 0 or -1") + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") - if IS_XPU_SYSTEM: if window_size_left != -1: raise ValueError( f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." 
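For orientation: after this change flash_attn.py defines a single `attention` function at import time, chosen by `SYSTEM` (xpu/cuda/rocm) and by which flash-attention backend imported successfully, with `ROCM_USE_FLASH_ATTN_V2_TRITON=1` (or `true`) switching ROCm from the Composable Kernel path to the Triton path; the remaining hunks below add the per-backend definitions. A toy sketch of the pattern (editor's illustration only; `DEMO_SYSTEM` is a hypothetical stand-in, not the real `import_utils.SYSTEM`):

import os

# Pick exactly one implementation at import time, as flash_attn.py now does.
SYSTEM = os.getenv("DEMO_SYSTEM", "cuda")  # hypothetical stand-in for SYSTEM
USE_TRITON = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "0").lower() in {"1", "true"}

if SYSTEM == "xpu":
    def attention(q, k, v):
        return "xpu path (no window attention)"
elif SYSTEM == "rocm" and USE_TRITON:
    def attention(q, k, v):
        return "rocm Triton flash-attention v2 path"
elif SYSTEM in {"cuda", "rocm"}:
    def attention(q, k, v):
        return f"{SYSTEM} flash-attention v2 path"
else:
    raise NotImplementedError("flash attention is not installed")

print(attention(None, None, None))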
@@ -114,7 +126,21 @@ def attention( None, ) - if HAS_FLASH_ATTN_V2_CUDA: +elif HAS_FLASH_ATTN_V2_CUDA: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + causal=True, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") return flash_attn_2_cuda.varlen_fwd( q, k, @@ -130,13 +156,28 @@ def attention( 0.0, softmax_scale, False, - True, + causal, window_size_left, 0, False, None, ) - elif HAS_FLASH_ATTN_V2_ROCM: + +elif HAS_FLASH_ATTN_V2_ROCM and ROCM_USE_FLASH_ATTN_V2_CK: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + causal=True, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") if window_size_left != -1: raise ValueError( f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." @@ -155,11 +196,50 @@ def attention( 0.0, softmax_scale, False, - True, + causal, False, None, ) - elif HAS_FLASH_ATTN: + +elif HAS_FLASH_ATTN_V2_ROCM and ROCM_USE_FLASH_ATTN_V2_TRITON: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + causal=True, + ): + output, _ = triton_attention( + q, + k, + v, + out, + cu_seqlens, + cu_seqlens, + max_s, + max_s, + causal, + softmax_scale, + ) + return output + +elif HAS_FLASH_ATTN: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): if window_size_left != -1: raise NotImplementedError( "window_size_left is only available with flash attn v2" @@ -209,4 +289,5 @@ def attention( None, ) +else: raise NotImplementedError("flash attention is not installed") diff --git a/server/text_generation_server/utils/flash_attn_triton.py b/server/text_generation_server/utils/flash_attn_triton.py new file mode 100644 index 00000000..3fe32231 --- /dev/null +++ b/server/text_generation_server/utils/flash_attn_triton.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
+ +Not currently supported: + +1) Non power of two head dims + +""" + +import torch +import triton +import triton.language as tl + +torch_dtype: tl.constexpr = torch.float16 + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + + +@triton.jit +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets( + philox_seed, philox_offset, dropout_p, m, n, stride + ).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) + rng_keep = rng_output > dropout_p + return rng_keep + + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor + + +@triton.jit +def _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + actual_seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + OFFS_M: tl.constexpr, + OFFS_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr, +): + # loop over k, v, and update accumulator + for start_n in range(block_min, block_max, BLOCK_N): + # For padded blocks, we will overrun the tensor size if + # we load all BLOCK_N. For others, the blocks are all within range. + k = load_fn( + K_block_ptr, + PADDED_HEAD, + MASK_STEPS and (n_extra_tokens != 0), + "zero", + ) + if PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + # We start from end of seqlen_k so only the first iteration would need + # to be checked for padding if it is not a multiple of block_n + # TODO: This can be optimized to only be true for the padded block. + if MASK_STEPS: # noqa: SIM102 + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps + # if not is_modulo_mn. last step might get wasted but that is okay. + # check if this masking works for that case. 
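+        # (Editor's note) The branch below fires only when MASK_STEPS is set, on the
+        # final K/V tile, and only if seqlen_k is not a multiple of BLOCK_N: columns
+        # at or beyond actual_seqlen_k are pre-seeded with -inf in qk before the dot
+        # product, so after the base-2 softmax they contribute exactly zero.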
+ if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None, :] + mask = size_n < boundary_m[:, None] + qk = tl.where(mask, qk, float("-inf")) + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + qk = tl.where(causal_mask, qk, float("-inf")) + # -- compute qk ---- + qk += tl.dot(q, k) + if bias_ptr is not None: + bias = load_fn( + bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero" + ) + # While bias is added after multiplying qk with sm_scale, our + # optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += bias * 1.44269504089 + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + philox_offset = ( + batch_philox_offset + + start_m * BLOCK_M * actual_seqlen_k + + start_n + - BLOCK_N + ) + keep = dropout_mask( + philox_seed, + philox_offset, + dropout_p, + BLOCK_M, + BLOCK_N, + actual_seqlen_k, + ) + if RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty), + ) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + p.to(encoded_softmax_block_ptr.type.element_ty), + ) + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + if not PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, BLOCK_N) + ) + return acc, l_i, m_i + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 64, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": True, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 32, + "BLOCK_N": 32, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + # TODO: This config fails with head_size not pow2 with data mismatches. 
+ # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config( + { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + ], + key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"], +) +@triton.jit +def attn_fwd( + Q, + K, + V, + bias, + sm_scale, + L, + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + stride_bz, + stride_bh, + stride_bm, + stride_bn, + cu_seqlens_q, + cu_seqlens_k, + dropout_p, + philox_seed, + philox_offset_base, + encoded_softmax, + HQ: tl.constexpr, + HK: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, + MAX_SEQLENS_Q: tl.constexpr, + MAX_SEQLENS_K: tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, +): + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. + if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N + ) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. 
+ if n_blocks <= 0: + o_offset = ( + off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + ) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. + # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return + + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + if GROUP_SIZE != 1: + off_h_k = off_h_q // GROUP_SIZE + else: + off_h_k = off_h_q + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL + + # Compute pointers for all the tensors used in this kernel. + q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k + ) + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
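+    # (Editor's note) encoded_softmax is a debugging output: when
+    # RETURN_ENCODED_SOFTMAX is set, the inner loop stores the attention
+    # probabilities (negated at dropped positions when dropout is enabled) so the
+    # host can inspect the effective attention/dropout pattern; otherwise the
+    # block pointer is left as the dummy value 0.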
+ if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero") + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. Similarly we do not need padding. + if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, + block_max, + 0, + 0, + 0, + bias_ptr, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, n_full_blocks) + ) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + # epilogue + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full( + (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32 + ) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. + # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + # else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. 
+ tl.store(O_block_ptr, acc, boundary_check=(0, 1)) + + +def check_args( + q, + k, + v, + o, + varlen=True, + max_seqlens=None, + cu_seqlens_q=None, + cu_seqlens_k=None, +): + assert q.dim() == k.dim() and q.dim() == v.dim() + if varlen: + assert q.dim() == 3 + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + assert cu_seqlens_q is not None + assert cu_seqlens_k is not None + assert len(cu_seqlens_q) == len(cu_seqlens_k) + else: + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + # TODO: Fix assert to check head size <=256 once supported + assert head_size <= 128 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 + + +class _attention(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + q, + k, + v, + o, + cu_seqlens_q, + cu_seqlens_k, + max_seqlens_q, + max_seqlens_k, + causal=False, + sm_scale=1.0, + bias=None, + ): + if o is None: + o = torch.empty_like(q, dtype=v.dtype) + + check_args( + q, + k, + v, + o, + varlen=True, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + ) + if True: # varlen + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + batch = len(cu_seqlens_q) - 1 + q_strides = (0, q.stride(1), q.stride(0), q.stride(2)) + k_strides = (0, k.stride(1), k.stride(0), k.stride(2)) + v_strides = (0, v.stride(1), v.stride(0), v.stride(2)) + o_strides = (0, o.stride(1), o.stride(0), o.stride(2)) + else: + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) + + # Get closest power of 2 over or equal to 32. + padded_d_model = 1 << (head_size - 1).bit_length() + padded_d_model = max(padded_d_model, 16) + + grid = lambda META: ( + triton.cdiv(max_seqlens_q, META["BLOCK_M"]), + nheads_q, + batch, + ) + + encoded_softmax = None + + # Seed the RNG so we get reproducible results for testing. 
+ philox_seed = 0x1BF52 + philox_offset = 0x1D4B42 + + if bias is not None: + bias_strides = ( + bias.stride(0), + bias.stride(1), + bias.stride(2), + bias.stride(3), + ) + else: + bias_strides = (0, 0, 0, 0) + + attn_fwd[grid]( + q, + k, + v, + bias, + sm_scale, + None, + o, + *q_strides, + *k_strides, + *v_strides, + *o_strides, + *bias_strides, + cu_seqlens_q, + cu_seqlens_k, + dropout_p=0.0, + philox_seed=philox_seed, + philox_offset_base=philox_offset, + encoded_softmax=encoded_softmax, + HQ=nheads_q, + HK=nheads_k, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=max_seqlens_q, + MAX_SEQLENS_K=max_seqlens_k, + IS_CAUSAL=causal, + VARLEN=True, + BLOCK_DMODEL=padded_d_model, + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False, + ) + + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + + +triton_attention = _attention.apply diff --git a/server/text_generation_server/utils/hub.py b/server/text_generation_server/utils/hub.py index a81e659d..b56484f6 100644 --- a/server/text_generation_server/utils/hub.py +++ b/server/text_generation_server/utils/hub.py @@ -40,7 +40,6 @@ def _weight_hub_files_from_model_info( and "arguments" not in s.rfilename and "args" not in s.rfilename and "training" not in s.rfilename - and "medusa_lm_head" not in s.rfilename ] @@ -57,7 +56,6 @@ def _weight_files_from_dir(d: Path, extension: str) -> List[str]: and "args" not in f and "adapter" not in f and "training" not in f - and "medusa_lm_head" not in f ] return filenames diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py index db205f4d..40e57646 100644 --- a/server/text_generation_server/utils/import_utils.py +++ b/server/text_generation_server/utils/import_utils.py @@ -10,6 +10,41 @@ def is_xpu_available(): return hasattr(torch, "xpu") and torch.xpu.is_available() -IS_ROCM_SYSTEM = torch.version.hip is not None -IS_CUDA_SYSTEM = torch.version.cuda is not None -IS_XPU_SYSTEM = is_xpu_available() +def get_cuda_free_memory(device, memory_fraction): + total_free_memory, _ = torch.cuda.mem_get_info(device) + total_gpu_memory = torch.cuda.get_device_properties(device).total_memory + free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory) + return free_memory + + +def get_xpu_free_memory(device, memory_fraction): + total_gpu_memory = torch.xpu.get_device_properties(device).total_memory + free_memory = int(total_gpu_memory * 0.5) + return free_memory + + +SYSTEM = None +if torch.version.hip is not None: + SYSTEM = "rocm" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif torch.version.cuda is not None and torch.cuda.is_available(): + SYSTEM = "cuda" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif is_xpu_available(): + SYSTEM = "xpu" + empty_cache = torch.xpu.empty_cache + synchronize = torch.xpu.synchronize + get_free_memory = get_xpu_free_memory +else: + SYSTEM = "cpu" + + def noop(*args, **kwargs): + pass + + empty_cache = noop + synchronize = noop + get_free_memory = noop diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py deleted file mode 
100644 index 6e4a13cd..00000000 --- a/server/text_generation_server/utils/layers.py +++ /dev/null @@ -1,1284 +0,0 @@ -import os -import torch -import torch.distributed - -from torch import nn -from torch.nn import functional as F -from typing import List, Tuple, Optional -from loguru import logger -from functools import lru_cache - -from text_generation_server.utils.speculate import get_speculate - -HAS_BITS_AND_BYTES = True -try: - import bitsandbytes as bnb - from bitsandbytes.nn import Int8Params, Params4bit -except ImportError: - HAS_BITS_AND_BYTES = False - -from accelerate import init_empty_weights - -from text_generation_server.utils.gptq.quant_linear import QuantLinear -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) - -if IS_XPU_SYSTEM: - import intel_extension_for_pytorch as ipex - -HAS_AWQ = True -try: - from text_generation_server.utils.awq.quantize.qmodule import WQLinear -except ImportError: - HAS_AWQ = False - -try: - major, _minor = torch.cuda.get_device_capability() -except Exception: - major = 1 - -HAS_EXLLAMA = False -CAN_EXLLAMA = major >= 8 or IS_ROCM_SYSTEM -V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" - -if os.getenv("DISABLE_EXLLAMA") == "True": - HAS_EXLLAMA = False -elif CAN_EXLLAMA: - try: - if V2: - from text_generation_server.utils.gptq.exllamav2 import ( - QuantLinear as ExllamaQuantLinear, - create_exllama_buffers, - set_device, - ) - - HAS_EXLLAMA = "2" - else: - from text_generation_server.utils.gptq.exllama import ( - Ex4bitLinear as ExllamaQuantLinear, - create_exllama_buffers, - set_device, - ) - - HAS_EXLLAMA = "1" - - except ImportError: - pass - -HAS_EETQ = False -try: - from EETQ import quant_weights, w8_a16_gemm - - HAS_EETQ = True -except ImportError: - pass - - -# Monkey patching -@classmethod -def load_layer_norm(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = nn.Parameter(bias) - return ln - - -@classmethod -def load_layer_norm_no_bias(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = None - return ln - - -@classmethod -def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = nn.Parameter(bias) - return conv2d - - -@classmethod -def load_conv2d_no_bias( - cls, prefix, weights, in_channels, out_channels, kernel_size, stride -): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = None - return conv2d - - -torch.nn.Conv2d.load = load_conv2d -torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias -torch.nn.LayerNorm.load = load_layer_norm -torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias - - -class FastLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.weight = nn.Parameter(weight) - 
if bias is not None: - self.bias = nn.Parameter(bias) - else: - self.bias = None - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_tensor(f"{prefix}.weight") - if bias: - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls(weight, bias) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, self.weight, self.bias) - - -class EETQLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - device = weight.device - if weight.dtype != torch.float16: - weight = weight.to(dtype=torch.float16) - weight = torch.t(weight).contiguous().cpu() - weight, scale = quant_weights(weight, torch.int8, False) - - self.weight = weight.cuda(device) - self.scale = scale.cuda(device) - self.bias = bias.cuda(device) if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - output = w8_a16_gemm(input, self.weight, self.scale) - output = output + self.bias if self.bias is not None else output - return output - - -def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): - device = weight.device - # weight, scale = quant_weights(weight, torch.int8, False) - finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = finfo.max / weight.abs().max().clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() - return qweight, scale - - -class Fp8Linear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.dtype = weight.dtype - self.qweight, self.scale = fp8_quantize(weight) - - self.bias = bias if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - qinput, scale = fp8_quantize(input) - output, _ = torch._scaled_mm( - qinput, - self.qweight.t(), - out_dtype=self.dtype, - scale_a=scale, - scale_b=self.scale, - bias=self.bias, - ) - return output - - -class Linear8bitLt(nn.Module): - def __init__( - self, - weight, - bias, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - ): - super().__init__() - assert ( - not memory_efficient_backward - ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" - self.state = bnb.MatmulLtState() - self.index = index - - # Necessary for stacked layers - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params( - weight.data, - has_fp16_weights=has_fp16_weights, - requires_grad=has_fp16_weights, - ) - self.weight.cuda(weight.device) - self.bias = bias - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x: torch.Tensor): - self.state.is_training = self.training - if self.weight.CB is not None: - self.init_8bit_state() - - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - 
self.bias.data = self.bias.data.to(x.dtype) - - out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) - - if not self.state.has_fp16_weights: - if self.state.CB is not None and self.state.CxB is not None: - # we converted 8-bit row major to turing/ampere format in the first inference pass - # we no longer need the row-major weight - del self.state.CB - self.weight.data = self.state.CxB - return out - - -class Linear4bit(nn.Module): - def __init__(self, weight, bias, quant_type): - super().__init__() - self.weight = Params4bit( - weight.data, - requires_grad=False, - compress_statistics=True, - quant_type=quant_type, - ) - self.compute_dtype = None - self.weight.cuda(weight.device) - self.bias = bias - - def forward(self, x: torch.Tensor): - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - self.bias.data = self.bias.data.to(x.dtype) - - if getattr(self.weight, "quant_state", None) is None: - print( - "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." - ) - inp_dtype = x.dtype - if self.compute_dtype is not None: - x = x.to(self.compute_dtype) - - bias = None if self.bias is None else self.bias.to(self.compute_dtype) - out = bnb.matmul_4bit( - x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state - ) - - out = out.to(inp_dtype) - - return out - - -@lru_cache(1) -def warn_deprecate_bnb(): - logger.warning( - "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce" - ) - - -def get_linear(weight, bias, quantize): - if quantize is None: - linear = FastLinear(weight, bias) - elif quantize == "eetq": - if HAS_EETQ: - linear = EETQLinear(weight, bias) - else: - raise ImportError( - "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" - ) - elif quantize == "fp8": - linear = Fp8Linear(weight, bias) - elif quantize == "bitsandbytes": - warn_deprecate_bnb() - linear = Linear8bitLt( - weight, - bias, - has_fp16_weights=False, - threshold=6.0, - ) - if bias is not None: - linear.bias = nn.Parameter(bias) - elif quantize == "bitsandbytes-fp4": - linear = Linear4bit( - weight, - bias, - quant_type="fp4", - ) - elif quantize == "bitsandbytes-nf4": - linear = Linear4bit( - weight, - bias, - quant_type="nf4", - ) - elif quantize == "gptq": - try: - qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `gptq` compatible, loader needs to be updated." - ) - - if use_exllama: - linear = ExllamaQuantLinear( - qweight, qzeros, scales, g_idx, bias, bits, groupsize - ) - else: - linear = QuantLinear( - qweight, - qzeros, - scales, - g_idx, - bias, - bits, - groupsize, - ) - elif quantize == "awq": - try: - qweight, qzeros, scales, _, bits, groupsize, _ = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `awq` compatible, loader needs to be updated." - ) - if IS_ROCM_SYSTEM: - raise NotImplementedError( - "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " - "to use Exllama/GPTQ kernels for AWQ inference." 
- ) - if not HAS_AWQ: - raise NotImplementedError( - "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" - ) - linear = WQLinear( - w_bit=bits, - group_size=groupsize, - qweight=qweight, - qzeros=qzeros, - scales=scales, - bias=bias is not None, - ) - else: - raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") - return linear - - -class SuperLayer(nn.Module): - def __init__(self, linear): - super().__init__() - self.linear = linear - - def forward(self, x): - return self.linear.forward(x) - - -class ResBlock(torch.nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.linear = FastLinear.load( - config, prefix=f"{prefix}.linear", weights=weights, bias=True - ) - self.act = torch.nn.SiLU() - - def forward(self, x): - return x + self.act(self.linear(x)) - - -class MedusaModel(torch.nn.Module): - def __init__(self, config, medusa_config, weights): - super().__init__() - self.heads = torch.nn.ModuleList( - [ - MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) - for i in range(get_speculate()) - ] - ) - - def forward(self, x): - speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) - return speculative_logits - - -class MedusaHead(torch.nn.Module): - def __init__(self, config, medusa_config, prefix, weights): - super().__init__() - self.blocks = torch.nn.ModuleList( - [ - ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) - for i in range(medusa_config["medusa_num_layers"]) - ] - ) - n = len(self.blocks) - self.out = FastLinear.load( - config, prefix=f"{prefix}.{n}", weights=weights, bias=False - ) - - def forward(self, x): - for block in self.blocks: - x = block(x) - x = self.out(x) - return x - - -class MedusaHeadV1(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.lm_head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - medusa = MedusaModel(config, medusa_config, weights) - lm_head = TensorParallelHead.load(config, prefix, weights) - return MedusaHeadV1(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - logits = self.lm_head(input) - # If we have too many tokens, we skip speculative logits - if input.shape[0] > 128: - return logits, None - - speculative_logits = self.medusa(input) - return logits, speculative_logits - - -class MedusaHeadV2(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with 
safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - self.n_medusa_heads = get_speculate() - - assert medusa_config["medusa_num_layers"] == 1 - self.linear = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], - dim=0, - weights=weights, - bias=True, - ) - self.process_group = weights.process_group - self.world_size = self.process_group.size() - self.rank = self.process_group.rank() - - self.act = torch.nn.SiLU() - - self.lm_head = TensorParallelHead.load(config, prefix, weights) - - def forward(self, x): - # If we have too many tokens, we skip speculative logits - if x.shape[0] > 128: - logits = self.lm_head(x) - return logits, None - - size = x.shape[-1] - block_size = (size + self.world_size - 1) // self.world_size - start = self.rank * block_size - stop = (self.rank + 1) * block_size - - x_block = x[:, start:stop] - - # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 - medusa_res = self.act(self.linear(x)).reshape( - *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] - ) - - # Apply all residual medusa heads - output = x[:, start:stop].unsqueeze(-2) + medusa_res - - # Gather medusa heads - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - - # Stack x and medusa residual x - stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) - - # Compute lm head on x + medusa residual x - logits = self.lm_head(stacked_x) - - # Finally, split logits from speculative logits - logits, speculative_logits = torch.split( - logits, [1, self.n_medusa_heads], dim=-2 - ) - # Squeeze added dimension - logits = logits.squeeze(-2) - - return logits, speculative_logits - - -class SpeculativeHead(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - use_medusa = config.use_medusa - if use_medusa: - lm_head = None - try: - medusa = MedusaHeadV1.load(config, prefix, weights) - except: - medusa = MedusaHeadV2(config, prefix, weights) - else: - lm_head = TensorParallelHead.load(config, prefix, weights) - medusa = None - return SpeculativeHead(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.medusa is not None: - return self.medusa(input) - - assert self.head is not None - logits = self.head(input) - return logits, None - - -class TensorParallelHead(SuperLayer): - def __init__(self, linear, process_group, should_gather: bool): - super().__init__(linear) - self.process_group = process_group - self.should_gather = should_gather - - @staticmethod - def load(config, prefix: str, weights): - if weights.process_group.size() > 1: - try: - weight = weights.get_sharded(f"{prefix}.weight", dim=0) - should_gather = True - except AssertionError: - # If the vocab size is not divisible by number of shards - # just load the entire thing. 
- weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - else: - weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - - # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) - if config.quantize in ["gptq", "awq", "eetq"]: - quantize = None - else: - quantize = config.quantize - return TensorParallelHead( - get_linear(weight, bias=None, quantize=quantize), - process_group=weights.process_group, - should_gather=should_gather, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if not self.should_gather: - return super().forward(input) - - world_size = self.process_group.size() - if len(input.shape) == 2 and isinstance(self.linear, FastLinear): - out_dim = self.linear.weight.shape[0] - - if input.shape[0] == 1: - world_out = input.new_empty(1, out_dim * world_size) - local_out = input.new_empty(1, out_dim) - gather_input = local_out - else: - world_out = input.new_empty(out_dim * world_size, input.shape[0]) - gather_input = input.new_empty(out_dim, input.shape[0]) - local_out = gather_input.T - - torch.mm(input, self.linear.weight.T, out=local_out) - - torch.distributed.all_gather_into_tensor( - world_out, gather_input, group=self.process_group - ) - - if input.shape[0] == 1: - return world_out - return world_out.T - - output = super().forward(input) - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - return world_output - - -class TensorParallelColumnLinear(SuperLayer): - @classmethod - def load_gate_up(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_gate_up( - prefix, quantize=config.quantize - ) - if bias: - raise NotImplementedError("packed_gate_up only implemented without bias") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load_qkv(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) - if bias: - raise NotImplementedError("packed_qkv only implemented for baichuan") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - return cls.load_multi(config, [prefix], weights, bias, dim=0) - - @classmethod - def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): - weight = weights.get_multi_weights_col( - prefixes, quantize=config.quantize, dim=dim - ) - - if bias: - b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] - bias = torch.cat(b, dim=dim) - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - -class TensorParallelRowLinear(SuperLayer): - def __init__(self, linear, process_group): - super().__init__(linear) - self.process_group = process_group - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) - - if bias and weights.process_group.rank() == 0: - # Rank is only on the first rank process - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls( - get_linear(weight, bias, config.quantize), - process_group=weights.process_group, - ) - - def 
forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: - out = super().forward(input) - if self.process_group.size() > 1 and reduce: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -class TensorParallelEmbedding(nn.Module): - def __init__(self, prefix: str, weights, reduce=True): - super().__init__() - weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) - num_embeddings = weights.get_shape(f"{prefix}.weight")[0] - - process_group = weights.process_group - - world_size = process_group.size() - rank = process_group.rank() - - block_size = (num_embeddings + world_size - 1) // world_size - self.min_id = rank * block_size - self.max_id = min(num_embeddings, (rank + 1) * block_size) - self.null_idx = weight.shape[ - 0 - ] # Usually block_size, might be less in non even vocab_size. - self.process_group = weights.process_group - self.reduce = reduce - - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - (self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = torch.nn.functional.embedding(input, self.weight) - if self.reduce and self.process_group.size() > 1: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -try: - if IS_CUDA_SYSTEM: - import dropout_layer_norm - elif IS_ROCM_SYSTEM: - from vllm import layernorm_ops - else: - dropout_layer_norm = None - - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - res_out = hidden_states - out = ipex.llm.functional.add_layer_norm( - residual, hidden_states, self.weight, self.bias, self.eps, True - ) - if residual is not None: - res_out = residual - return out, res_out - elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - - class FastRMSNorm(nn.Module): - def __init__(self, weight: torch.Tensor, eps: float): - super().__init__() - - self.weight = nn.Parameter(weight) - self.variance_epsilon = eps - - @classmethod - def load(cls, prefix, weights, eps=1e-6): - weight = weights.get_tensor(f"{prefix}.weight") - return cls(weight, eps) - - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - residual_out = hidden_states - out = ipex.llm.functional.add_rms_norm( - residual, - hidden_states, - self.weight, - None, - self.variance_epsilon, - True, - ) - if residual is not None: - residual_out = residual - return out, residual_out - elif hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt( - variance + self.variance_epsilon - ) - - # convert into half-precision if necessary - if 
self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states, residual - elif IS_CUDA_SYSTEM: - # faster post attention rms norm - ( - normed_hidden_states, - res, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - None, - None, - None, - None, - None, - 0.0, - self.variance_epsilon, - 1.0, - 0, - None, - False, - True, # Activate RMSNorm - ) - if res is None: - res = hidden_states - - return normed_hidden_states, res - elif IS_ROCM_SYSTEM: - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. - if residual is not None: - hidden_states += residual - residual = hidden_states - - out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - return out, residual - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." - ) - -except ImportError: - pass - -try: - if IS_CUDA_SYSTEM: - from flash_attn.layers.rotary import RotaryEmbedding - import rotary_emb - elif IS_ROCM_SYSTEM: - from vllm import pos_encoding_ops - - def _create_inv_freq(dim, base, device): - inv_freq = 1.0 / ( - base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) - ) - return inv_freq - - def _get_rope_config(config): - if os.getenv("ROPE_SCALING", None) is not None: - rope_scaling = { - "type": os.environ["ROPE_SCALING"], - "factor": float(os.environ["ROPE_FACTOR"]), - } - return rope_scaling - return getattr(config, "rope_scaling", None) - - class PositionRotaryEmbedding(nn.Module): - def __init__(self, inv_freq, scaling_factor): - super().__init__() - self.inv_freq = inv_freq - self._seq_len_cached = 0 - self._cos_cached = None - self._sin_cached = None - self._cos_k_cached = None - self._sin_k_cached = None - self.scaling_factor = scaling_factor - self.dynamic_args = None - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ): - # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: - rotary_dim = cos.shape[-1] - q1 = query[..., :rotary_dim] - q2 = query[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., :rotary_dim] - k2 = key[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. - pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) - elif IS_XPU_SYSTEM: - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), True - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) - - @classmethod - def static(cls, config, dim, base, device): - inv_freq = _create_inv_freq(dim, base, device) - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - return DynamicPositionRotaryEmbedding( - dim=dim, - max_position_embeddings=config.max_position_embeddings, - base=base, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - @classmethod - def load(cls, config, prefix, weights): - # XXX: Always load this in float32 ! - dtype = weights.dtype - weights.dtype = torch.float32 - inv_freq = weights.get_tensor(f"{prefix}.inv_freq") - weights.dtype = dtype - - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - return DynamicPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=config.max_position_embeddings, - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - def get_cos_sin( - self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype - ): - """ - Return cos and sin for the asked position ids - """ - if IS_ROCM_SYSTEM: - # For RoCm, we always use float cos/sin to avoid a cast. 
- # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 - # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. - dtype = torch.float32 - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) - - cos = torch.index_select(self._cos_cached, 0, position_ids) - sin = torch.index_select(self._sin_cached, 0, position_ids) - - # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. - return cos.unsqueeze(1), sin.unsqueeze(1) - - class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - newbase = self.base * ( - (self.scaling_factor * seqlen / self.max_position_embeddings) - - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - self.inv_freq = _create_inv_freq( - self.dim, newbase, self.inv_freq.device - ) - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - # Inverse dim formula to find dim based on number of rotations - import math - - def find_correction_dim( - num_rotations, dim, base=10000, max_position_embeddings=2048 - ): - return ( - dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi)) - ) / (2 * math.log(base)) - - # Find dim range bounds based on rotations - def find_correction_range( - low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 - ): - low = math.floor( - find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - def linear_ramp_mask(min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def get_mscale(scale=1): - if scale <= 1: - return 1.0 - return 0.1 * math.log(scale) + 1.0 - - class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings, - base, - device, - scaling_factor, - *, - extrapolation_factor, - attn_factor, - beta_fast, - beta_slow, - ): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = 
base - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - inv_freq_extrapolation = _create_inv_freq( - self.dim, self.base, self.inv_freq.device - ) - freqs = 1.0 / inv_freq_extrapolation - inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) - low, high = find_correction_range( - self.beta_fast, - self.beta_slow, - self.dim, - self.base, - self.max_position_embeddings, - ) - inv_freq_mask = ( - 1 - - linear_ramp_mask(low, high, self.dim // 2).float().to(device) - ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation - inv_freq = ( - inv_freq_interpolation * (1 - inv_freq_mask) - + inv_freq_extrapolation * inv_freq_mask - ) - - self.inv_freq = inv_freq - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) - self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) - -except ImportError: - pass diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py index 62c0c893..6cc30e6d 100644 --- a/server/text_generation_server/utils/paged_attention.py +++ b/server/text_generation_server/utils/paged_attention.py @@ -1,14 +1,18 @@ import torch -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM _PARTITION_SIZE = 512 -if IS_XPU_SYSTEM: +if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex +else: + try: + from vllm._C import cache_ops + from vllm._C import ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" + ) def reshape_and_cache( @@ -18,22 +22,14 @@ def reshape_and_cache( value_cache: torch.Tensor, slots: torch.Tensor, ): - if IS_CUDA_SYSTEM: - from vllm._C import cache_ops - - cache_ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 - ) - elif IS_ROCM_SYSTEM: - from vllm import cache_ops - - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots) - elif IS_XPU_SYSTEM: + if SYSTEM == "xpu": ipex.llm.modules.PagedAttention.reshape_and_cache( key, value, key_cache, value_cache, slots ) else: - raise ValueError("vllm is not supported on your system") + cache_ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0 + ) def attention( @@ -68,7 +64,7 @@ def attention( block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": query = query.contiguous() return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, @@ -91,43 +87,21 @@ def attention( # to parallelize. use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: - if IS_CUDA_SYSTEM: - from vllm._C import ops - - ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - elif IS_ROCM_SYSTEM: - from vllm import attention_ops - - attention_ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - ) - else: - raise ValueError("vllm is not supported on your system") - + ops.paged_attention_v1( + out, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) else: # Run PagedAttention V2. assert _PARTITION_SIZE % block_size == 0 @@ -143,45 +117,21 @@ def attention( ) max_logits = torch.empty_like(exp_sums) - if IS_CUDA_SYSTEM: - from vllm._C import ops - - ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - elif IS_ROCM_SYSTEM: - from vllm import attention_ops - - attention_ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - ) - else: - raise ValueError("vllm is not supported on your system") + ops.paged_attention_v2( + out, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index da7aed1a..6af7d3fb 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -171,7 +171,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -227,7 +227,7 @@ class Weights: bits, groupsize, desc_act, quant_method = self._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act @@ -242,7 +242,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -321,7 +321,7 @@ class Weights: # it would require to reorder input activations that are split unto several GPUs use_exllama = False - from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA, CAN_EXLLAMA if use_exllama: if not HAS_EXLLAMA: @@ -348,7 +348,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) diff --git a/update_doc.py b/update_doc.py index 6127418c..5da81c72 100644 --- a/update_doc.py +++ b/update_doc.py @@ -1,13 +1,34 @@ import subprocess import argparse +import ast + +TEMPLATE = """ +# Supported Models and Hardware + +Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models and hardware are supported. + +## Supported Models + +SUPPORTED_MODELS + +If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyway to see how well it performs, but performance isn't guaranteed for non-optimized models: + +```python +# for causal LMs/text-generation models +AutoModelForCausalLM.from_pretrained(<model>, device_map="auto") +# or, for text-to-text generation models +AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto") +``` + +If you wish to serve a supported model that already exists in a local folder, just point to the local folder.
+ +```bash +text-generation-launcher --model-id +``` +""" -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--check", action="store_true") - - args = parser.parse_args() - +def check_cli(check: bool): output = subprocess.check_output(["text-generation-launcher", "--help"]).decode( "utf-8" ) @@ -41,7 +62,7 @@ def main(): block = [] filename = "docs/source/basic_tutorials/launcher.md" - if args.check: + if check: with open(filename, "r") as f: doc = f.read() if doc != final_doc: @@ -53,12 +74,63 @@ def main(): ).stdout.decode("utf-8") print(diff) raise Exception( - "Doc is not up-to-date, run `python update_doc.py` in order to update it" + "Cli arguments Doc is not up-to-date, run `python update_doc.py` in order to update it" ) else: with open(filename, "w") as f: f.write(final_doc) +def check_supported_models(check: bool): + filename = "server/text_generation_server/models/__init__.py" + with open(filename, "r") as f: + tree = ast.parse(f.read()) + + enum_def = [ + x for x in tree.body if isinstance(x, ast.ClassDef) and x.name == "ModelType" + ][0] + _locals = {} + _globals = {} + exec(f"import enum\n{ast.unparse(enum_def)}", _globals, _locals) + ModelType = _locals["ModelType"] + list_string = "" + for data in ModelType: + list_string += f"- [{data.value['name']}]({data.value['url']})" + if data.value.get("multimodal", None): + list_string += " (Multimodal)" + list_string += "\n" + + final_doc = TEMPLATE.replace("SUPPORTED_MODELS", list_string) + + filename = "docs/source/supported_models.md" + if check: + with open(filename, "r") as f: + doc = f.read() + if doc != final_doc: + tmp = "supported.md" + with open(tmp, "w") as g: + g.write(final_doc) + diff = subprocess.run( + ["diff", tmp, filename], capture_output=True + ).stdout.decode("utf-8") + print(diff) + raise Exception( + "Supported models is not up-to-date, run `python update_doc.py` in order to update it" + ) + else: + with open(filename, "w") as f: + f.write(final_doc) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--check", action="store_true") + + args = parser.parse_args() + + check_cli(args.check) + check_supported_models(args.check) + + if __name__ == "__main__": main()
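For reference, the `check_supported_models` helper added above regenerates `docs/source/supported_models.md` without importing the server package (presumably to keep torch and the GPU kernels out of the docs job): it parses `server/text_generation_server/models/__init__.py` with `ast`, executes only the `ModelType` enum definition, and splices the rendered bullet list into `TEMPLATE` in place of `SUPPORTED_MODELS`. Below is a minimal, self-contained sketch of that pattern (Python >= 3.9 for `ast.unparse`); the `ModelType` entries here are illustrative stand-ins, not the real registry.

```python
import ast

# Stand-in for server/text_generation_server/models/__init__.py; the real
# ModelType enum has many more entries (these two are illustrative only).
SOURCE = '''
import enum

class ModelType(enum.Enum):
    LLAMA = {"type": "llama", "name": "Llama", "url": "https://huggingface.co/meta-llama"}
    IDEFICS = {
        "type": "idefics",
        "name": "Idefics",
        "url": "https://huggingface.co/HuggingFaceM4/idefics-9b",
        "multimodal": True,
    }
'''

# Pull out only the ModelType class definition so nothing else in the module
# (heavy imports included) ever needs to be importable.
tree = ast.parse(SOURCE)
enum_def = next(
    node for node in tree.body if isinstance(node, ast.ClassDef) and node.name == "ModelType"
)

# Execute just that class definition in an isolated namespace.
_globals, _locals = {}, {}
exec(f"import enum\n{ast.unparse(enum_def)}", _globals, _locals)
ModelType = _locals["ModelType"]

# Render the bullet list that replaces the SUPPORTED_MODELS marker in TEMPLATE.
list_string = ""
for data in ModelType:
    list_string += f"- [{data.value['name']}]({data.value['url']})"
    if data.value.get("multimodal", None):
        list_string += " (Multimodal)"
    list_string += "\n"
print(list_string)
```

In the diff above, `main()` wires this together with `check_cli`: running `python update_doc.py --check` fails when the committed docs drift from the generated ones, while running it without the flag rewrites the files in place.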
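Similarly, for the varlen Triton attention entry point added earlier in this diff (`triton_attention = _attention.apply`): per the `forward` signature shown there, it takes packed `(total_tokens, num_heads, head_size)` query/key/value tensors plus cumulative sequence-length offsets, and returns the output together with the (here unused) encoded softmax. The call below is only a sketch of how it might be invoked on a ROCm GPU; the import path is an assumption, since the kernel file's diff header falls outside this excerpt.

```python
import torch

# Assumed module path for illustration; triton_attention is the
# _attention.apply entry point, so all arguments are positional.
from text_generation_server.utils.flash_attn_triton import triton_attention

# Two packed sequences of lengths 5 and 3 (8 tokens total), 8 heads, head size 64.
cu_seqlens = torch.tensor([0, 5, 8], dtype=torch.int32, device="cuda")
q = torch.randn(8, 8, 64, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)
out = torch.empty_like(q)

output, _ = triton_attention(
    q, k, v, out,
    cu_seqlens, cu_seqlens,  # cu_seqlens_q, cu_seqlens_k
    5, 5,                    # max_seqlens_q, max_seqlens_k
    True,                    # causal
    q.shape[-1] ** -0.5,     # sm_scale
)
```

Inside the kernel, `sm_scale` is folded into `q` as `sm_scale * log2(e)` so the softmax loop can use `2^x` instead of `exp`, and work-groups whose query rows are entirely behind the causal boundary return early without running any GEMMs.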