diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b7cc7955..99f29d7e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -124,6 +124,15 @@ jobs: export extra_pytest="--neuron" export target="" ;; + gaudi) + export dockerfile="Dockerfile_gaudi" + export label_extension="-gaudi" + export docker_volume="/mnt/cache" + export docker_devices="" + export runs_on="ubuntu-latest" + export platform="" + export extra_pytest="" + export target="" esac echo $dockerfile echo "Dockerfile=${dockerfile}" @@ -224,7 +233,12 @@ jobs: - name: Final id: final run: | - echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + else + echo "docker_image=ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + fi echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT" echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT" echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 752c6ddd..f0d39399 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -21,6 +21,7 @@ on: - "Dockerfile_amd" - "Dockerfile_intel" - "Dockerfile.neuron" + - "Dockerfile_gaudi" branches: - "main" workflow_dispatch: @@ -38,7 +39,7 @@ jobs: # fail-fast is true by default fail-fast: false matrix: - hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"] + hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"] uses: ./.github/workflows/build.yaml # calls the one above ^ permissions: contents: write diff --git a/.github/workflows/nix_build.yaml b/.github/workflows/nix_build.yaml new file mode 100644 index 00000000..5a10bd39 --- /dev/null +++ b/.github/workflows/nix_build.yaml @@ -0,0 +1,53 @@ +name: "Nix Build Docker image" +on: + pull_request: + push: + branches: + - 'main' + tags: + - 'v*' +concurrency: + group: nix-image-${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build_nix_image: + runs-on: + group: aws-highmemory-32-plus-priv + steps: + - uses: actions/checkout@v4 + - uses: cachix/install-nix-action@v27 + with: + nix_path: nixpkgs=channel:nixos-unstable + - uses: cachix/cachix-action@v14 + with: + name: text-generation-inference + # If you chose signing key for write access + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + env: + USER: github_runner + - name: Build + run: nix build .#dockerImage + - name: Initialize Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + buildkitd-config: /tmp/buildkitd.toml + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4.4.1 + - name: Login to internal Container Registry + # if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + registry: registry.internal.huggingface.tech + - name: Push to docker + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }} + else + export TAG=nix-${{ github.ref_name }} + fi + export 
IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG + nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd" diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index f8c17dc7..895b4dd4 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -46,7 +46,7 @@ jobs: - name: Download locked kernels run: | source ./.venv/bin/activate - hf-kernels download server + kernels download server - name: Run server tests run: | source ./.venv/bin/activate diff --git a/.gitignore b/.gitignore index 7d6c7564..8a6bda72 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ server/fbgemmm hl-smi_log*.txt .graph_dumps out +hqt_output diff --git a/Cargo.lock b/Cargo.lock index 9b036e69..5e13e855 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4617,7 +4617,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "async-trait", "clap 4.5.30", @@ -4638,7 +4638,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "average", "clap 4.5.30", @@ -4658,7 +4658,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4676,7 +4676,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "clap 4.5.30", "ctrlc", @@ -4697,7 +4697,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "anyhow", "async-stream", @@ -4749,7 +4749,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4767,7 +4767,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "async-stream", "async-trait", @@ -4816,7 +4816,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.1.2-dev0" +version = "3.2.1-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 4e3ad010..d52adec4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.1.2-dev0" +version = "3.2.1-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" @@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.4.1", features = ["tokio"] } +hf-hub = { version = "0.4.2", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/Dockerfile b/Dockerfile index fb87968a..cbb0977f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,12 +183,12 @@ COPY server server COPY server/Makefile server/Makefile ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra 
outlines --no-install-project --active && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project --active && \ make gen-server-raw && \ - hf-kernels download . + kernels download . RUN cd server && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ uv pip install nvidia-nccl-cu12==2.25.1 && \ pwd && \ text-generation-server --help diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 5a22fab3..b2e0eb2c 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -5,7 +5,7 @@ RUN mkdir -p /tgi # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments FROM alpine AS optimum-neuron RUN mkdir -p /optimum-neuron -ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz +ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) @@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU # Install neuronx packages RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ - aws-neuronx-dkms=2.18.20.0 \ - aws-neuronx-collectives=2.22.33.0-d2128d1aa \ - aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \ - aws-neuronx-tools=2.19.0.0 \ + aws-neuronx-dkms=2.19.64.0 \ + aws-neuronx-collectives=2.23.135.0-3e70920f2 \ + aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \ + aws-neuronx-tools=2.20.204.0 \ libxml2 \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean @@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Install manually torch CPU version to avoid pulling CUDA RUN pip3 install \ - torch==2.1.2 \ - torchvision==0.16.2 \ + torch==2.5.1 \ + torchvision==0.20.1 \ --index-url https://download.pytorch.org/whl/cpu RUN pip3 install \ - neuronx-cc==2.15.143.0 \ - torch-neuronx==2.1.2.2.3.2 \ - transformers-neuronx==0.12.313 \ - neuronx-distributed==0.9.0 \ - libneuronxla==2.0.5347.0 \ + neuronx-cc==2.16.372.0 \ + torch-neuronx==2.5.1.2.4.0 \ + transformers-neuronx==0.13.322 \ + neuronx-distributed==0.10.1 \ + libneuronxla==2.1.681.0 \ --extra-index-url=https://pip.repos.neuron.amazonaws.com # Install HuggingFace packages diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index 14c507d0..a0ba74b6 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -1,6 +1,6 @@ # Those arguments are required to build the image -ARG HABANA_VERSION -ARG PYTORCH_VERSION +ARG HABANA_VERSION=1.20.0 +ARG PYTORCH_VERSION=2.6.0 # Rust builder FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef @@ -92,7 +92,6 @@ RUN cd server && \ make gen-server && \ pip install --no-deps -r requirements.txt && \ bash ./dill-0.3.8-patch.sh && \ - pip install outlines~=0.0.34 && \ pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \ BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \ pip install . 
--no-cache-dir diff --git a/Dockerfile_intel b/Dockerfile_intel index bdff0290..10ed914a 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen # Text Generation Inference base image for Intel -FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu +FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS xpu USER root @@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/ RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9 +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc # Text Generation Inference base env ENV HF_HOME=/data \ @@ -96,13 +96,11 @@ ENV HF_HOME=/data \ -WORKDIR /usr/src -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install triton-xpu==3.0.0b2 --no-cache-dir +WORKDIR /usr/src +RUN pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/test/xpu + +RUN pip install triton-xpu==3.2.0b1 --no-cache-dir # Install server COPY proto proto @@ -114,15 +112,14 @@ RUN cd server && \ pip install -U pip uv && \ uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ENV CCL_ZE_IPC_EXCHANGE=sockets -#ENV TORCH_LLM_ALLREDUCE=1 -#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 +ENV TORCH_LLM_ALLREDUCE=1 +ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0 -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 1ccf72b2d11cd00b47aef6d6cd054c088aa6f083 -RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch - +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.6.0%2Bxpu-cp311-cp311-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.6.10%2Bxpu-cp311-cp311-linux_x86_64.whl # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router diff --git a/Makefile b/Makefile index 3068a06f..2ecdd45c 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize: clean: rm -rf target aml + +preview_doc: + doc-builder preview text-generation-inference docs/source --not_python_module diff --git a/README.md b/README.md index e0633980..274cd86c 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 
@@ model=HuggingFaceH4/zephyr-7b-beta
 volume=$PWD/data
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model
 ```

 And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

 **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

-**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above.
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1-rocm --model-id $model` instead of the command above.

 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 token=

 docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
-    ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model
+    ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model
 ```

 ### A note on Shared Memory (shm)
diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index ce3be25d..6e38c19e 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -2,8 +2,8 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
 root_dir := "${mkfile_dir}/../.."

-HABANA_VERSION := 1.19.0
-PYTORCH_VERSION := 2.5.1
+HABANA_VERSION := 1.20.0
+PYTORCH_VERSION := 2.6.0

 .PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install
diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md
new file mode 100644
index 00000000..59701289
--- /dev/null
+++ b/backends/gaudi/examples/docker_commands/docker_commands.md
@@ -0,0 +1,283 @@
+# Examples of Docker Commands for Gaudi Backend
+
+This page lists example `docker run` commands for some of the most popular models.
+
+> **Note:** The parameters below are chosen to maximize performance on Gaudi2 hardware; please adjust them for your own hardware. For example, if you are using Gaudi3, you may want to increase the batch size.
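+
+Once any of the containers below is up, you can send it a quick request to confirm the server is working. This is a minimal sketch using TGI's standard `/generate` endpoint; it assumes the `-p 8080:80` port mapping used in all of the examples, and the prompt and `max_new_tokens` value are placeholders to adjust:
+
+```bash
+# Smoke-test a running TGI container (assumes -p 8080:80 as in the commands below)
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \
+    -H 'Content-Type: application/json'
+```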
+
+## Default Precision (BF16)
+
+### Llama3.1-8B on 1 card (BF16)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama3.1-70B on 8 cards (BF16)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llama2-7B on 1 card (BF16)
+
+```bash
+model=meta-llama/Llama-2-7b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama2-70B on 8 cards (BF16)
+
+```bash
+model=meta-llama/Llama-2-70b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llava-v1.6-Mistral-7B on 1 card (BF16)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
+
+## FP8 Precision
+
+Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the model's statistics before running it in FP8 precision.
+
+### Llama3.1-8B on 1 card (FP8)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama3.1-70B on 8 cards (FP8)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llama2-7B on 1 card (FP8)
+
+```bash
+model=meta-llama/Llama-2-7b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama2-70B on 8 cards (FP8)
+
+```bash
+model=meta-llama/Llama-2-70b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llava-v1.6-Mistral-7B on 1 card (FP8)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
+
+### Llava-v1.6-Mistral-7B on 8 cards (FP8)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
diff --git a/backends/gaudi/server/pyproject.toml b/backends/gaudi/server/pyproject.toml
index c61ac030..b38f4562 100644
--- a/backends/gaudi/server/pyproject.toml
+++ b/backends/gaudi/server/pyproject.toml
@@ -22,7 +22,7 @@ opentelemetry-instrumentation-grpc = "^0.36b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 peft = "^0.10"
-optimum-habana = "1.15.0"
+optimum-habana = "1.16.0"
 transformers = "4.45.2"
 numpy = "1.26.4"
 accelerate = "0.33.0"
diff --git a/backends/gaudi/server/requirements.txt b/backends/gaudi/server/requirements.txt
index 07414490..b71d29d9 100644
--- a/backends/gaudi/server/requirements.txt
+++ b/backends/gaudi/server/requirements.txt
@@ -46,7 +46,7 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi
 opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
 optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.1 ; python_version >= "3.9" and python_version < "3.13"
 pandas==2.2.3 ; python_version >= "3.9" and python_version
< "3.13" @@ -87,3 +87,18 @@ wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13" yarl==1.16.0 ; python_version >= "3.9" and python_version < "3.13" zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +outlines==0.0.34 ; python_version >= "3.9" and python_version < "3.13" +interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" +lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13" +cloudpickle==3.1.0 ; python_version >= "3.9" and python_version < "3.13" +diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" +numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13" +llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13" +jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13" +annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13" +jsonschema-specifications==2024.10.1 ; python_version >= "3.9" and python_version < "3.13" +nest-asyncio==1.6.0; python_version >= "3.9" and python_version < "3.13" +pydantic==2.10.6; python_version >= "3.9" and python_version < "3.13" +pydantic-core==2.27.2 ; python_version >= "3.9" and python_version < "3.13" +referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.13" +rpds-py==0.22.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/backends/gaudi/server/text_generation_server/models/__init__.py b/backends/gaudi/server/text_generation_server/models/__init__.py index dfdec9dc..6833ecce 100644 --- a/backends/gaudi/server/text_generation_server/models/__init__.py +++ b/backends/gaudi/server/text_generation_server/models/__init__.py @@ -20,8 +20,9 @@ from text_generation_server.models.causal_lm import CausalLM from text_generation_server.models.bloom import BLOOM from text_generation_server.models.starcoder import StarCoder from text_generation_server.models.vlm_causal_lm import VlmCausalLM - -# from text_generation_server.models.mllama_causal_lm import MllamaCausalLM +from text_generation_server.models.custom_modeling.mllama import ( + MllamaForConditionalGeneration, +) from text_generation_server.models.custom_modeling.llava_next import ( LlavaNextForConditionalGeneration, ) @@ -30,9 +31,6 @@ from text_generation_server.models.custom_modeling.flash_phi_moe_modeling import ) # from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch -# from text_generation_server.models.custom_modeling.mllama import ( -# MllamaForConditionalGeneration, -# ) from text_generation_server.utils.adapter import ( AdapterParameters, build_layer_weight_lookup, @@ -329,6 +327,7 @@ __GLOBALS = locals() for data in ModelType: __GLOBALS[data.name] = data.value["type"] +SDP_ON_BF16 = int(os.environ.get("SDP_ON_BF16", 0)) # Disable gradients torch.set_grad_enabled(False) @@ -849,6 +848,8 @@ def get_model( trust_remote_code=trust_remote_code, ) adapt_transformers_to_gaudi() + if SDP_ON_BF16 == 1: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) if model_type == "gpt_bigcode": return StarCoder(model_id=model_id, revision=revision, dtype=dtype) if model_type == "bloom": @@ -871,6 +872,17 @@ def get_model( trust_remote_code=trust_remote_code, ) + if model_type == "mllama": + return VlmCausalLM( + model_class=MllamaForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=None, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + if model_type in 
modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: return CausalLM( model_id, diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py index 8fda0517..776c109f 100644 --- a/backends/gaudi/server/text_generation_server/models/causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py @@ -704,6 +704,9 @@ class CausalLM(Model): htorch.core.hpu_set_env() if world_size > 1: + os.environ.setdefault( + "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1" + ) model = self.get_deepspeed_model(model_id, dtype, revision) model = hq_env.prepare_model_for_quantization(model) else: diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py index df7366ea..70449f6b 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py @@ -14,25 +14,18 @@ # limitations under the License. """ PyTorch Llava-NeXT model.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Union import torch import torch.utils.checkpoint -from torch import nn +import numpy as np -from transformers.activations import ACT2FN +from transformers.models.llava_next.modeling_llava_next import ( + unpad_image, +) +from optimum.habana.transformers.models import GaudiLlavaNextForConditionalGeneration from transformers.image_processing_utils import select_best_resolution -from text_generation_server.layers.attention import Seqlen -from text_generation_server.models.custom_modeling.vlm import ( - load_text_model, - load_vision_model, -) -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelRowLinear, -) - def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): """ @@ -40,7 +33,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): Args: image_size (`tuple`): - The size of the input image in the format (height, width). + The size of the input image in the format (width, height). grid_pinpoints (`List`): A list containing possible resolutions. Each item in the list should be a tuple or list of the form `(height, width)`. @@ -48,7 +41,7 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): The size of each image patch. Returns: - tuple: The shape of the image patch grid in the format (height, width). + tuple: The shape of the image patch grid in the format (width, height). """ if not isinstance(grid_pinpoints, list): raise ValueError("grid_pinpoints should be a list of tuples or lists") @@ -57,100 +50,53 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): return height // patch_size, width // patch_size -def unpad_image(tensor, original_size): +# Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L79 +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): """ - Unpads a PyTorch tensor of a padded and resized image. + Calculate the number of patches after the preprocessing for images of any resolution. Args: - tensor (`torch.Tensor`): - The image tensor, assumed to be of shape (num_channels, height, width). - original_size (`tuple`): - The original size of the image (height, width). 
+        image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list containing possible resolutions. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
     Returns:
-        `torch.Tensor`: The unpadded image tensor.
+        int: the number of patches
     """
-    original_height, original_width = original_size
-    current_height, current_width = tensor.shape[1:]
+    if not isinstance(grid_pinpoints, list):
+        raise TypeError("grid_pinpoints should be a list of tuples or lists")
-    original_aspect_ratio = original_width / original_height
-    current_aspect_ratio = current_width / current_height
+    # ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a tuple, otherwise the calculation will be wrong
+    if not isinstance(image_size, (list, tuple)):
+        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
+            raise TypeError(
+                f"image_size invalid type {type(image_size)} with value {image_size}"
+            )
+        image_size = image_size.tolist()
-    if original_aspect_ratio > current_aspect_ratio:
-        scale_factor = current_width / original_width
-        new_height = int(original_height * scale_factor)
-        padding = (current_height - new_height) // 2
-        unpadded_tensor = tensor[:, padding : current_height - padding, :]
-    else:
-        scale_factor = current_height / original_height
-        new_width = int(original_width * scale_factor)
-        padding = (current_width - new_width) // 2
-        unpadded_tensor = tensor[:, :, padding : current_width - padding]
-
-    return unpadded_tensor
+    best_resolution = select_best_resolution(image_size, grid_pinpoints)
+    height, width = best_resolution
+    num_patches = 0
+    # consider changing this to ceil(height/patch_size) * ceil(width/patch_size) + 1
+    for i in range(0, height, patch_size):
+        for j in range(0, width, patch_size):
+            num_patches += 1
+    # add the base patch
+    num_patches += 1
+    return num_patches

-# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
-class LlavaNextMultiModalProjector(nn.Module):
-    def __init__(self, prefix, config, weights):
-        super().__init__()
-
-        self.linear_1 = TensorParallelColumnLinear.load(
-            prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True
-        )
-        self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = TensorParallelRowLinear.load(
-            prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True
-        )
-
-    def forward(self, image_features):
-        hidden_states = self.linear_1(image_features)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
-        return hidden_states
-
-
-class LlavaNextForConditionalGeneration(nn.Module):
-    def __init__(self, prefix, config, weights):
-        super().__init__()
-        config.vision_config.quantize = config.quantize
-        vision_config = config.vision_config
-        # Instead of selecting in hidden_states[-2].
- # Instead compute only the n -2 + 1 layers and don't pool - if config.vision_feature_layer < 0: - vision_config.num_hidden_layers += config.vision_feature_layer + 1 - else: - vision_config.num_hidden_layers = config.vision_feature_layer + 1 - self.vision_tower = load_vision_model( - prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", - config=config.vision_config, - weights=weights, - ) - - self.multi_modal_projector = LlavaNextMultiModalProjector( - prefix="multi_modal_projector", config=config, weights=weights - ) - - self.image_newline = weights.get_tensor("image_newline") - - self.vocab_size = config.text_config.vocab_size - self.config = config - config.text_config.quantize = config.quantize - config.text_config.speculator = config.speculator - self.text_model = load_text_model( - prefix="language_model" if not prefix else f"{prefix}.language_model", - config=config.text_config, - weights=weights, - ) - self.pad_token_id = ( - config.pad_token_id if config.pad_token_id is not None else -1 - ) +class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): def _merge_input_ids_with_image_features( self, - input_ids: torch.Tensor, inputs_embeds: torch.Tensor, image_features: torch.Tensor, + input_ids: torch.Tensor, ): """In place merges in vision_embeddings with inputs_embeds.""" mask = input_ids == self.config.image_token_index @@ -165,126 +111,315 @@ class LlavaNextForConditionalGeneration(nn.Module): def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - cu_seqlen_prefill: Optional[torch.Tensor], - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, - slots: torch.Tensor, - seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], - lm_head_indices: Optional[torch.Tensor] = None, + input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, - # Unused for this model - pixel_attention_mask=None, image_sizes: Optional[torch.LongTensor] = None, - adapter_data: Optional[torch.Tensor] = None, - image_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = True, + flash_attention_recompute: Optional[bool] = True, ): - inputs_embeds = self.text_model.embed_tokens(input_ids) - if pixel_values is not None and len(pixel_values) > 0: - # num_special_image_tokens = (input_ids == self.config.image_token_index).sum() - # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid" - # 1. Extract the input embeddings - # 2. 
Merge text and images - num_images, num_patches, channels, height, width = pixel_values.shape - pixel_values = pixel_values.view( - num_images * num_patches, channels, height, width + if token_idx is not None: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions ) - image_features = self.vision_tower(pixel_values) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) - # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer] - # Already done within the clip model - selected_image_feature = image_features.last_hidden_state + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + ) - if self.config.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.config.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: - raise RuntimeError( - f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid." + logits = outputs[0] + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return outputs + + # Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L479 + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! 
infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [ + pix_val[:num_patch] + for pix_val, num_patch in zip(pixel_values, image_num_patches) + ] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError( + f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions" + ) + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [ + image_features.hidden_states[layer_idx] + for layer_idx in vision_feature_layer + ] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + **kwargs, + ): + """ + Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L635 + The only differences are: + - add new args token_idx + - add the process of merging images into inputs_embeds + """ + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, + ) + else: + use_flash_attention = kwargs.get("use_flash_attention", True) + flash_attention_recompute = kwargs.get("flash_attention_recompute", True) + + position_ids = kwargs.get("position_ids", None) + labels = kwargs.get("labels", None) + if ( + past_key_values is None + and pixel_values is not None + and input_ids.shape[1] != 1 + ): + vision_feature_select_strategy = kwargs.get( + "vision_feature_select_strategy", None + ) + vision_feature_layer = kwargs.get("vision_feature_layer", None) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + vision_feature_layer = ( + vision_feature_layer + if vision_feature_layer is not None + else self.config.vision_feature_layer ) - image_features = self.multi_modal_projector(selected_image_feature) + # 1. Extract the input embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + # 2. 
Merge text and images + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) - # split up image_features for each of the individual images - # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) - # if we assume each image has 5 image features (base image + 4 patches) - split_sizes = [num_patches] * num_images - image_features = torch.split(image_features, split_sizes, dim=0) + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + height = width = ( + self.config.vision_config.image_size + // self.config.vision_config.patch_size + ) - # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - height = width = ( - self.config.vision_config.image_size - // self.config.vision_config.patch_size - ) + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] + if height * width != base_image_feature.shape[0]: + raise ValueError( + "The number of patches is not consistent with the image size." + ) - if height * width != base_image_feature.shape[0]: - raise ValueError( - "The number of patches is not consistent with the image size." + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx].tolist(), + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, ) - # Dimensions are intentionally swapped to be bug-compatible with - # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 - num_patch_width, num_patch_height = get_anyres_image_grid_shape( - image_sizes[image_idx], - self.config.image_grid_pinpoints, - self.config.vision_config.image_size, - ) - image_feature = image_feature.view( - num_patch_height, num_patch_width, height, width, -1 - ) - image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image(image_feature, image_sizes[image_idx]) - image_feature = torch.cat( - ( - image_feature, - self.image_newline[:, None, None].expand( - *image_feature.shape[:-1], 1 + image_feature = image_feature.view( + num_patch_height, num_patch_width, height, width, -1 + ) + image_feature = image_feature.permute( + 4, 0, 2, 1, 3 + ).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image( + image_feature, image_sizes[image_idx] + ) + image_feature = torch.cat( + ( + image_feature, + self.image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1 + ), ), - ), - dim=-1, - ) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat( - (base_image_feature, image_feature), dim=0 - ) - else: - image_feature = image_feature[0] - image_feature = torch.cat( - (image_feature, self.image_newline[None]), dim=0 - ) - new_image_features.append(image_feature) - image_features = torch.stack(new_image_features, dim=0) + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat( + (base_image_feature, image_feature), dim=0 + ) + else: + image_feature = image_feature[0] + image_feature = torch.cat( + (image_feature, 
self.image_newline[None]), dim=0 + ) + new_image_features.append(image_feature) + image_features = torch.cat(new_image_features, dim=0) + inputs_embeds = self._merge_input_ids_with_image_features( + inputs_embeds, image_features, input_ids + ) + # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of + # generation with cache + elif past_key_values is not None: + seq_len = input_ids.shape[1] + pad_len = seq_len - token_idx.item() + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where( + first_layer_past_key_value.float().sum(-2) == 0 + ) + # Get the target length + past_length = first_layer_past_key_value.shape[-1] + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] - inputs_embeds = self._merge_input_ids_with_image_features( - input_ids, inputs_embeds, image_features + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = extended_attention_mask + attention_mask[:, -pad_len:] = 0 + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = ( + torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + ) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, + } ) - hidden_states = self.text_model.model( - inputs_embeds=inputs_embeds, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - true_max_s=max_s, - prefill_cache_indices=None, - adapter_data=adapter_data, - ) - if lm_head_indices is not None: - hidden_states = hidden_states[lm_head_indices] - logits, speculative_logits = self.text_model.lm_head(hidden_states) - return logits, speculative_logits + return model_inputs diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py 
b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py index e040a542..6ba0ffff 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py @@ -14,985 +14,279 @@ # limitations under the License. """PyTorch Mllama model.""" -from typing import Optional, Tuple, List +from typing import Optional, Tuple, List, Union import torch import torch.utils.checkpoint -from torch import nn -from habana_frameworks.torch.hpex.kernels import FusedSDPA -from vllm_hpu_extension.utils import ModuleFusedSDPA - - -from transformers.activations import ACT2FN -import torch.nn.functional as F - -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, - FastLinear, -) -from text_generation_server.layers.attention import ( - Seqlen, -) -from text_generation_server.models.custom_modeling.flash_llama_modeling import ( - FlashLlamaForCausalLM, +from optimum.habana.transformers.models import GaudiMllamaForConditionalGeneration +from optimum.habana.transformers.models.mllama.modeling_mllama import ( + _prepare_cross_attention_mask, ) +from transformers.modeling_outputs import CausalLMOutputWithPast -def _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask: torch.Tensor, - num_patches: int, - target_length: int, - dtype: torch.dtype, -) -> torch.Tensor: - # Expand aspect ratio mask to target_length - batch_size, max_num_tiles = aspect_ratio_mask.shape - attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) - attention_mask = attention_mask.repeat(1, 1, target_length, 1) - - # Mask padding patches - pad_patches = target_length - num_patches - attention_mask[:, :, -pad_patches:] = 0 - - # Invert the mask (0 -> 1, 1 -> 0) - attention_mask = 1 - attention_mask - - # Reshape to 2D and create 4D attention mask - # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) - attention_mask = attention_mask.reshape( - batch_size, max_num_tiles * target_length, 1 - ) - attention_mask = ( - attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min - ) - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - -# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. 
- min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - causal_mask = torch.full( - (sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device, - ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device - ) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = ( - causal_mask.clone() - ) # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = ( - causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - ) - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[ - :, :, :, :mask_length - ].masked_fill(padding_mask, min_dtype) - - return causal_mask - - -def _prepare_cross_attention_mask( - cross_attention_mask: torch.Tensor, - num_vision_tokens: int, - dtype: str, -) -> Tuple[torch.Tensor, torch.Tensor]: - # reshape so it can be used by attn module - batch_size, text_total_length, *_ = cross_attention_mask.shape - cross_attention_mask = cross_attention_mask.repeat_interleave( - num_vision_tokens, dim=3 - ) - cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1) - cross_attention_mask = cross_attention_mask.unsqueeze(1) - - # invert the mask - inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) - cross_attention_mask = inverted_cross_attn_mask.masked_fill( - inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min - ) - - # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's - # last dimension contains negative infinity values, otherwise it's 1 - negative_inf_value = torch.finfo(dtype).min - full_text_row_masked_out_mask = ( - (cross_attention_mask != negative_inf_value) - .any(dim=-1) - .type_as(cross_attention_mask)[..., None] - ) - cross_attention_mask *= full_text_row_masked_out_mask - - return cross_attention_mask, full_text_row_masked_out_mask - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision -class MllamaVisionMLP(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = TensorParallelColumnLinear.load( - prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True - ) - self.fc2 = TensorParallelRowLinear.load( - prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class MllamaVisionSdpaAttention(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - - self.embed_dim = config.hidden_size - self.head_dim = config.hidden_size // config.attention_heads - self.num_heads = config.attention_heads // weights.process_group.size() - - 
self.qkv_proj = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], - dim=0, - weights=weights, - bias=False, - ) - self.o_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.o_proj", - weights=weights, - bias=False, - ) +class MllamaForConditionalGeneration(GaudiMllamaForConditionalGeneration): def forward( self, - hidden_state: torch.Tensor, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qkv = self.qkv_proj(hidden_state) - query, key, value = qkv.split( - [ - self.head_dim * self.num_heads, - self.head_dim * self.num_heads, - self.head_dim * self.num_heads, - ], - dim=2, - ) - - batch_size, q_seq_len, _ = query.shape - _, kv_seq_len, _ = key.shape - - query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim) - key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) - value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) - - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - - attn_output = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_seq_len, -1) - - output = self.o_proj(attn_output) - return output - - -class MllamaVisionEncoderLayer(nn.Module): - def __init__(self, *, prefix, config, weights, is_gated: bool): - super().__init__() - - self.hidden_size = config.hidden_size - self.num_attention_heads = config.attention_heads - self.is_gated = is_gated - self.intermediate_size = config.intermediate_size - - self.self_attn = MllamaVisionSdpaAttention( - prefix=f"{prefix}.self_attn", config=config, weights=weights - ) - self.mlp = MllamaVisionMLP( - prefix=f"{prefix}.mlp", config=config, weights=weights - ) - - self.input_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05 - ) - self.post_attention_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05 - ) - - # there used to be an if else here, no code path - if is_gated: - self.gate_attn = nn.Parameter( - weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False - ) - self.gate_ffn = nn.Parameter( - weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False - ) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - # Self Attention - residual = hidden_state - hidden_state = self.input_layernorm(hidden_state) - hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask) - gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() - hidden_state = residual + gate_attn * hidden_state - - # Feed forward - residual = hidden_state - hidden_state = self.post_attention_layernorm(hidden_state) - hidden_state = self.mlp(hidden_state) - gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() - hidden_state = residual + gate_ffn * hidden_state - return hidden_state - - -class MllamaVisionEncoder(nn.Module): - def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int): - super().__init__() - self.config = config - self.layers = [ - MllamaVisionEncoderLayer( - prefix=f"{prefix}.layers.{i}", - config=config, - 
weights=weights, - is_gated=is_gated, - ) - for i in range(num_layers) - ] - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - encoder_states = [hidden_states] - for encoder_layer in self.layers: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - ) - - hidden_states = layer_outputs - encoder_states.append(hidden_states) - - return hidden_states, encoder_states - - -class MllamaPrecomputedAspectRatioEmbedding(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.max_aspect_ratio_id = config.max_aspect_ratio_id - - self.embedding = TensorParallelEmbedding( - prefix=f"{prefix}.embedding", weights=weights - ) - self.gate = nn.Parameter( - weights.get_tensor(f"{prefix}.gate"), requires_grad=False - ) - - def forward( - self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor - ) -> torch.Tensor: - embeddings = self.embedding(aspect_ratio_ids) - embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size) - - # Always gated. - embeddings = embeddings * self.gate.tanh() - - hidden_state = hidden_state + embeddings - return hidden_state - - -class MllamaPrecomputedPositionEmbedding(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.num_patches = (config.image_size // config.patch_size) ** 2 + 1 - self.hidden_size = config.hidden_size - self.scale = config.hidden_size**-0.5 - - self.gate = nn.Parameter( - weights.get_tensor(f"{prefix}.gate"), requires_grad=False - ) - - # position embedding - embedding = nn.Parameter( - weights.get_tensor(f"{prefix}.embedding"), requires_grad=False - ) - self.gated_position_embedding = (1 - self.gate.tanh()) * embedding - self.tile_embedding = TensorParallelEmbedding( - prefix=f"{prefix}.tile_embedding", weights=weights - ) - - def forward( - self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor - ) -> torch.Tensor: - # position embeddings - hidden_state = hidden_state + self.gated_position_embedding.view( - 1, 1, self.num_patches, self.hidden_size - ) - - # precomputed tile position embeddings - tile_position_embedding = self.tile_embedding(aspect_ratio_ids) - batch_size = hidden_state.shape[0] - tile_position_embedding = tile_position_embedding.reshape( - batch_size, self.max_num_tiles, self.num_patches, self.hidden_size - ) - gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding - hidden_state = hidden_state + gated_tile_position_embedding - - return hidden_state - - -class MllamaVisionModel(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.image_size = config.image_size - self.patch_size = config.patch_size - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.num_channels = config.num_channels - self.intermediate_layers_indices = config.intermediate_layers_indices - - self.num_patches = (self.image_size // self.patch_size) ** 2 + 1 - self.scale = config.hidden_size**-0.5 - self.dtype = weights.dtype - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.hidden_size, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - bias=False, - ) - self.patch_embedding.weight = nn.Parameter( - weights.get_tensor(f"{prefix}.patch_embedding.weight"), 
requires_grad=False - ) - - self.class_embedding = nn.Parameter( - weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False - ) - - self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( - prefix=f"{prefix}.gated_positional_embedding", - config=config, - weights=weights, - ) - - self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( - prefix=f"{prefix}.pre_tile_positional_embedding", - config=config, - weights=weights, - ) - self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( - prefix=f"{prefix}.post_tile_positional_embedding", - config=config, - weights=weights, - ) - - ## layer norms - self.layernorm_pre = nn.LayerNorm.load( - prefix=f"{prefix}.layernorm_pre", - weights=weights, - # torch default - eps=1e-05, - ) - self.layernorm_post = nn.LayerNorm.load( - prefix=f"{prefix}.layernorm_post", - weights=weights, - # torch default - eps=1e-05, - ) - - ## encoders - self.transformer = MllamaVisionEncoder( - prefix=f"{prefix}.transformer", - config=config, - weights=weights, - is_gated=False, - num_layers=config.num_hidden_layers, - ) - self.global_transformer = MllamaVisionEncoder( - prefix=f"{prefix}.global_transformer", - config=config, - weights=weights, - is_gated=True, - num_layers=config.num_global_layers, - ) - - def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: - batch_size, _, hidden_size = hidden_state.shape - class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) - hidden_state = torch.cat([class_embedding, hidden_state], dim=1) - return hidden_state - - def forward( - self, - pixel_values: torch.Tensor, - aspect_ratio_ids: torch.Tensor, - attention_mask: torch.Tensor, - ) -> torch.Tensor: - ( - batch_size, - num_concurrent_media, - num_tiles, - num_channels, - height, - width, - ) = pixel_values.shape - - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, height, width - ) - aspect_ratio_ids = aspect_ratio_ids.reshape( - batch_size * num_concurrent_media, -1 - ) - - # patch embedding - patch_embeds = self.patch_embedding(pixel_values) - hidden_state = patch_embeds.flatten(2).transpose(1, 2) - - # tile embeddings - _, num_patches, dim = hidden_state.shape - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, num_tiles, -1, dim - ) - hidden_state = self.pre_tile_positional_embedding( - hidden_state, aspect_ratio_ids - ) - - # apply cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim - ) - hidden_state = self.apply_class_embedding(hidden_state) - num_patches += 1 - - # apply position embeddings - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, num_tiles, num_patches, dim - ) - hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) - - # apply encoder - hidden_state = self.layernorm_pre(hidden_state) - - # Compute the number of tokens to pad - num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 - # Compute padding tuple for pad function - padding = ( - 0, - 0, - 0, - num_padding_patches, - ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) - # Pad the tensor - hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) - slice_index = -num_padding_patches if num_padding_patches > 0 else None - - if attention_mask is not None: - attention_mask = attention_mask.reshape( - batch_size * num_concurrent_media, -1 - ) - attention_mask = 
_prepare_aspect_ratio_attention_mask( - aspect_ratio_mask=attention_mask, - num_patches=self.num_patches, - target_length=hidden_state.shape[2], - dtype=self.dtype, - ) - - hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim) - hidden_state, all_intermediate_hidden_states = self.transformer( - hidden_state, - attention_mask=attention_mask, - ) - intermediate_hidden_states = [ - hidden_state - for idx, hidden_state in enumerate(all_intermediate_hidden_states) - if idx in self.intermediate_layers_indices - ] - intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1) - - # apply global encoder - hidden_state = self.layernorm_post(hidden_state) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim, - ) - hidden_state = self.post_tile_positional_embedding( - hidden_state, aspect_ratio_ids - ) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles * (num_patches + num_padding_patches), - dim, - ) - hidden_state, _ = self.global_transformer( - hidden_state, attention_mask=attention_mask - ) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim, - ) - hidden_state = hidden_state[:, :, :slice_index] - - # adding intermediate layer outputs - hidden_state = hidden_state.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, dim - ) - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - -1, - ) - intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index] - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, -1 - ) - hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1) - return hidden_state - - -class MllamaTextCrossAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, *, prefix, config, weights, layer_idx): - super().__init__() - self.config = config - self.num_heads = self.config.num_attention_heads - self.num_key_value_heads = self.config.num_key_value_heads - self.dropout = config.dropout - self.hidden_size = config.hidden_size - self.head_size = config.hidden_size // self.num_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.layer_idx = layer_idx - - self.num_heads = self.num_heads // weights.process_group.size() - self.num_key_value_heads = ( - self.num_key_value_heads // weights.process_group.size() - ) - - self.q_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.q_proj", - weights=weights, - bias=False, - ) - self.k_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.k_proj", - weights=weights, - bias=False, - ) - self.v_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.v_proj", - weights=weights, - bias=False, - ) - self.o_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.o_proj", - weights=weights, - bias=False, - ) - - self.q_norm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps - ) - self.k_norm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps - ) - self.softmax_scale = self.head_size**-0.5 - - def forward( - self, - hidden_states: torch.Tensor, + cross_attention_mask: 
Optional[torch.Tensor] = None, cross_attention_states: Optional[torch.Tensor] = None, - # past_key_value=None, - # attention_mask: Optional[torch.Tensor] = None, - # cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - # hidden_states = hidden_states.unsqueeze(0) - # bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - query_states = query_states.view(-1, self.num_heads, self.head_size) - query_states = self.q_norm(query_states) - - ( - cross_attention_states, - cu_seqlen_q, - cu_seqlen_k, - max_q, - max_k, - indices, - ) = cross_attention_states - - key_states = self.k_proj(cross_attention_states) - value_states = self.v_proj(cross_attention_states) - key_states = key_states.view(-1, self.num_key_value_heads, self.head_size) - value_states = value_states.view(-1, self.num_key_value_heads, self.head_size) - key_states = self.k_norm(key_states) - - # key_states = key_states.repeat(1, self.num_key_value_groups, 1) - # value_states = value_states.repeat(1, self.num_key_value_groups, 1) - - causal = False - # logger.info( - # f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}" - # ) - query_states = query_states.unsqueeze(0).transpose(1, 2) - key_states = key_states.unsqueeze(0).transpose(1, 2) - value_states = value_states.unsqueeze(0).transpose(1, 2) - fsdpa_op = ModuleFusedSDPA(FusedSDPA) - attn_output = fsdpa_op( - query_states, - key_states, - value_states, - attn_mask=None, - dropout_p=0.0, - is_causal=causal, - scale=None, - softmax_mode="None", - recompute_mode=None, - valid_sequence_lengths=None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = True, + flash_attention_recompute: Optional[bool] = True, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + """ + Copied from MllamaForConditionalGeneration::forward: https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/mllama/modeling_mllama.py#L2077 + The only differences are: + - add token_idx input + - add use_flash_attention and flash_attention_recompute + """ + full_text_row_masked_out_mask = kwargs.get( + "full_text_row_masked_out_mask", None ) - attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous() - - attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) - - return attn_output - - -# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText -class MllamaTextMLP(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = ( - config.intermediate_size // weights.process_group.size() - ) - self.gate_up_proj = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], - weights=weights, - dim=0, - bias=False, - ) - self.down_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.down_proj", - 
weights=weights, - bias=False, - ) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - shape = x.shape - gate_up_states = self.gate_up_proj(x) - gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size) - result = self.down_proj( - self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1] - ) - return result - - -class FlashLlamaCrossLayer(torch.nn.Module): - """Cross-attention transformer block with tanh-gated attention and feedforward.""" - - def __init__(self, *, prefix, config, weights, index) -> None: - layer_idx = index - super().__init__() - self.cross_attn = MllamaTextCrossAttention( - prefix=f"{prefix}.cross_attn", - config=config, - weights=weights, - layer_idx=layer_idx, - ) - - self.input_layernorm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps - ) - self.cross_attn_attn_gate = torch.nn.Parameter( - weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False - ) - - self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) - self.post_attention_layernorm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.post_attention_layernorm", - weights=weights, - eps=config.rms_norm_eps, - ) - self.cross_attn_mlp_gate = torch.nn.Parameter( - weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False - ) - self.layer_idx = layer_idx - - def forward( - self, - hidden_states, - residual, - cos, - sin, - cu_seqlen_prefill, - kv_cache, - block_tables, - slots, - seqlen, - max_s, - adapter_data, - cross_attention_states, # [ IB, ...] - ) -> Tuple[torch.Tensor, torch.Tensor]: - if cross_attention_states is None: - return hidden_states, residual - if residual is not None: - hidden_states += residual - - indices = cross_attention_states[-1] - out_hidden_states = hidden_states[:] - if len(indices) > 0: - assert max(indices) < hidden_states.shape[0] - hidden_states = hidden_states[indices] - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - hidden_states = self.cross_attn( - hidden_states=hidden_states, - # attention_mask=cross_attention_mask, - cross_attention_states=cross_attention_states, - ) - hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states - - out_hidden_states[indices] = hidden_states - hidden_states = out_hidden_states - - return hidden_states, None - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText -class MllamaTextRMSNorm(nn.Module): - def __init__(self, weight, eps): - super().__init__() - self.weight = weight - self.variance_epsilon = eps - - @classmethod - def load(cls, *, prefix, weights, eps): - weight = nn.Parameter( - weights.get_tensor(f"{prefix}.weight"), requires_grad=False - ) - return cls(weight=weight, eps=eps) - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -class MllamaForConditionalGeneration(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() 
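# --- Illustrative sketch, not part of the patch: the shape contract of the
# multi_modal_projector reshape that both the removed vision_forward and the
# new prepare_inputs_for_generation rely on. All dimensions below are made-up
# examples, and the Linear layer is only a stand-in for the real projector.
import torch

batch_size, num_tiles, num_patches, vision_dim, hidden_size = 2, 4, 10, 32, 16
vision_states = torch.randn(batch_size, num_tiles, num_patches, vision_dim)
projector = torch.nn.Linear(vision_dim, hidden_size)  # stand-in for multi_modal_projector
cross_attention_states = projector(vision_states).reshape(
    -1, vision_states.shape[-2], hidden_size
)
# one row of image tokens per tile: [batch_size * num_tiles, num_patches, hidden_size]
assert cross_attention_states.shape == (batch_size * num_tiles, num_patches, hidden_size)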
- config.vision_config.quantize = None - config.vision_config.speculator = config.speculator - config.text_config.quantize = config.quantize - config.text_config.speculator = config.speculator - config.text_config._attn_implementation = "sdpa" - self.hidden_size = config.text_config.hidden_size - self.vision_model = MllamaVisionModel( - prefix="vision_model", config=config.vision_config, weights=weights - ) - self.multi_modal_projector = FastLinear.load( - prefix="multi_modal_projector", config=config, weights=weights, bias=True - ) - self.text_model = FlashLlamaForCausalLM( - prefix="language_model", config=config.text_config, weights=weights - ) - self.config = config - self.dtype = weights.dtype - self.device = weights.device - - def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask): - if aspect_ratio_ids is None: + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( - "`aspect_ratio_ids` must be provided if `pixel_values` is provided" - ) - # logger.info(f"PIxel values {pixel_values.shape}") - batch_size = pixel_values.shape[0] - vision_states = self.vision_model( - pixel_values, aspect_ratio_ids, aspect_ratio_mask - ) - cross_attention_states = self.multi_modal_projector(vision_states).reshape( - -1, vision_states.shape[-2], self.hidden_size - ) - _, _, h = cross_attention_states.shape - cross_attention_states = cross_attention_states.view(batch_size, -1, h) - # logger.info(f"cross {cross_attention_states.shape}") - return cross_attention_states - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - cu_seqlen_prefill: Optional[torch.Tensor], - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, - slots: torch.Tensor, - seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], - lm_head_indices: Optional[torch.Tensor], - adapter_data: Optional[torch.Tensor] = None, - # XXX: Putting these as optional so that the cuda warmup calls can go through. 
- cross_attention_states: Optional[torch.Tensor] = None, - image_indices=None, - ): - if cross_attention_states is not None: - seqlen_q = len(image_indices) - n_images = cross_attention_states.shape[0] - seqlen_k = cross_attention_states.shape[1] - device = cross_attention_states.device - if cu_seqlen_prefill is not None: - offset = 0 - cu_q = [] - indices = [] - for index in image_indices: - cu_q.append(offset) - length = seqlen.input_lengths[index].item() - assert index < seqlen.cu_seqlen_q.shape[0] - input_ids_offset = seqlen.cu_seqlen_q[index] - indices.extend(range(input_ids_offset, input_ids_offset + length)) - offset += length - cu_q.append(offset) - cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32) - - assert max(indices) < input_ids.shape[0] - - cu_seqlen_k = ( - torch.arange( - n_images + 1, - device=device, - dtype=torch.int32, - ) - * seqlen_k - ) - max_q = cu_seqlen_q[-1].item() - max_k = seqlen_k - else: - cu_seqlen_q = torch.arange( - seqlen_q + 1, device=device, dtype=torch.int32 - ) - seqlen_k = cross_attention_states.shape[1] - n_images = cross_attention_states.shape[0] - cu_seqlen_k = ( - torch.arange( - n_images + 1, - device=device, - dtype=torch.int32, - ) - * seqlen_k - ) - max_q = seqlen_q - max_k = seqlen_k - indices = image_indices[:] - - cross_attention_states = ( - cross_attention_states, - cu_seqlen_q, - cu_seqlen_k, - max_q, - max_k, - indices, + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" ) - outputs = self.text_model( + outputs = self.language_model( input_ids=input_ids, + attention_mask=attention_mask, position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=prefill_cache_indices, - lm_head_indices=lm_head_indices, - adapter_data=adapter_data, cross_attention_states=cross_attention_states, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + labels=labels, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + token_idx=token_idx, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) + logits = outputs[0] + if not return_dict: + output = (logits,) + outputs[1:] + return output + return outputs + + def prepare_inputs_for_generation( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + position_ids=None, + pixel_values=None, + aspect_ratio_ids=None, + aspect_ratio_mask=None, + cross_attention_mask=None, + past_key_values=None, + use_cache=False, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + """ + Copied from MllamaForConditionalGeneration::prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/mllama/modeling_mllama.py#L2208 + The only differences are: + - add token_idx handling + - add bucket_internal handling + - add use_flash_attention and flash_attention_recompute + """ + + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + 
aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + cross_attention_mask=cross_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + else: + use_flash_attention = kwargs.get("use_flash_attention", True) + flash_attention_recompute = kwargs.get("flash_attention_recompute", True) + position_ids = kwargs.get("position_ids", None) + output_attentions = kwargs.get("output_attentions", None) + output_hidden_states = kwargs.get("output_hidden_states", None) + return_dict = kwargs.get("return_dict", None) + labels = kwargs.get("labels", None) + cross_attention_states = kwargs.get("cross_attention_states", None) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + bucket_internal = kwargs.get("bucket_internal", None) + + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + elif inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif ( + input_ids.shape[1] != cache_position.shape[0] + ): # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + elif bucket_internal and token_idx is not None: + # For the first token, we can slice the inputs up to token_idx for the forward pass. + input_ids = input_ids[:, :token_idx] + attention_mask = attention_mask[:, :token_idx] + if cross_attention_mask is not None: + cross_attention_mask = cross_attention_mask[:, :token_idx, ...] + + # TODO: there is no attention_mask here, so this will not work; check whether the attention mask is truly unnecessary or find another way + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select( + position_ids, 1, token_idx - 1 + ) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead"`, as otherwise the input `position_ids` would have varying strides during decoding. Here, simply using `.contiguous()` is not sufficient: in the batch size = 1 case, `position_ids` is already contiguous but with varying stride, which retriggers a capture.
+ position_ids = position_ids.clone( + memory_format=torch.contiguous_format + ) + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and cross_attention_states is not None: + raise ValueError( + "`pixel_values` and `cross_attention_states` cannot be provided simultaneously" + ) + + if pixel_values is not None: + if aspect_ratio_ids is None: + raise ValueError( + "`aspect_ratio_ids` must be provided if `pixel_values` is provided" + ) + # get vision tokens from vision model + vision_outputs = self.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + use_flash_attention=use_flash_attention, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.multi_modal_projector( + cross_attention_states + ).reshape(-1, cross_attention_states.shape[-2], self.hidden_size) + + if cross_attention_mask is not None: + cross_attention_mask, full_text_row_masked_out_mask = ( + _prepare_cross_attention_mask( + cross_attention_mask, + num_vision_tokens=self.vision_model.num_patches, + dtype=self.dtype, + token_idx=token_idx, + ) + ) + else: + full_text_row_masked_out_mask = None + + if cross_attention_mask is not None: + if cache_position is not None: + cross_attention_mask = cross_attention_mask[:, :, cache_position] + full_text_row_masked_out_mask = full_text_row_masked_out_mask[ + :, :, cache_position + ] + elif past_key_values is not None: + if token_idx is not None: + cross_attention_mask = torch.index_select( + cross_attention_mask, -2, token_idx - 1 + ) + full_text_row_masked_out_mask = torch.index_select( + full_text_row_masked_out_mask, -2, token_idx - 1 + ) + else: + cross_attention_mask = cross_attention_mask[:, :, -1:] + full_text_row_masked_out_mask = full_text_row_masked_out_mask[ + :, :, -1: + ] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. 
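# --- Illustrative sketch, not part of the patch: how token_idx drives the
# single-step slicing above. Buffers stay statically padded for HPU graphs;
# token_idx points one past the last valid token, so token_idx - 1 selects the
# token (or mask row) for the current decode step. All values are made up.
import torch

input_ids = torch.tensor([[11, 12, 13, 0, 0, 0]])  # right-padded to a bucket of 6
token_idx = torch.tensor([3])                      # 3 valid tokens so far
step_ids = torch.index_select(input_ids, 1, token_idx - 1)
assert step_ids.tolist() == [[13]]
# cross-attention masks are sliced the same way along their text-length dim (-2):
cross_attention_mask = torch.ones(1, 1, 6, 4)      # [bs, 1, text_len, image_tokens]
step_mask = torch.index_select(cross_attention_mask, -2, token_idx - 1)
assert step_mask.shape == (1, 1, 1, 4)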
+ model_inputs = { + "input_ids": input_ids.clone(memory_format=torch.contiguous_format), + "inputs_embeds": None, + } + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + # keep cache_position implementation as None for HPU + cache_position = None + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + "return_dict": kwargs.get("return_dict"), + "full_text_row_masked_out_mask": full_text_row_masked_out_mask, + "use_flash_attention": use_flash_attention, + "cross_attention_mask": cross_attention_mask, + "cross_attention_states": cross_attention_states, + "output_attentions": output_attentions, + "flash_attention_recompute": flash_attention_recompute, + } + ) + + return model_inputs diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index d4f4c1af..66e00171 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -8,7 +8,6 @@ from io import BytesIO from opentelemetry import trace from loguru import logger from typing import Iterable, Optional, Tuple, List, Type, Dict -import itertools import tempfile import copy from text_generation_server.models import Model @@ -19,7 +18,6 @@ from text_generation_server.models.causal_lm import ( CausalLMBatch, CausalLMRequest, remove_kv_cache_from_output, - biggest_single_chunk, ) from transformers.models.llava_next.modeling_llava_next import ( @@ -68,14 +66,19 @@ IDEFICS2_IMAGE_TOKEN = "" IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") BASE_IMAGE_TOKENS = int(os.environ.get("BASE_IMAGE_TOKENS", 2048)) MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) -MAX_BATCH_TOTAL_TOKENS = int(os.environ.get("MAX_BATCH_TOTAL_TOKENS", 131072)) -PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 128)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) +max_batch_size_str = os.environ.get("MAX_BATCH_SIZE") +if max_batch_size_str is not None: + MAX_BATCH_SIZE = int(max_batch_size_str) +else: + raise ValueError("MAX_BATCH_SIZE is not set") PREFILL_WARMUP_BATCH_SIZE_LIST = [] PREFILL_WARMUP_SEQLEN_LIST = [] DECODE_WARMUP_BATCH_SIZE_LIST = [] +CROSS_ATTENTION_LAYERS = [] def round_up(warmup_list: list, num): @@ -83,7 +86,7 @@ def round_up(warmup_list: list, num): for i in warmup_list: if num <= i: break - return i + return i if i > 0 else num def split(string) -> List[Dict[str, str]]: @@ -103,20 +106,17 @@ def split(string) -> List[Dict[str, str]]: return parts -def image_text_replacement(processor, image_input, config, image_id: int) -> str: +def image_text_replacement(config) -> str: if config.model_type == "idefics2": image_seq_len = 64 image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}" - if processor.image_processor.do_image_splitting: - image_str *= 5 return image_str elif config.model_type == "llava_next": - height, width = image_input["image_sizes"][image_id] - num_features = get_number_of_features(height, width, config) - return "" * num_features - + return "" elif config.model_type == 
"paligemma": - return "" * config.text_config.num_image_tokens + return "" + elif config.model_type == "mllama": + return "<|image|>" else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -188,6 +188,100 @@ class VlmCausalLMBatch(CausalLMBatch): pixel_values: Optional[List[torch.Tensor]] pixel_attention_mask: Optional[List[torch.Tensor]] image_sizes: Optional[List[Tuple[int, int]]] + aspect_ratio_ids: Optional[torch.Tensor] = None + aspect_ratio_mask: Optional[torch.Tensor] = None + cross_attention_mask: Optional[torch.Tensor] = None + prefilling: bool = True + token_idx: torch.Tensor = None + + def __init__( + self, + batch_id, + requests, + input_ids, + attention_mask, + position_ids, + past_key_values, + merged_kv_cache, + next_token_chooser, + top_n_tokens, + top_n_tokens_tensor, + input_length, + pixel_values: Optional[List[torch.Tensor]] = None, + pixel_attention_mask: Optional[List[torch.Tensor]] = None, + image_sizes: Optional[List[Tuple[int, int]]] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, + prefilling: Optional[bool] = True, + ): + super().__init__( + batch_id=batch_id, + requests=requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + merged_kv_cache=merged_kv_cache, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_length, + ) + + self.pixel_values = pixel_values + self.pixel_attention_mask = pixel_attention_mask + self.image_sizes = image_sizes + self.aspect_ratio_ids = aspect_ratio_ids + self.aspect_ratio_mask = aspect_ratio_mask + self.cross_attention_mask = cross_attention_mask + self.prefilling = prefilling + + @property + def token_idx(self): + if self.prefilling: + # no right padding for prefill + token_idx_scalar = self.attention_mask.shape[-1] - 1 + return torch.tensor(token_idx_scalar).to(self.attention_mask.device) + else: + token_idx_scalar = self.attention_mask.shape[-1] - self.right_padding + return torch.tensor(token_idx_scalar).to(self.attention_mask.device) + + def padding_process(self, pad_id: int): + # self.input_ids = torch.index_select(self.input_ids, 1, self.token_idx - 1) + right_padding = MAX_TOTAL_TOKENS - self.attention_mask.shape[1] + self.input_ids = torch.nn.functional.pad( + self.input_ids, (0, right_padding), value=pad_id + ) + self.attention_mask = torch.nn.functional.pad( + self.attention_mask, (0, right_padding), value=0 + ) + # if self.position_ids is not None: + # self.position_ids = torch.index_select(self.position_ids, 1, self.token_idx - 1) + 1 + if self.cross_attention_mask is not None: + self.cross_attention_mask = torch.nn.functional.pad( + self.cross_attention_mask, (0, 0, 0, 0, 0, right_padding), value=0 + ) + if self.past is not None: + past_key_values_list = list(self.past_key_values) + for layer_id in range(len(self.past)): + past_key_value_list = list(self.past_key_values[layer_id]) + if layer_id not in CROSS_ATTENTION_LAYERS: + past_key_value_list[0] = torch.nn.functional.pad( + self.past_key_values[layer_id][0], + (0, 0, 0, right_padding), + value=0, + ) + past_key_value_list[1] = torch.nn.functional.pad( + self.past_key_values[layer_id][1], + (0, 0, 0, right_padding), + value=0, + ) + past_key_values_list[layer_id] = tuple(past_key_value_list) + self.past_key_values = tuple(past_key_values_list) + + self.prefilling = 
False + self.input_length = self.input_length @classmethod def from_tokenized( @@ -235,23 +329,23 @@ class VlmCausalLMBatch(CausalLMBatch): bucket_size = max_input_length left_padding = max_input_length - input_len if is_warmup is False: - if input_len < max_input_length: - rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) - if rounded_seq_len <= max_input_length: - bucket_size = rounded_seq_len - 1 - else: - bucket_size = max_input_length - 1 - left_padding = bucket_size - input_len + rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) + bucket_size = rounded_seq_len - 1 + left_padding = bucket_size - input_len input_ids = tokenized_inputs["input_ids"] attention_mask = tokenized_inputs["attention_mask"] + cross_attention_mask = tokenized_inputs.get("cross_attention_mask", None) # Allocate space for first token - if left_padding > 0: - input_ids = torch.nn.functional.pad( - input_ids, (left_padding, 1), value=tokenizer.pad_token_id - ) - attention_mask = torch.nn.functional.pad( - attention_mask, (left_padding, 1), value=0 + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + if cross_attention_mask is not None: + cross_attention_mask = torch.nn.functional.pad( + cross_attention_mask, (0, 0, 0, 0, left_padding, 1), value=0 ) all_input_ids = torch.nn.functional.pad( input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id @@ -266,9 +360,13 @@ class VlmCausalLMBatch(CausalLMBatch): r.all_input_ids = all_input_ids[r.idx] input_ids = input_ids.to(device) attention_mask = attention_mask.to(device) + cross_attention_mask = ( + cross_attention_mask.to(device) + if cross_attention_mask is not None + else None + ) position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - htorch.core.mark_step() return cls( @@ -283,6 +381,7 @@ class VlmCausalLMBatch(CausalLMBatch): top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, input_length=input_len, + cross_attention_mask=cross_attention_mask, ) @classmethod @@ -294,46 +393,40 @@ class VlmCausalLMBatch(CausalLMBatch): config, is_warmup, ): - # Process images first. We need all of them so that the processor - # can make the image splits the same size. And we need the final - # sizes to insert correct number of image tokens. 
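# --- Illustrative sketch, not part of the patch: the prefill bucketing used in
# from_tokenized above. round_up picks the first warmup bucket that fits the
# prompt (plus one slot reserved for the first generated token), and the prompt
# is left-padded up to that bucket so HPU graphs only ever see warmed-up shapes.
# The bucket list is a made-up example; the loop is hardened with i = 0 so the
# `i if i > 0 else num` fallback is well-defined even for an empty list.
PREFILL_WARMUP_SEQLEN_LIST = [128, 256, 512, 1024]

def round_up(warmup_list: list, num):
    i = 0
    for i in warmup_list:
        if num <= i:
            break
    return i if i > 0 else num

input_len = 300
rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1)  # 512
bucket_size = rounded_seq_len - 1        # 511: one slot kept for the first new token
left_padding = bucket_size - input_len   # 211 pad tokens added on the left
assert (rounded_seq_len, bucket_size, left_padding) == (512, 511, 211)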
+ image_inputs = {} + texts = [] images = [] - for r in requests: + batch_tokenized_inputs = {} + + for i, r in enumerate(requests): + # Each input is encoded into a list, where each element of this input list is either a string or a URL + curr_text = "" + curr_image = None for chunk in r.input_chunks.chunks: chunk_type = chunk.WhichOneof("chunk") if chunk_type == "text": - pass + curr_text += chunk.text elif chunk_type == "image": image = Image.open(BytesIO(chunk.image.data)) - if config.model_type == "llava_next": - images.append(image) - else: - images.append([image]) + # TODO unsure about BOS + curr_image = image else: raise RuntimeError(f"Invalid chunk type {chunk_type}") - image_inputs = None - if images: - image_inputs = processor.image_processor(images, return_tensors="pt") - - batch_inputs = [] - max_truncation = 0 - image_id = 0 - for r in requests: - full_text = "" - for chunk in r.input_chunks.chunks: - chunk_type = chunk.WhichOneof("chunk") - if chunk_type == "text": - full_text += chunk.text - elif chunk_type == "image": - full_text += image_text_replacement( - processor, image_inputs, config, image_id + if image_text_replacement(config) not in curr_text: + if "" in curr_text: + curr_text = curr_text.replace( + "", image_text_replacement(config) ) - image_id += 1 - full_text = image_text_replacement_fixup(config, full_text) + else: + curr_text = image_text_replacement(config) + curr_text - batch_inputs.append(full_text) - max_truncation = max(max_truncation, r.truncate) + texts.append(curr_text) + if curr_image is not None: + if config.model_type == "mllama": + images.append([curr_image]) + else: + images.append(curr_image) missing_inputs = 0 dummy_images = None @@ -342,45 +435,48 @@ class VlmCausalLMBatch(CausalLMBatch): missing_inputs = new_bs - len(requests) if missing_inputs > 0: dummy_inputs = [] - if len(batch_inputs) > 0: - dummy_inputs = [batch_inputs[0]] * missing_inputs + if len(texts) > 0: + dummy_inputs = [texts[0]] * missing_inputs + dummy_images = [images[0]] * missing_inputs + texts += dummy_inputs + images += dummy_images - batch_inputs += dummy_inputs - - batch_tokenized_inputs = tokenizer( - batch_inputs, + processor_output = processor( + images, + texts, truncation=True, - max_length=max_truncation, - add_special_tokens=not config.model_type == "paligemma", + max_length=r.truncate, + add_special_tokens=r.add_special_tokens, return_tensors="pt", + padding_side="left", padding="longest", - return_token_type_ids=False, ) - - if missing_inputs > 0 and image_inputs is not None: - dummy_shape = list(image_inputs["pixel_values"].shape) - dummy_shape[0] = missing_inputs - dummy_images = torch.rand(dummy_shape) - new_image_inputs = { - "pixel_values": torch.cat( - (image_inputs["pixel_values"], dummy_images), dim=0 - ), - } - if "pixel_attention_mask" in image_inputs: - dummy_shape = list(image_inputs["pixel_attention_mask"].shape) - dummy_shape[0] = missing_inputs - dummy_attention = torch.zeros(dummy_shape) - new_image_inputs["pixel_attention_mask"] = torch.cat( - (image_inputs["pixel_attention_mask"], dummy_attention), dim=0 - ) - if "image_sizes" in image_inputs: - dummy_shape = list(list(image_inputs["image_sizes"])[0]) - dummy_shape = missing_inputs * [dummy_shape] - dummy_sizes = torch.IntTensor(dummy_shape) - new_image_inputs["image_sizes"] = torch.cat( - (image_inputs["image_sizes"], dummy_sizes), dim=0 - ) - image_inputs = new_image_inputs + if "input_ids" in processor_output: + batch_tokenized_inputs.update({"input_ids": processor_output["input_ids"]}) + 
if "attention_mask" in processor_output: + batch_tokenized_inputs.update( + {"attention_mask": processor_output["attention_mask"]} + ) + if "cross_attention_mask" in processor_output: + batch_tokenized_inputs.update( + {"cross_attention_mask": processor_output["cross_attention_mask"]} + ) + if "pixel_values" in processor_output: + image_inputs.update({"pixel_values": processor_output["pixel_values"]}) + if "pixel_attention_mask" in processor_output: + image_inputs.update( + {"pixel_attention_mask": processor_output["pixel_attention_mask"]} + ) + if "aspect_ratio_ids" in processor_output: + image_inputs.update( + {"aspect_ratio_ids": processor_output["aspect_ratio_ids"]} + ) + if "aspect_ratio_mask" in processor_output: + image_inputs.update( + {"aspect_ratio_mask": processor_output["aspect_ratio_mask"]} + ) + if "image_sizes" in processor_output: + image_inputs.update({"image_sizes": processor_output["image_sizes"]}) return batch_tokenized_inputs, image_inputs @@ -398,7 +494,9 @@ class VlmCausalLMBatch(CausalLMBatch): batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( pb.requests, tokenizer, processor, config, is_warmup ) - batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + batch = cls.from_tokenized( + pb, tokenizer, batch_tokenized_inputs, dtype, device, is_warmup=is_warmup + ) if image_inputs is not None: batch.pixel_values = image_inputs["pixel_values"].to(device=device) if "pixel_attention_mask" in image_inputs: @@ -411,10 +509,26 @@ class VlmCausalLMBatch(CausalLMBatch): batch.image_sizes = image_inputs["image_sizes"].to(device=device) else: batch.image_sizes = None + if "aspect_ratio_ids" in image_inputs: + batch.aspect_ratio_ids = image_inputs["aspect_ratio_ids"].to( + device=device + ) + else: + batch.aspect_ratio_ids = None + if "aspect_ratio_mask" in image_inputs: + batch.aspect_ratio_mask = image_inputs["aspect_ratio_mask"].to( + device=device + ) + else: + batch.aspect_ratio_mask = None else: batch.pixel_values = None batch.pixel_attention_mask = None batch.image_sizes = None + batch.aspect_ratio_ids = None + batch.aspect_ratio_mask = None + batch.cross_attention_mask = None + return batch @classmethod @@ -436,107 +550,231 @@ class VlmCausalLMBatch(CausalLMBatch): ) -> "VlmCausalLMBatch": if not all(b.past_key_values is not None for b in batches): raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + # Used for padding total_requests = sum(len(b) for b in batches) new_bs = total_requests - if is_warmup is False: + if not is_warmup: new_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, total_requests) - batch_id = batches[0].batch_id - device = batches[0].input_ids.device - input_lengths = [b.input_length for b in batches] - max_input_length = max(input_lengths) - offsets = [max_input_length - b.input_length for b in batches] - - cur_padding = [b.right_padding for b in batches] - # For prefill there is a space allocated only for first token - # Need to add padding to the max total tokens before first decode - - moves_needed = [ - total_requests - len(b) if b.batch_size == new_bs else total_requests - for b in batches - ] - dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] - reshape = batches[dst_batch_idx].batch_size < new_bs - - # TODO: Add support for changing max seq len, i.e. 
due to output length bucketing - # FIXME: max_seq_len for non optimized code if len(batches) > 1: scenario = "CONCAT" - elif reshape: - scenario = "RESHAPE" - elif cur_padding[dst_batch_idx] <= 0: + elif batches[0].prefilling: scenario = "SHIFT" - offsets = [ - biggest_single_chunk(b.max_input_length - max_input_length) - for b in batches - ] - max_input_length = max_input_length + offsets[dst_batch_idx] else: - # Nothing to do return batches[0] dbg_trace( scenario, f"bs:{[b.batch_size for b in batches]}->{new_bs}" - f" reqs:{[len(b) for b in batches]}" - f" offsets:{offsets}" - f" input_lengths:{input_lengths}" - f" cur_padding:{cur_padding}" - f" dst_batch:{dst_batch_idx}", + f" reqs:{[len(b) for b in batches]}", ) - grouped_requests = [[req for req in batch.requests] for batch in batches] - flat_requests = list(itertools.chain(*grouped_requests)) + if scenario == "SHIFT": + batch = batches[0] + batch.padding_process(pad_token_id) + return batch - for i in range(len(batches)): - target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size - batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) - batches[i].realign(target_bs, offsets[i], pad_token_id) - batches[i].split_kv_cache_if_needed(i == dst_batch_idx) - batches[dst_batch_idx].expand_bs(new_bs) - batches[dst_batch_idx].move_data( - [batches[i] for i in range(len(batches)) if i != dst_batch_idx] - ) + total_batch_size = 0 + max_input_length = 0 + for i, batch in enumerate(batches): + total_batch_size += len(batch) + max_input_length = max(max_input_length, batch.input_length) + # Batch attributes + requests = [] + input_lengths = [] + top_n_tokens = [] + parameters = [] + fsm_grammar_states = [] - top_n_tokens = [r.data.top_n_tokens for r in flat_requests] - top_n_tokens_tensor = torch.tensor( - top_n_tokens, device=device, dtype=torch.int64 - ) + # Batch tensors + input_ids = None + attention_mask = None + position_ids = None + past_key_values = [] + top_n_tokens_tensor = None + cross_attention_mask = None + # Used for slicing correctly inside the tensors + # Equivalent to a cumsum on batch sizes + start_index = 0 + for i, batch in enumerate(batches): + keep_indices = [] + for req in batch.requests: + keep_indices.append(req.idx) - parameters = [r.data.parameters for r in flat_requests] - # append the dummy parameters for dummy requests - batch_size = batches[dst_batch_idx].batch_size - parameters = pad_next_token_chooser_parameters(parameters, batch_size) + requests.extend(batch.requests) + parameters.extend([r.data.parameters for r in batch.requests]) + fsm_grammar_states.extend( + [batch.next_token_chooser.fsm_grammar_states[i] for i in keep_indices] + ) + input_lengths.extend([batch.input_length]) + top_n_tokens.extend([batch.top_n_tokens[i] for i in keep_indices]) - # update past grammar states - fsm_grammar_states = [0] * batch_size - for batch in batches: - for i, req in enumerate(batch.requests): - fsm_grammar_states[req.idx] = ( - batch.next_token_chooser.fsm_grammar_states[i] + # Slicing end index for this batch + end_index = start_index + len(batch) + + # We only concatenate batches that did at least one step + if batch.past_key_values is None: + raise ValueError("only concatenate prefilled batches") + + # Create empty tensor + # input_ids is always of shape [batch_size, 1] + # We do not need to pad it + if input_ids is None: + input_ids = batch.input_ids.new_empty((new_bs, MAX_TOTAL_TOKENS)) + # # Copy to correct indices + + left_offset = max_input_length - batch.input_length + right_padding = 
MAX_TOTAL_TOKENS - max_input_length + input_ids[start_index:end_index, left_offset:-right_padding] = ( + batch.input_ids[keep_indices, : batch.input_length] + ) + + # Create padded tensor + if top_n_tokens_tensor is None: + top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( + new_bs, + ) + top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor[ + keep_indices + ] + + if attention_mask is None: + attention_mask = batch.attention_mask.new_zeros( + (new_bs, MAX_TOTAL_TOKENS), ) + attention_mask[ + start_index:end_index, + left_offset:-right_padding, + ] = batch.attention_mask[ + keep_indices, + : batch.input_length, + ] + + if batch.cross_attention_mask is not None: + cross_attention_mask_shape = list(batch.cross_attention_mask.shape) + cross_attention_mask_shape[1] = MAX_TOTAL_TOKENS + cross_attention_mask_shape[0] = new_bs + cross_attention_mask_shape = torch.Size(cross_attention_mask_shape) + if cross_attention_mask is None: + cross_attention_mask = batch.cross_attention_mask.new_zeros( + cross_attention_mask_shape, + ) + cross_attention_mask[ + start_index:end_index, + left_offset:-right_padding, + ] = batch.cross_attention_mask[ + keep_indices, + : batch.input_length, + ] + + # Create empty tensor + # position_ids is always of shape [batch_size, 1] + if position_ids is None: + position_ids = batch.position_ids.new_empty((new_bs, 1)) + position_ids[start_index:end_index] = batch.position_ids[keep_indices, :] + + # Shenanigans to get dimensions because BLOOM outputs a past with a different shape + # BLOOM Keys: [batch_size * num_heads, head_dim, seq_length] + # BLOOM Values: [batch_size * num_heads, seq_length, head_dim] + # And ensure that we can update tensors in-place + if isinstance(batch.past_key_values, tuple): + batch.past_key_values = [ + [t.view(batch.batch_size, -1, *t.shape[-2:]) for t in layer] + for layer in batch.past_key_values + ] + elif len(batch.past_key_values[0][0].shape) == 3: + for layer in batch.past_key_values: + for k, t in enumerate(layer): + layer[k] = t.view(batch.batch_size, -1, *t.shape[-2:]) + + start_index = end_index + + first_past_kvs = batches[0].past_key_values + _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape + past_key_values = [] + for layer_id in range(len(batches[0].past_key_values)): + if layer_id in CROSS_ATTENTION_LAYERS: + padded_past_keys_shape = list( + batches[0].past_key_values[layer_id][0].shape + ) + padded_past_keys_shape[0] = new_bs + padded_past_keys_shape = torch.Size(padded_past_keys_shape) + else: + padded_past_keys_shape = ( + new_bs, + num_heads, + MAX_TOTAL_TOKENS, + head_dim, + ) + + padded_past_keys = first_past_kvs[layer_id][0].new_zeros( + padded_past_keys_shape + ) + padded_past_values = first_past_kvs[layer_id][1].new_zeros( + padded_past_keys_shape + ) + start_index = 0 + for batch in batches: + keep_indices = [] + for req in batch.requests: + keep_indices.append(req.idx) + + left_offset = max_input_length - batch.input_length + right_padding = MAX_TOTAL_TOKENS - max_input_length + past_keys = batch.past_key_values[layer_id][0] + past_values = batch.past_key_values[layer_id][1] + # Clear reference to the original tensor + batch.past_key_values[layer_id] = None + + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the keys to remove the padding from previous batches + if layer_id in CROSS_ATTENTION_LAYERS: + padded_past_keys[start_index:end_index, :, :, :] = past_keys[ + keep_indices, :, :, : + ] + 
padded_past_values[start_index:end_index, :, :, :] = past_values[ + keep_indices, :, :, : + ] + + else: + padded_past_keys[ + start_index:end_index, :, left_offset:-right_padding, : + ] = past_keys[keep_indices, :, : batch.input_length, :] + padded_past_values[ + start_index:end_index, :, left_offset:-right_padding, : + ] = past_values[keep_indices, :, : batch.input_length, :] + + start_index = end_index + + past_key_values.append(tuple([padded_past_keys, padded_past_values])) + past_key_values = tuple(past_key_values) + + batch_id = batches[0].batch_id + top_n_tokens.extend([-1] * (new_bs - total_batch_size)) + fsm_grammar_states.extend([-1] * (new_bs - total_batch_size)) + + for idx, req in enumerate(requests): + req.idx = idx + + parameters = pad_next_token_chooser_parameters(parameters, new_bs) next_token_chooser = HeterogeneousNextTokenChooser.from_pb( parameters, - batches[dst_batch_idx].next_token_chooser.dtype, - batches[dst_batch_idx].next_token_chooser.device, - batches[dst_batch_idx].next_token_chooser.tokenizer, + batches[0].next_token_chooser.dtype, + batches[0].next_token_chooser.device, + batches[0].next_token_chooser.tokenizer, fsm_grammar_states, quantization_enabled=hq_env.is_quantization_enabled, ) - - input_ids = batches[dst_batch_idx].input_ids - attention_mask = batches[dst_batch_idx].attention_mask - position_ids = batches[dst_batch_idx].position_ids - past_key_values = batches[dst_batch_idx].past_key_values input_length = max_input_length htorch.core.mark_step() return cls( batch_id=batch_id, - requests=flat_requests, + requests=requests, input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -546,6 +784,13 @@ class VlmCausalLMBatch(CausalLMBatch): top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, input_length=input_length, + pixel_values=None, + pixel_attention_mask=None, + image_sizes=None, + aspect_ratio_ids=None, + aspect_ratio_mask=None, + cross_attention_mask=cross_attention_mask, + prefilling=False, ) @@ -597,6 +842,9 @@ class VlmCausalLM(Model): htorch.core.hpu_set_env() if world_size > 1: + os.environ.setdefault( + "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1" + ) model = self.get_deepspeed_model(model_class, model_id, dtype, revision) model = hq_env.prepare_model_for_quantization(model) else: @@ -674,6 +922,11 @@ class VlmCausalLM(Model): self.kwargs["flash_attention_recompute"] = True self.speculate = get_speculate() + if model.config.model_type == "mllama": + global CROSS_ATTENTION_LAYERS, BASE_IMAGE_TOKENS + CROSS_ATTENTION_LAYERS = model.config.text_config.cross_attention_layers + BASE_IMAGE_TOKENS = 0 + super(VlmCausalLM, self).__init__( model_id=model_id, model=model, @@ -802,39 +1055,39 @@ class VlmCausalLM(Model): def forward( self, - input_ids, - attention_mask, - position_ids, - token_idx, - past_key_values: Optional[List[Tuple]] = None, - pixel_values: Optional[List[torch.Tensor]] = None, - image_sizes: Optional[List[Tuple[int, int]]] = None, + batch: VlmCausalLMBatch, bypass_hpu_graph: Optional[bool] = None, ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: # Model Forward kwargs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "token_idx": token_idx, - "pixel_values": pixel_values, - "image_sizes": image_sizes, + "input_ids": batch.input_ids, + "attention_mask": batch.attention_mask, + "past_key_values": batch.past_key_values, + "token_idx": batch.token_idx, + "pixel_values": batch.pixel_values, } + if 
self.model.config.model_type == "mllama": + kwargs["aspect_ratio_ids"] = batch.aspect_ratio_ids + kwargs["aspect_ratio_mask"] = batch.aspect_ratio_mask + kwargs["cross_attention_mask"] = batch.cross_attention_mask + else: + kwargs["image_sizes"] = batch.image_sizes + hpu_kwargs = {} # Optimum Habana got "lazy_mode" key-val only supported for llama type of models if self.model.config.model_type == "llama": hpu_kwargs["lazy_mode"] = LAZY_MODE == 1 if self.has_position_ids: - kwargs["position_ids"] = position_ids - + kwargs["position_ids"] = batch.position_ids if bypass_hpu_graph is not None: hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph kwargs.update(self.kwargs) model_inputs = self.model.prepare_inputs_for_generation(**kwargs) - if past_key_values is not None: + + if batch.past_key_values is not None: return self.model.forward(**model_inputs, **hpu_kwargs) else: outputs = self.model.forward(**model_inputs, **hpu_kwargs) @@ -842,8 +1095,9 @@ class VlmCausalLM(Model): @tracer.start_as_current_span("generate_token") def generate_token( - self, batches: List[VlmCausalLMBatch], is_warmup: bool = False - ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: + self, batches: list[VlmCausalLMBatch], is_warmup: bool = False + ) -> Tuple[List[Generation], Optional[VlmCausalLMBatch], Tuple[int, int]]: + start = time.time_ns() # Results generations: List[Generation] = [] @@ -923,9 +1177,18 @@ class VlmCausalLM(Model): # Update attention_mask as we added a new token to input_ids batch.attention_mask.index_fill_(1, token_idx, 1) + # add cross-attn mask for new token + if batch.cross_attention_mask is not None: + cross_attention_mask_prev = batch.cross_attention_mask + if token_idx is not None: + mask = cross_attention_mask_prev[ + :, token_idx - 2 : token_idx - 1, ... 
+ ] + cross_attention_mask_prev.index_copy_(1, token_idx - 1, mask) + batch.cross_attention_mask = cross_attention_mask_prev + # Adjust lengths batch.input_length += 1 - # Update position_ids if prefill: batch.position_ids = ( @@ -951,7 +1214,7 @@ class VlmCausalLM(Model): # Check if we need to do any bookkeeping first if not prefill: - batch = batch.__class__.recombine( + batch = self.batch_type.recombine( [batch], self.tokenizer.pad_token_id, is_warmup ) @@ -973,38 +1236,34 @@ class VlmCausalLM(Model): # Execute batch if prefill: # no right padding for prefill - token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + # token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) batch.logits, batch.past = self.forward( - batch.input_ids, - batch.attention_mask, - batch.position_ids, - token_idx, - batch.past_key_values, - batch.pixel_values, - batch.image_sizes, + batch, bypass_hpu_graph=( prefill and self.limit_hpu_graph if self.enable_hpu_graph else None ), ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): # Don't schedule next forward if max_new_tokens for all requests equals 1 # - we've already generated the first and only needed token in the prefill phase pass else: - token_idx = torch.tensor( - batch.attention_mask.shape[-1] - batch.right_padding - ).to(self.device) + # token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device) batch.logits = self.forward( - batch.input_ids, - batch.attention_mask, - batch.position_ids, - token_idx, - batch.past_key_values, + batch, bypass_hpu_graph=( prefill and self.limit_hpu_graph if self.enable_hpu_graph else None ), ) + if batch.pixel_values is not None: + batch.pixel_values = None + if batch.aspect_ratio_ids is not None: + batch.aspect_ratio_ids = None + if batch.aspect_ratio_mask is not None: + batch.aspect_ratio_mask = None + htorch.core.mark_step() start_decode = time.time_ns() @@ -1177,7 +1436,7 @@ class VlmCausalLM(Model): return generations, batch if not stopped else None, (forward_ns, decode_ns) def batch_from_pb(self, batch, is_warmup): - return VlmCausalLMBatch.from_pb_processor( + return self.batch_type.from_pb_processor( batch, self.tokenizer, self.processor, @@ -1197,22 +1456,25 @@ class VlmCausalLM(Model): return self.batch_from_pb(batch, is_warmup) - def warmup(self, request) -> None: - is_warmup = True - batch = self.batch_from_pb(request.batch, is_warmup) + def warmup( + self, request: generate_pb2.WarmupRequest + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: + global MAX_TOTAL_TOKENS + MAX_TOTAL_TOKENS = request.max_total_tokens + batch = self.batch_from_pb(request.batch, is_warmup=True) + max_input_tokens = request.max_input_tokens + max_prefill_batch_size = batch.input_ids.shape[0] try: # max prefill batch size warmup - _, prefill_batch, _ = self.generate_token([batch], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) except Exception: raise RuntimeError( f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. 
" f"You need to decrease `--max-batch-prefill-tokens`" ) - global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST - max_input_length = batch.input_ids.shape[1] - max_prefill_batch_size = batch.input_ids.shape[0] + global BASE_IMAGE_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST PREFILL_WARMUP_BATCH_SIZE_LIST = [] batch_size = 1 while batch_size <= max_prefill_batch_size: @@ -1221,15 +1483,19 @@ class VlmCausalLM(Model): if PREFILL_WARMUP_BATCH_SIZE_LIST[-1] < max_prefill_batch_size: PREFILL_WARMUP_BATCH_SIZE_LIST.append(max_prefill_batch_size) - seq_len = BASE_IMAGE_TOKENS + if self.model.config.model_type == "mllama": + seq_len = PAD_SEQUENCE_TO_MULTIPLE_OF + else: + seq_len = BASE_IMAGE_TOKENS + PREFILL_WARMUP_SEQLEN_LIST = [] i = 0 - while seq_len <= max_input_length: + while seq_len <= max_input_tokens: PREFILL_WARMUP_SEQLEN_LIST.append(seq_len) seq_len += PAD_SEQUENCE_TO_MULTIPLE_OF * (2**i) i += 1 - if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_length: - PREFILL_WARMUP_SEQLEN_LIST.append(max_input_length) + if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_tokens: + PREFILL_WARMUP_SEQLEN_LIST.append(max_input_tokens) # Prefill and decode warmup DECODE_WARMUP_BATCH_SIZE_LIST = [] @@ -1239,10 +1505,13 @@ class VlmCausalLM(Model): for batch_size in PREFILL_WARMUP_BATCH_SIZE_LIST: for seq_len in PREFILL_WARMUP_SEQLEN_LIST: batch = self.generate_warmup_batch( - request, seq_len, batch_size, is_warmup + request, seq_len, batch_size, is_warmup=True + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) + assert prefill_batch is not None + _, decode_batch, _ = self.generate_token( + [prefill_batch], is_warmup=True ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - _, decode_batch, _ = self.generate_token([prefill_batch], is_warmup) DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) @@ -1264,7 +1533,7 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats} " ) - max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + max_decode_batch_size = MAX_BATCH_SIZE batch_size = max_prefill_batch_size * 2 # Decode warmup with bigger batch_size try: @@ -1273,51 +1542,47 @@ class VlmCausalLM(Model): and batch_size <= max_decode_batch_size ): batches = [] - for i in range(int(batch_size / max_prefill_batch_size)): - batch = self.generate_warmup_batch( - request, - PREFILL_WARMUP_SEQLEN_LIST[0], - DECODE_WARMUP_BATCH_SIZE_LIST[-1], - is_warmup, - ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - batches.append(prefill_batch) while batch_size <= max_decode_batch_size: - _, decode_batch, _ = self.generate_token(batches, is_warmup) + for i in range(int(batch_size / max_prefill_batch_size)): + batch = self.generate_warmup_batch( + request, + PREFILL_WARMUP_SEQLEN_LIST[0] - 1, + max_prefill_batch_size, + is_warmup=False, + ) + _, prefill_batch, _ = self.generate_token( + [batch], is_warmup=True + ) + batches.append(prefill_batch) + + _, decode_batch, _ = self.generate_token(batches, is_warmup=True) DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) batch_size = batch_size * 2 batches.clear() - for i in range(int(batch_size / max_prefill_batch_size)): - batch = self.generate_warmup_batch( - request, - PREFILL_WARMUP_SEQLEN_LIST[0], - DECODE_WARMUP_BATCH_SIZE_LIST[-1], - is_warmup, - ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - batches.append(prefill_batch) - - 
batches.clear() if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size: max_decode_batch_size = math.floor(max_decode_batch_size / 2) * 2 batch_size = max_decode_batch_size for i in range(int(max_decode_batch_size / 2)): batch = self.generate_warmup_batch( - request, PREFILL_WARMUP_SEQLEN_LIST[0], 2, is_warmup + request, + PREFILL_WARMUP_SEQLEN_LIST[0] - 1, + 2, + is_warmup=False, + ) + _, prefill_batch, _ = self.generate_token( + [batch], is_warmup=True ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) batches.append(prefill_batch) - _, decode_batch, _ = self.generate_token(batches, is_warmup) + _, decode_batch, _ = self.generate_token(batches, is_warmup=True) DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) - max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS - MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens + except Exception: raise RuntimeError( f"Not enough memory to handle batch_size({batch_size}) decode warmup." f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" f"max_decode_batch_size is {max_decode_batch_size}" - f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + f"You need to decrease env `MAX_BATCH_SIZE` or '--max_batch_size'" ) mem_stats = get_hpu_memory_stats(self.device) @@ -1327,4 +1592,8 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats}" ) - return MAX_BATCH_TOTAL_TOKENS + max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS + max_input_tokens = max_input_tokens + max_total_tokens = MAX_TOTAL_TOKENS + + return max_supported_total_tokens, max_input_tokens, max_total_tokens diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile index 6c5002ce..06674971 100644 --- a/backends/neuron/Makefile +++ b/backends/neuron/Makefile @@ -25,6 +25,7 @@ image: --ulimit nofile=100000:100000 \ --build-arg VERSION=$(VERSION) \ -t text-generation-inference:$(VERSION)-neuron ${root_dir} + docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron install_server: make -C ${mkfile_dir}/server install VERSION:=${VERSION} diff --git a/docs/openapi.json b/docs/openapi.json index e1ce234e..85ca3f97 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.1.2-dev0" + "version": "3.2.1-dev0" }, "paths": { "/": { @@ -2148,9 +2148,6 @@ }, "StreamOptions": { "type": "object", - "required": [ - "include_usage" - ], "properties": { "include_usage": { "type": "boolean", diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 37b57d6f..4c6f0151 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -52,6 +52,8 @@ - sections: - local: backends/neuron title: Neuron + - local: backends/gaudi + title: Gaudi - local: backends/trtllm title: TensorRT-LLM - local: backends/llamacpp diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx new file mode 100644 index 00000000..7e0e69ab --- /dev/null +++ b/docs/source/backends/gaudi.mdx @@ -0,0 +1,317 @@ +# Gaudi Backend for Text Generation Inference + +## Overview +Text Generation Inference (TGI) has been optimized to run on Gaudi hardware via the Gaudi backend for TGI. 
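+
+Before launching the server, you can check that your Gaudi devices are visible to the system. As a quick sanity check (a sketch assuming the Habana driver and tools are installed on the host), run:
+
+```bash
+# List Gaudi devices and their utilization (Habana's analogue of nvidia-smi)
+hl-smi
+```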
+ +## Supported Hardware +- **Gaudi1**: Available on [AWS EC2 DL1 instances](https://aws.amazon.com/ec2/instance-types/dl1/) +- **Gaudi2**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) +- **Gaudi3**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) + +## Tutorial: Getting Started with TGI on Gaudi + +### Basic Usage +The easiest way to run TGI on Gaudi is to use the official Docker image: + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_HF_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + --model-id $model +``` + +Once you see the `connected` log, the server is ready to accept requests: +> 2024-05-22T19:31:48.302239Z INFO text_generation_router: router/src/main.rs:378: Connected + +You can find your `YOUR_HF_ACCESS_TOKEN` at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). This is necessary to access gated models like llama3.1. + +### Making Your First Request +You can send a request from a separate terminal: + +```bash +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' +``` + +## How-to Guides + +### How to Run Specific Models + +The following models have been validated on Gaudi2: + +| Model | Model ID | BF16 | | FP8 | | +|-----------------------|----------------------------------------|-------------|------------|-------------|------------| +| | | Single Card | Multi-Card | Single Card | Multi-Card | +| Llama2-7B | meta-llama/Llama-2-7b-chat-hf | ✔ | ✔ | ✔ | ✔ | +| Llama2-70B | meta-llama/Llama-2-70b-chat-hf | | ✔ | | ✔ | +| Llama3-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3-70B | meta-llama/Meta-Llama-3-70B-Instruct | | ✔ | | ✔ | +| Llama3.1-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3.1-70B | meta-llama/Meta-Llama-3.1-70B-Instruct | | ✔ | | ✔ | +| CodeLlama-13B | codellama/CodeLlama-13b-hf | ✔ | ✔ | ✔ | ✔ | +| Mixtral-8x7B | mistralai/Mixtral-8x7B-Instruct-v0.1 | ✔ | ✔ | ✔ | ✔ | +| Mistral-7B | mistralai/Mistral-7B-Instruct-v0.3 | ✔ | ✔ | ✔ | ✔ | +| Falcon-180B | tiiuae/falcon-180B-chat | | ✔ | | ✔ | +| Qwen2-72B | Qwen/Qwen2-72B-Instruct | | ✔ | | ✔ | +| Starcoder2-3b | bigcode/starcoder2-3b | ✔ | ✔ | ✔ | | +| Starcoder2-15b | bigcode/starcoder2-15b | ✔ | ✔ | ✔ | | +| Starcoder | bigcode/starcoder | ✔ | ✔ | ✔ | ✔ | +| Gemma-7b | google/gemma-7b-it | ✔ | ✔ | ✔ | ✔ | +| Llava-v1.6-Mistral-7B | llava-hf/llava-v1.6-mistral-7b-hf | ✔ | ✔ | ✔ | ✔ | + +To run any of these models: + +```bash +model=MODEL_ID_THAT_YOU_WANT_TO_RUN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + --model-id $model + +``` + +For the full list of service parameters, refer to the [launcher-arguments page](https://huggingface.co/docs/text-generation-inference/reference/launcher). 
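+
+Once the server is up, you can also use the OpenAI-compatible `/v1/chat/completions` route exposed by the TGI router. A minimal sketch (assuming the server started above is listening on port 8080):
+
+```bash
+curl 127.0.0.1:8080/v1/chat/completions \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{"model":"tgi","messages":[{"role":"user","content":"What is Deep Learning?"}],"max_tokens":32}'
+```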
+
+The validated docker commands can be found in the [examples/docker_commands folder](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/docker_commands).
+
+> Note: `--runtime=habana --cap-add=sys_nice --ipc=host` is required to enable Docker to use the Gaudi hardware (more details [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html)).
+
+### How to Enable Multi-Card Inference (Sharding)
+
+TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards.
+
+For example, on a machine with 8 Gaudi cards, you can run:
+
+```bash
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+  tgi-gaudi \
+  --model-id $model --sharded true --num-shard 8
+```
+
+We recommend always using sharding when running on a multi-card machine.
+
+### How to Use Different Precision Formats
+
+#### BF16 Precision (Default)
+By default, all models run with BF16 precision on Gaudi hardware.
+
+#### FP8 Precision
+TGI-Gaudi supports FP8 precision inference with [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html).
+
+To run FP8 inference:
+
+1. Measure statistics using the [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8)
+2. Run the model in TGI with the `QUANT_CONFIG` setting, e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.
+
+The following command example for FP8 inference assumes that measurement has been completed in the first step above.
+
+Example for Llama3.1-70B on 8 cards with FP8 precision:
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### How to Run Vision-Language Models (VLMs)
+
+Gaudi supports VLM inference.
+
+Example for Llava-v1.6-Mistral-7B on 1 card:
+
+Start the TGI server via the following command:
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e PREFILL_BATCH_BUCKET_SIZE=1 \
+   -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
+
+You can then send a request to the server via the following command:
+```bash
+curl -N 127.0.0.1:8080/generate \
+   -X POST \
+   -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":32}}' \
+   -H 'Content-Type: application/json'
+```
+
+> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image; otherwise, the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input lengths from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`, i.e. 16384 / 4096 = 4 here.
+
+### How to Benchmark Performance
+
+We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.
+
+This benchmark tool simulates user requests and measures the performance of the model in realistic scenarios.
+
+To run it on the same machine, you can do the following:
+```bash
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+HF_TOKEN=<your HF token>
+# run a benchmark to evaluate the performance of the model for chat use case
+# we mount results to the current directory
+docker run \
+        --rm \
+        -it \
+        --net host \
+        -v $(pwd):/opt/inference-benchmarker/results \
+        -e "HF_TOKEN=$HF_TOKEN" \
+        ghcr.io/huggingface/inference-benchmarker:latest \
+        inference-benchmarker \
+        --tokenizer-name "$MODEL" \
+        --url http://localhost:8080 \
+        --profile chat
+```
+
+Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.
+
+### How to Profile Performance
+
+To collect performance profiling, you need to set the following environment variables:
+
+| Name               | Value(s)   | Default          | Description                                           |
+|--------------------| :--------- | :--------------- | :---------------------------------------------------- |
+| PROF_WAITSTEP      | integer    | 0                | Control profile wait steps                             |
+| PROF_WARMUPSTEP    | integer    | 0                | Control profile warmup steps                           |
+| PROF_STEP          | integer    | 0                | Enable/disable profile; control profile active steps   |
+| PROF_PATH          | string     | /tmp/hpu_profile | Define profile folder                                  |
+| PROF_RANKS         | string     | 0                | Comma-separated list of ranks to profile               |
+| PROF_RECORD_SHAPES | True/False | False            | Control the record_shapes option in the profiler       |
+
+To use these environment variables, add them to your docker run command with the `-e` flag. For example:
+
+```bash
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+  -e PROF_WAITSTEP=10 \
+  -e PROF_WARMUPSTEP=10 \
+  -e PROF_STEP=1 \
+  -e PROF_PATH=/tmp/hpu_profile \
+  -e PROF_RANKS=0 \
+  -e PROF_RECORD_SHAPES=True \
+  ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
+  --model-id $model
+```
+
+## Explanation: Understanding TGI on Gaudi
+
+### The Warmup Process
+
+To ensure optimal performance, warmup is performed at the beginning of each server run. This process creates queries with various input shapes based on the provided parameters and runs basic TGI operations (prefill, decode, concatenate).
+
+Note: Model warmup can take several minutes, especially for FP8 inference. For faster subsequent runs, refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy).
+
+### Understanding Parameter Tuning
+
+#### Sequence Length Parameters
+- `--max-input-tokens` is the maximum possible input prompt length. Default value is `4095`.
+- `--max-total-tokens` is the maximum possible total length of the sequence (input and output). Default value is `4096`.
+
+#### Batch Size Parameters
+- For the prefill operation, set `--max-batch-prefill-tokens` to `bs * max-input-tokens`, where `bs` is your expected maximum prefill batch size. For example, with `--max-input-tokens 1024` and an expected prefill batch size of 4, set `--max-batch-prefill-tokens 4096`.
+- For the decode operation, set `--max-batch-size` to `bs`, where `bs` is your expected maximum decode batch size.
+- Note that batch sizes are always padded to the nearest multiple of `PREFILL_BATCH_BUCKET_SIZE` (prefill) and `BATCH_BUCKET_SIZE` (decode).
+
+#### Performance and Memory Parameters
+- `PAD_SEQUENCE_TO_MULTIPLE_OF` determines the sizes of the input length buckets. Since warmup creates several graphs for each bucket, adjust this value proportionally to the input sequence length; otherwise, out-of-memory issues can occur.
+- `ENABLE_HPU_GRAPH` enables HPU graph usage, which is crucial for performance. The recommended value is `true`.
+
+## Reference
+
+This section contains reference information about the Gaudi backend.
+
+### Environment Variables
+
+The following table contains the environment variables that can be used to configure the Gaudi backend:
+
+| Name                        | Value(s)   | Default          | Description                                                                                                                        | Usage                        |
+|-----------------------------| :--------- | :--------------- | :--------------------------------------------------------------------------------------------------------------------------------| :--------------------------- |
+| ENABLE_HPU_GRAPH            | True/False | True             | Enable HPU graphs or not                                                                                                           | add -e in docker run command |
+| LIMIT_HPU_GRAPH             | True/False | True             | Skip HPU graph usage for prefill to save memory; set to `True` for large sequence/decoding lengths (e.g. 300/212)                  | add -e in docker run command |
+| BATCH_BUCKET_SIZE           | integer    | 8                | Batch size for the decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
+| PREFILL_BATCH_BUCKET_SIZE   | integer    | 4                | Batch size for the prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command |
+| PAD_SEQUENCE_TO_MULTIPLE_OF | integer    | 128              | For the prefill operation, sequences will be padded to a multiple of the provided value.                                           | add -e in docker run command |
+| SKIP_TOKENIZER_IN_TGI       | True/False | False            | Skip the tokenizer for input/output processing                                                                                     | add -e in docker run command |
+| WARMUP_ENABLED              | True/False | True             | Enable warmup during server initialization to recompile all graphs. This can increase TGI setup time.                              | add -e in docker run command |
+| QUEUE_THRESHOLD_MS          | integer    | 120              | Controls the threshold beyond which requests are considered overdue and handled with priority. Shorter requests are prioritized otherwise. | add -e in docker run command |
+| USE_FLASH_ATTENTION         | True/False | True             | Whether to enable Habana Flash Attention, provided that the model supports it. Please refer to https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa | add -e in docker run command |
+| FLASH_ATTENTION_RECOMPUTE   | True/False | True             | Whether to enable Habana Flash Attention in recompute mode on first token generation.                                              | add -e in docker run command |
+
+## Contributing
+
+Contributions to the TGI-Gaudi project are welcome. Please refer to the [contributing guide](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md).
+
+**Guidelines for contributing to Gaudi on TGI:** All changes should be made within the `backends/gaudi` folder. In general, you should avoid modifying the router, launcher, or benchmark to accommodate Gaudi hardware, as all Gaudi-specific logic should be contained within the `backends/gaudi` folder.
+
+### Building the Docker Image from Source
+
+To build the Docker image from source:
+
+```bash
+make -C backends/gaudi image
+```
+
+This builds the image and saves it as `tgi-gaudi`. You can then run TGI-Gaudi with this image:
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+volume=$PWD/data
+hf_token=YOUR_ACCESS_TOKEN
+
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
+  tgi-gaudi \
+  --model-id $model
+```
+
+For more details, see the [README of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/README.md) and the [Makefile of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/Makefile).
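+
+As a quick end-to-end check of the locally built image (a sketch assuming the `tgi-gaudi` container above is running and listening on port 8080), you can stream tokens from TGI's `/generate_stream` route:
+
+```bash
+# -N disables curl buffering so server-sent events are printed as they arrive
+curl -N 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \
+    -H 'Content-Type: application/json'
+```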
diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index e7ba8873..f3a5dac7 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.1.1-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.2.1-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 2cbd7e06..29ced4f8 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 77a64a88..4288b0a2 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. 
You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 20ef26a8..4242cffb 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \ + ghcr.io/huggingface/text-generation-inference:3.2.1-rocm \ --model-id $model ``` diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md index 1ddf2b47..51aa667d 100644 --- a/docs/source/installation_gaudi.md +++ b/docs/source/installation_gaudi.md @@ -1,3 +1,3 @@ # Using TGI with Intel Gaudi -Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index). +You can use TGI on Intel Gaudi using the [TGI gaudi backend](https://huggingface.co/docs/text-generation-inference/backends/gaudi). diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index a0bf11d1..c0b40e02 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.2.1-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.2.1-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 3b20c7e1..31fba9d6 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index be905102..9ed60efd 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume 
with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help +docker run ghcr.io/huggingface/text-generation-inference:3.2.1 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index bc4029e4..e563a9b1 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.1"), env=hub, role=role, ) diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 1f804d5a..f168fd76 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -14,6 +14,8 @@ Text Generation Inference enables serving optimized models. The following sectio - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) +- [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) +- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) - [Dbrx](https://huggingface.co/databricks/dbrx-instruct) - [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) diff --git a/flake.lock b/flake.lock index 719cdeea..d049a71d 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1740049068, - "narHash": "sha256-heYzYOt+TSnRKHIV24s74yEjLkTbBfjNCWHdQEX++eI=", + "lastModified": 1741617161, + "narHash": "sha256-cwKYAsIVSLtoLbG48+oi3NkSrvuZRLYs8lkJmpDsTw0=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "143e8451efa22b120f97e6698508e9a0aed82769", + "rev": "5946021ec6cb6aae18158a9dc27f893cfbab2925", "type": "github" }, "original": { "owner": "huggingface", - "ref": "hub-rotary", + "ref": "kernels-0.2.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 5058667a..7ddd3b92 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/hub-rotary"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/kernels-0.2.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { @@ -176,11 +176,15 @@ ''; }; - dockerImage = pkgs.callPackage nix/docker.nix { + # Use plain nixpkgs without overlays for dockerTools. dockerTools + # uses a Python package for computing the layers from the transitive + # closure. However, this needs a lot of rebuilds due to our overlay. 
+ + dockerImage = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix { text-generation-inference = default; }; - dockerImageStreamed = pkgs.callPackage nix/docker.nix { + dockerImageStreamed = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix { text-generation-inference = default; stream = true; }; diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json new file mode 100644 index 00000000..5c6b4cb9 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json @@ -0,0 +1,109 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 16, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 506, + "logprob": -1.3984375, + "special": false, + "text": " the" + }, + { + "id": 1331, + "logprob": -1.6953125, + "special": false, + "text": " people" + }, + { + "id": 236764, + "logprob": -0.23535156, + "special": false, + "text": "," + }, + { + "id": 532, + "logprob": -0.24316406, + "special": false, + "text": " and" + }, + { + "id": 506, + "logprob": -0.12109375, + "special": false, + "text": " the" + }, + { + "id": 2780, + "logprob": -1.1640625, + "special": false, + "text": " food" + }, + { + "id": 236761, + "logprob": -0.21386719, + "special": false, + "text": "." + }, + { + "id": 108, + "logprob": -0.64453125, + "special": false, + "text": "\n\n" + }, + { + "id": 2094, + "logprob": -0.77734375, + "special": false, + "text": "This" + }, + { + "id": 563, + "logprob": -0.040283203, + "special": false, + "text": " is" + }, + { + "id": 496, + "logprob": -0.03125, + "special": false, + "text": " a" + }, + { + "id": 6290, + "logprob": -0.03515625, + "special": false, + "text": " nice" + }, + { + "id": 1977, + "logprob": -0.0020751953, + "special": false, + "text": " place" + }, + { + "id": 236761, + "logprob": -0.0079956055, + "special": false, + "text": "." + }, + { + "id": 107, + "logprob": -0.9921875, + "special": false, + "text": "\n" + }, + { + "id": 106, + "logprob": -0.45507812, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": " the people, and the food.\n\nThis is a nice place.\n" +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json new file mode 100644 index 00000000..859544c8 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json @@ -0,0 +1,613 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 100, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 1331, + "logprob": -0.34960938, + "special": false, + "text": " people" + }, + { + "id": 8390, + "logprob": -0.14746094, + "special": false, + "text": " died" + }, + { + "id": 528, + "logprob": -1.2265625, + "special": false, + "text": " in" + }, + { + "id": 506, + "logprob": -0.47070312, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.5859375, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": -0.0027770996, + "special": false, + "text": " States" + }, + { + "id": 236761, + "logprob": -0.34765625, + "special": false, + "text": "." 
+ }, + { + "id": 108, + "logprob": -0.0859375, + "special": false, + "text": "\n\n" + }, + { + "id": 818, + "logprob": -1.1640625, + "special": false, + "text": "The" + }, + { + "id": 6816, + "logprob": -1.890625, + "special": false, + "text": " generally" + }, + { + "id": 10951, + "logprob": -0.14648438, + "special": false, + "text": " accepted" + }, + { + "id": 10967, + "logprob": -0.90625, + "special": false, + "text": " estimate" + }, + { + "id": 563, + "logprob": -0.49414062, + "special": false, + "text": " is" + }, + { + "id": 600, + "logprob": -0.65234375, + "special": false, + "text": " that" + }, + { + "id": 236743, + "logprob": -1.2109375, + "special": false, + "text": " " + }, + { + "id": 236825, + "logprob": -0.00088119507, + "special": false, + "text": "6" + }, + { + "id": 236832, + "logprob": -6.580353e-05, + "special": false, + "text": "7" + }, + { + "id": 236810, + "logprob": -5.2690506e-05, + "special": false, + "text": "5" + }, + { + "id": 236764, + "logprob": -0.0001745224, + "special": false, + "text": "," + }, + { + "id": 236771, + "logprob": -1.180172e-05, + "special": false, + "text": "0" + }, + { + "id": 236771, + "logprob": -1.7881393e-06, + "special": false, + "text": "0" + }, + { + "id": 236771, + "logprob": 0.0, + "special": false, + "text": "0" + }, + { + "id": 1331, + "logprob": -0.44921875, + "special": false, + "text": " people" + }, + { + "id": 8390, + "logprob": -0.011474609, + "special": false, + "text": " died" + }, + { + "id": 528, + "logprob": -0.084472656, + "special": false, + "text": " in" + }, + { + "id": 506, + "logprob": -0.00034713745, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.028564453, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": -0.00012207031, + "special": false, + "text": " States" + }, + { + "id": 236761, + "logprob": -1.15625, + "special": false, + "text": "." + }, + { + "id": 3153, + "logprob": -0.103027344, + "special": false, + "text": " However" + }, + { + "id": 236764, + "logprob": -0.009155273, + "special": false, + "text": "," + }, + { + "id": 1070, + "logprob": -0.92578125, + "special": false, + "text": " some" + }, + { + "id": 61806, + "logprob": -0.91796875, + "special": false, + "text": " historians" + }, + { + "id": 4646, + "logprob": -1.3828125, + "special": false, + "text": " believe" + }, + { + "id": 506, + "logprob": -0.65234375, + "special": false, + "text": " the" + }, + { + "id": 5396, + "logprob": -0.8046875, + "special": false, + "text": " actual" + }, + { + "id": 1548, + "logprob": -0.04321289, + "special": false, + "text": " number" + }, + { + "id": 1451, + "logprob": -0.66015625, + "special": false, + "text": " could" + }, + { + "id": 577, + "logprob": -0.091308594, + "special": false, + "text": " be" + }, + { + "id": 618, + "logprob": -0.57421875, + "special": false, + "text": " as" + }, + { + "id": 1494, + "logprob": -0.00036239624, + "special": false, + "text": " high" + }, + { + "id": 618, + "logprob": -0.0001335144, + "special": false, + "text": " as" + }, + { + "id": 236743, + "logprob": -0.0009689331, + "special": false, + "text": " " + }, + { + "id": 236770, + "logprob": -0.26367188, + "special": false, + "text": "1" + }, + { + "id": 236771, + "logprob": -0.17773438, + "special": false, + "text": "0" + }, + { + "id": 3625, + "logprob": -0.012084961, + "special": false, + "text": " million" + }, + { + "id": 236761, + "logprob": -0.21289062, + "special": false, + "text": "." 
+ }, + { + "id": 108, + "logprob": -0.37304688, + "special": false, + "text": "\n\n" + }, + { + "id": 236777, + "logprob": -1.078125, + "special": false, + "text": "I" + }, + { + "id": 1006, + "logprob": -1.3203125, + "special": false, + "text": " am" + }, + { + "id": 3182, + "logprob": -1.078125, + "special": false, + "text": " looking" + }, + { + "id": 573, + "logprob": -0.035888672, + "special": false, + "text": " for" + }, + { + "id": 919, + "logprob": -1.25, + "special": false, + "text": " more" + }, + { + "id": 1938, + "logprob": -1.2421875, + "special": false, + "text": " information" + }, + { + "id": 580, + "logprob": -0.7734375, + "special": false, + "text": " on" + }, + { + "id": 672, + "logprob": -0.73046875, + "special": false, + "text": " this" + }, + { + "id": 59725, + "logprob": -0.75, + "special": false, + "text": " discrepancy" + }, + { + "id": 532, + "logprob": -0.83984375, + "special": false, + "text": " and" + }, + { + "id": 506, + "logprob": -0.7109375, + "special": false, + "text": " the" + }, + { + "id": 5872, + "logprob": -1.2734375, + "special": false, + "text": " factors" + }, + { + "id": 600, + "logprob": -0.22851562, + "special": false, + "text": " that" + }, + { + "id": 19263, + "logprob": -1.1640625, + "special": false, + "text": " contributed" + }, + { + "id": 531, + "logprob": -0.0010757446, + "special": false, + "text": " to" + }, + { + "id": 506, + "logprob": -0.18945312, + "special": false, + "text": " the" + }, + { + "id": 5777, + "logprob": -1.2734375, + "special": false, + "text": " wide" + }, + { + "id": 2644, + "logprob": -0.01940918, + "special": false, + "text": " range" + }, + { + "id": 529, + "logprob": -0.14550781, + "special": false, + "text": " of" + }, + { + "id": 14287, + "logprob": -0.032470703, + "special": false, + "text": " estimates" + }, + { + "id": 236761, + "logprob": -0.010375977, + "special": false, + "text": "." 
+ }, + { + "id": 108, + "logprob": -0.06591797, + "special": false, + "text": "\n\n" + }, + { + "id": 8291, + "logprob": -0.8046875, + "special": false, + "text": "Here" + }, + { + "id": 236789, + "logprob": -0.23828125, + "special": false, + "text": "'" + }, + { + "id": 236751, + "logprob": -1.0728836e-06, + "special": false, + "text": "s" + }, + { + "id": 496, + "logprob": -0.17480469, + "special": false, + "text": " a" + }, + { + "id": 25890, + "logprob": -0.087402344, + "special": false, + "text": " breakdown" + }, + { + "id": 529, + "logprob": -0.0021209717, + "special": false, + "text": " of" + }, + { + "id": 506, + "logprob": -0.19140625, + "special": false, + "text": " the" + }, + { + "id": 5872, + "logprob": -1.0078125, + "special": false, + "text": " factors" + }, + { + "id": 20894, + "logprob": -0.26367188, + "special": false, + "text": " contributing" + }, + { + "id": 531, + "logprob": -9.250641e-05, + "special": false, + "text": " to" + }, + { + "id": 506, + "logprob": -0.008666992, + "special": false, + "text": " the" + }, + { + "id": 5777, + "logprob": -0.6171875, + "special": false, + "text": " wide" + }, + { + "id": 2644, + "logprob": -0.0023956299, + "special": false, + "text": " range" + }, + { + "id": 529, + "logprob": -0.016723633, + "special": false, + "text": " of" + }, + { + "id": 14287, + "logprob": -0.011352539, + "special": false, + "text": " estimates" + }, + { + "id": 573, + "logprob": -0.30664062, + "special": false, + "text": " for" + }, + { + "id": 506, + "logprob": -0.21386719, + "special": false, + "text": " the" + }, + { + "id": 236743, + "logprob": -0.35351562, + "special": false, + "text": " " + }, + { + "id": 236770, + "logprob": -3.5762787e-07, + "special": false, + "text": "1" + }, + { + "id": 236819, + "logprob": 0.0, + "special": false, + "text": "9" + }, + { + "id": 236770, + "logprob": 0.0, + "special": false, + "text": "1" + }, + { + "id": 236828, + "logprob": 0.0, + "special": false, + "text": "8" + }, + { + "id": 7745, + "logprob": -0.70703125, + "special": false, + "text": " flu" + }, + { + "id": 10248, + "logprob": -0.015258789, + "special": false, + "text": " pandemic" + }, + { + "id": 4355, + "logprob": -0.83203125, + "special": false, + "text": " death" + }, + { + "id": 25363, + "logprob": -7.43866e-05, + "special": false, + "text": " toll" + }, + { + "id": 528, + "logprob": -0.08496094, + "special": false, + "text": " in" + }, + { + "id": 506, + "logprob": -6.67572e-06, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.0059509277, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": 0.0, + "special": false, + "text": " States" + } + ], + "top_tokens": null + }, + "generated_text": " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. 
However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States" +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json new file mode 100644 index 00000000..ae67e006 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. \n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965894, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 74, + "prompt_tokens": 277, + "total_tokens": 351 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json new file mode 100644 index 00000000..afbfba30 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965892, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 98, + "prompt_tokens": 277, + "total_tokens": 375 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json new file mode 100644 index 00000000..1b97d261 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. 
\n\nDo you want me to describe any specific element of the image in more detail?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741966313, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 67, + "prompt_tokens": 277, + "total_tokens": 344 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json new file mode 100644 index 00000000..cd786b3c --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741964480, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 74, + "prompt_tokens": 275, + "total_tokens": 349 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json new file mode 100644 index 00000000..5ed2c450 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. 
\n\nIf you'd like, you can send me another image and I’ll do my best to identify it!", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741964477, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 75, + "prompt_tokens": 279, + "total_tokens": 354 + } +} diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json index 06cf038a..d9742497 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263682, + "created": 1741372434, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json index 0152ea70..1c3a5db3 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263684, + "created": 1741372657, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json index 8dab9a5b..f5a2e955 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json @@ -8,10 +8,10 @@ "tool_calls": [ { "function": { - "arguments": "{\"", - "name": null + "arguments": "{", + "name": "get_current_weather" }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -22,187 +22,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - 
"system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "name", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -221,7 +41,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -232,157 +52,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_current", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": 
"meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -401,7 +71,7 @@ "arguments": "location", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -412,7 +82,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -431,7 +101,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -442,7 +112,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -461,7 +131,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -472,7 +142,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -488,10 +158,10 @@ "tool_calls": [ { "function": { - "arguments": "Paris", + "arguments": "Bro", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -502,7 +172,37 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "oklyn", + "name": null + }, + "id": "0", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -521,7 +221,7 @@ "arguments": ",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -532,7 +232,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -548,10 +248,10 @@ 
"tool_calls": [ { "function": { - "arguments": " France", + "arguments": " NY", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -562,7 +262,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -581,7 +281,7 @@ "arguments": "\",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -592,7 +292,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -611,7 +311,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -622,7 +322,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -641,7 +341,7 @@ "arguments": "format", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -652,7 +352,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -671,7 +371,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -682,7 +382,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -701,7 +401,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -712,7 +412,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -728,10 +428,10 @@ "tool_calls": [ { "function": { - "arguments": "c", + "arguments": "f", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -742,7 +442,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -758,10 +458,10 @@ "tool_calls": [ { "function": { - "arguments": "elsius", + "arguments": "ahrenheit", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -772,7 +472,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -788,10 +488,10 @@ "tool_calls": [ { "function": { - "arguments": "\"}}", + "arguments": "\"}", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -802,37 +502,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json 
b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json index 797c9578..6d841747 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json @@ -5,20 +5,20 @@ "index": 0, "logprobs": null, "message": { - "content": "I am a helpful assistant!", + "content": "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI", "role": "assistant", "tool_calls": null } } ], - "created": 1741263686, + "created": 1741693957, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 23, - "prompt_tokens": 494, - "total_tokens": 517 + "completion_tokens": 12, + "prompt_tokens": 53, + "total_tokens": 65 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index b1d4fb87..47f23f4c 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -12,7 +12,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -23,7 +23,7 @@ "choices": [ { "delta": { - "content": " am", + "content": "'m", "role": "assistant", "tool_calls": null }, @@ -32,7 +32,127 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " an", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " artificial", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " intelligence", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " model", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": 
"3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " known", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " as", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -52,7 +172,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -63,7 +183,7 @@ "choices": [ { "delta": { - "content": " helpful", + "content": " large", "role": "assistant", "tool_calls": null }, @@ -72,7 +192,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +203,7 @@ "choices": [ { "delta": { - "content": " assistant", + "content": " language", "role": "assistant", "tool_calls": null }, @@ -92,7 +212,187 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " model", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " (", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "LL", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "M", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ")", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " or", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + 
"created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " convers", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ational", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " AI", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "length", + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json index 3b22d83e..436c2431 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263680, + "created": 1741372335, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json index c8fc50a2..f21aa4bb 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json @@ -10,10 +10,10 @@ "tool_calls": [ { "function": { - "arguments": "{\"", - "name": null + "arguments": "{", + "name": "get_current_weather" }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -24,205 +24,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ 
- { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "name", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -244,7 +46,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -255,172 +57,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - 
"content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_current", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -442,7 +79,7 @@ "arguments": "location", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -453,7 +90,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -475,7 +112,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -486,7 +123,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -508,7 +145,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -519,7 +156,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -541,7 +178,7 @@ "arguments": "Bro", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -552,7 +189,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -574,7 +211,7 @@ "arguments": "oklyn", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -585,7 +222,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ 
-607,7 +244,7 @@ "arguments": ",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -618,7 +255,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -640,7 +277,7 @@ "arguments": " NY", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -651,7 +288,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -673,7 +310,7 @@ "arguments": "\",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -684,7 +321,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -706,7 +343,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -717,7 +354,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -739,7 +376,7 @@ "arguments": "format", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -750,7 +387,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -772,7 +409,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -783,7 +420,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -805,7 +442,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -816,7 +453,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -838,7 +475,7 @@ "arguments": "f", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -849,7 +486,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -871,7 +508,7 @@ "arguments": "ahrenheit", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -882,7 +519,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -901,10 +538,10 @@ "tool_calls": [ { "function": { - "arguments": "\"}}", + "arguments": "\"}", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -915,40 +552,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": 
"chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json index 4b0f5a07..30f03920 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json @@ -3,7 +3,7 @@ "choices": [ { "delta": { - "content": "There", + "content": "Once", "role": "assistant", "tool_calls": null }, @@ -12,7 +12,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -23,7 +23,7 @@ "choices": [ { "delta": { - "content": " was", + "content": " upon", "role": "assistant", "tool_calls": null }, @@ -32,7 +32,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -52,7 +52,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -63,7 +63,7 @@ "choices": [ { "delta": { - "content": " wise", + "content": " time", "role": "assistant", "tool_calls": null }, @@ -72,7 +72,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +83,7 @@ "choices": [ { "delta": { - "content": " old", + "content": ",", "role": "assistant", "tool_calls": null }, @@ -92,147 +92,7 @@ "logprobs": null } ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " oct", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "opus", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Oracle", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - 
"tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " He", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " lived", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -252,7 +112,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -272,7 +132,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -283,7 +143,7 @@ "choices": [ { "delta": { - "content": " cozy", + "content": " vibrant", "role": "assistant", "tool_calls": null }, @@ -292,887 +152,7 @@ "logprobs": null } ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " little", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " cave", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " beneath", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " waves", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " with", - 
"role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " his", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " best", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " friend", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263688, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " curious", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ah", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "orse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - 
"choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " One", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " day", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " met", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": 
"3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " playful", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " dolphin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Daisy", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " and", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " three", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " became", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - 
"object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " inse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "parable", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " They", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263689, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " spent", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " their", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " days", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " exploring", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1192,7 +172,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1203,7 +183,7 @@ "choices": [ { "delta": { - "content": ",", 
+ "content": " filled", "role": "assistant", "tool_calls": null }, @@ -1212,7 +192,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1223,7 +203,7 @@ "choices": [ { "delta": { - "content": " playing", + "content": " with", "role": "assistant", "tool_calls": null }, @@ -1232,7 +212,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1243,7 +223,7 @@ "choices": [ { "delta": { - "content": " hide", + "content": " coral", "role": "assistant", "tool_calls": null }, @@ -1252,7 +232,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1263,7 +243,7 @@ "choices": [ { "delta": { - "content": "-and", + "content": " reefs", "role": "assistant", "tool_calls": null }, @@ -1272,67 +252,7 @@ "logprobs": null } ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "-se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ek", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1352,7 +272,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1363,7 +283,7 @@ "choices": [ { "delta": { - "content": " learning", + "content": " schools", "role": "assistant", "tool_calls": null }, @@ -1372,67 +292,7 @@ "logprobs": null } ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " about", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - 
}, - { - "choices": [ - { - "delta": { - "content": " wonders", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1452,7 +312,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1463,7 +323,7 @@ "choices": [ { "delta": { - "content": " the", + "content": " shimmer", "role": "assistant", "tool_calls": null }, @@ -1472,7 +332,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1483,7 +343,7 @@ "choices": [ { "delta": { - "content": " sea", + "content": "ing", "role": "assistant", "tool_calls": null }, @@ -1492,7 +352,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1503,7 +363,7 @@ "choices": [ { "delta": { - "content": " from", + "content": " fish", "role": "assistant", "tool_calls": null }, @@ -1512,7 +372,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1523,16 +383,16 @@ "choices": [ { "delta": { - "content": " Oracle", + "content": ",", "role": "assistant", "tool_calls": null }, - "finish_reason": null, + "finish_reason": "length", "index": 0, "logprobs": null } ], - "created": 1741263690, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json index b253d465..fe51488c 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json @@ -1,1232 +1 @@ -[ - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "{\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": 
"function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "n", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "am", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "e", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", 
- "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_n", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_day", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_fore", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "cast", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - 
"finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "location", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "San", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " Francisco", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": ",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " CA", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - 
"usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "format", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "c", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "elsius", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - 
"finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "num", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_days", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " ", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "3", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "}}", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - 
{ - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - } -] +[] diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py new file mode 100644 index 00000000..5064f34d --- /dev/null +++ b/integration-tests/models/test_flash_gemma3.py @@ -0,0 +1,170 @@ +import base64 +from io import BytesIO +from PIL import Image + +import pytest + + +@pytest.fixture(scope="module") +def flash_gemma3_handle(launcher): + with launcher("google/gemma-3-4b-it", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_gemma3(flash_gemma3_handle): + await flash_gemma3_handle.health(300) + return flash_gemma3_handle.client + + +async def test_flash_gemma3(flash_gemma3, response_snapshot): + response = await flash_gemma3.generate( + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + seed=42, + max_new_tokens=100, + ) + + assert ( + response.generated_text + == " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States" + ) + assert response.details.generated_tokens == 100 + assert response == response_snapshot + + +async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "What is the breed of the dog in the image?", + }, + ], + }, + ], + max_tokens=100, + ) + + assert ( + response.choices[0].message.content + == "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. \n\nIf you'd like, you can send me another image and I’ll do my best to identify it!" + ) + assert response.usage["completion_tokens"] == 75 + assert response == response_snapshot + + +async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ], + max_tokens=100, + ) + assert ( + response.choices[0].message.content + == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. 
The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!" + ) + assert response.usage["completion_tokens"] == 74 + assert response == response_snapshot + + +async def test_exceed_window(flash_gemma3, response_snapshot): + response = await flash_gemma3.generate( + "This is a nice place. " * 800 + "I really enjoy the scenery,", + seed=42, + max_new_tokens=20, + ) + + assert ( + response.generated_text + == " the people, and the food.\n\nThis is a nice place.\n" + ) + assert response.details.generated_tokens == 16 + assert response == response_snapshot + + +# Helper function to convert a Pillow image to a base64 data URL +def image_to_data_url(img: Image.Image, fmt: str) -> str: + buffer = BytesIO() + img.save(buffer, format=fmt) + img_data = buffer.getvalue() + b64_str = base64.b64encode(img_data).decode("utf-8") + mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" + return f"data:{mime_type};base64,{b64_str}" + + +async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image with alpha (transparent background) + img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + { + "type": "text", + "text": "What do you see in this transparent image?", + }, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image without alpha (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this plain image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot): + # Create an empty 100x100 JPEG image (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "JPEG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this JPEG image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index ebf69cb7..612fa6bd 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -108,7 +108,7 @@ async def test_flash_llama_grammar_tools_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -142,14 +142,15 @@ async def test_flash_llama_grammar_tools_openai( chunks = [] tool = "" + name = "" for chunk in stream: + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name tool += 
chunk.choices[0].delta.tool_calls[0].function.arguments chunks.append(chunk) - assert ( - tool - == '{"function": {"_name": "get_current_weather", "location": "Brooklyn, NY", "format": "fahrenheit"}}<|eot_id|>' - ) + assert name == "get_current_weather" + assert tool == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}' assert chunks == response_snapshot @@ -184,7 +185,7 @@ async def test_flash_llama_grammar_tools_auto_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -223,7 +224,7 @@ async def test_flash_llama_grammar_tools_choice_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -250,23 +251,24 @@ async def test_flash_llama_grammar_tools_choice_stream( }, { "role": "user", - "content": "What is the weather like in Paris, France?", + "content": "What is the weather like in Brooklyn, New York?", }, ], stream=True, ) - tool_calls_generated = "" + arguments = "" chunks = [] + name = "" for chunk in stream: - tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name + arguments += chunk.choices[0].delta.tool_calls[0].function.arguments assert chunk.choices[0].delta.content is None chunks.append(chunk) - assert ( - tool_calls_generated - == '{"function": {"_name": "get_current_weather", "location": "Paris, France", "format": "celsius"}}<|eot_id|>' - ) + assert name == "get_current_weather" + assert arguments == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}' assert chunks == response_snapshot @@ -277,7 +279,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") response = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -297,9 +299,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( content_generated = response.choices[0].message.content assert response.choices[0].message.tool_calls is None - ######## FIXME before MERGE ############################ - # TODO This is different from the streaming case, this is NOT normal. - assert content_generated == "I am a helpful assistant!" 
+ assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert response == response_snapshot @@ -310,7 +313,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -334,7 +337,11 @@ chunks.append(chunk) assert chunk.choices[0].delta.tool_calls is None - assert content_generated == "I am a helpful assistant" + # This is exactly the same as the non-streaming case + assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert chunks == response_snapshot @@ -345,7 +352,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -371,7 +378,7 @@ assert ( content_generated - == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle" + == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish," ) assert chunks == response_snapshot @@ -401,14 +408,18 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required( ) tool_calls_generated = "" + name = "" chunks = [] for chunk in stream: assert chunk.choices[0].delta.content is None + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments + assert name == "get_n_day_weather_forecast" assert ( tool_calls_generated - == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}}<|eot_id|>' + == '{ "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}' ) assert chunks == response_snapshot @@ -479,12 +490,17 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( ) chunks = [] tool_calls_generated = "" + name = "" for chunk in stream: + assert chunk.choices[0].delta.content is None + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments - chunks.append(chunk) + + assert name == "get_n_day_weather_forecast" assert ( tool_calls_generated - == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "celsius", "num_days": 3}}<|eot_id|>' + == '{ "location": "San Francisco, CA", "format": "celsius", "num_days": 3}' ) assert chunks == response_snapshot diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py index 6a1b4990..f0804356 100644 --- a/integration-tests/neuron/test_generate.py +++
b/integration-tests/neuron/test_generate.py @@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service): max_new_tokens=128, seed=42, ) - sample_expectations = { - "gpt2": "Deep Learning", - "llama": "Deep Learning", - "mistral": "Deep learning", - "qwen2": "Deep Learning", - "granite": "Deep learning", - } - assert sample_expectations[service_name] in response + # With sampling, the response must differ from the greedy one above + assert not response.startswith(greedy_expectations[service_name]) - # Sampling with stop sequence - stop_sequence = sample_expectations[service_name][-5:] + # Sampling with stop sequence (using one of the words returned by the previous request) + stop_sequence = response.split(" ")[-5] response = await tgi_service.client.text_generation( "What is Deep Learning?", do_sample=True, diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 07aa4307..abe8cfee 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "numpy>=2.0", "openai>=1.65", "huggingface_hub>=0.29", + "pillow>=11.1.0", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index a85db4a5..ca2dee93 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,8 +1,8 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml -o requirements.txt -aiohappyeyeballs==2.4.6 +# uv pip compile pyproject.toml +aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.11.12 +aiohttp==3.11.13 # via text-generation aiosignal==1.3.2 # via aiohttp @@ -12,7 +12,7 @@ anyio==4.8.0 # via # httpx # openai -attrs==25.1.0 +attrs==25.3.0 # via aiohttp certifi==2025.1.31 # via @@ -25,13 +25,13 @@ distro==1.9.0 # via openai docker==7.1.0 # via text-generation-integration-tests (pyproject.toml) -filelock==3.17.0 +filelock==3.18.0 # via huggingface-hub frozenlist==1.5.0 # via # aiohttp # aiosignal -fsspec==2025.2.0 +fsspec==2025.3.0 # via huggingface-hub h11==0.14.0 # via httpcore @@ -39,7 +39,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via openai -huggingface-hub==0.29.0 +huggingface-hub==0.29.3 # via # text-generation-integration-tests (pyproject.toml) # text-generation @@ -51,7 +51,7 @@ idna==3.10 # yarl iniconfig==2.0.0 # via pytest -jiter==0.8.2 +jiter==0.9.0 # via openai multidict==6.1.0 # via @@ -59,15 +59,17 @@ multidict==6.1.0 # yarl numpy==2.2.3 # via text-generation-integration-tests (pyproject.toml) -openai==1.65.3 +openai==1.66.3 # via text-generation-integration-tests (pyproject.toml) packaging==24.2 # via # huggingface-hub # pytest +pillow==11.1.0 + # via text-generation-integration-tests (pyproject.toml) pluggy==1.5.0 # via pytest -propcache==0.2.1 +propcache==0.3.0 # via # aiohttp # yarl @@ -78,7 +80,7 @@ pydantic==2.10.6 # text-generation pydantic-core==2.27.2 # via pydantic -pytest==8.3.4 +pytest==8.3.5 # via # text-generation-integration-tests (pyproject.toml) # pytest-asyncio @@ -95,7 +97,7 @@ sniffio==1.3.1 # via # anyio # openai -syrupy==4.8.1 +syrupy==4.9.0 # via text-generation-integration-tests (pyproject.toml) text-generation==0.7.0 # via text-generation-integration-tests (pyproject.toml) diff --git a/integration-tests/uv.lock b/integration-tests/uv.lock index 9f3765b8..bad6aa8f 100644 --- a/integration-tests/uv.lock +++ b/integration-tests/uv.lock @@ -97,6 +97,21 @@ wheels = [ { url =
"https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, ] +[[package]] +name = "anyio" +version = "4.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/73/199a98fc2dae33535d6b8e8e6ec01f8c1d76c9adb096c6b7d64823038cde/anyio-4.8.0.tar.gz", hash = "sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a", size = 181126 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, +] + [[package]] name = "async-timeout" version = "5.0.1" @@ -181,6 +196,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, +] + [[package]] name = "docker" version = "7.1.0" @@ -276,6 +300,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, ] +[[package]] +name = "h11" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, +] + +[[package]] +name = "httpcore" +version = "1.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = 
"sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, +] + [[package]] name = "huggingface-hub" version = "0.29.0" @@ -312,6 +373,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] +[[package]] +name = "jiter" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/c2/e4562507f52f0af7036da125bb699602ead37a2332af0788f8e0a3417f36/jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893", size = 162604 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/82/39f7c9e67b3b0121f02a0b90d433626caa95a565c3d2449fea6bcfa3f5f5/jiter-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:816ec9b60fdfd1fec87da1d7ed46c66c44ffec37ab2ef7de5b147b2fce3fd5ad", size = 314540 }, + { url = "https://files.pythonhosted.org/packages/01/07/7bf6022c5a152fca767cf5c086bb41f7c28f70cf33ad259d023b53c0b858/jiter-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b1d3086f8a3ee0194ecf2008cf81286a5c3e540d977fa038ff23576c023c0ea", size = 321065 }, + { url = "https://files.pythonhosted.org/packages/6c/b2/de3f3446ecba7c48f317568e111cc112613da36c7b29a6de45a1df365556/jiter-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1339f839b91ae30b37c409bf16ccd3dc453e8b8c3ed4bd1d6a567193651a4a51", size = 341664 }, + { url = "https://files.pythonhosted.org/packages/13/cf/6485a4012af5d407689c91296105fcdb080a3538e0658d2abf679619c72f/jiter-0.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ffba79584b3b670fefae66ceb3a28822365d25b7bf811e030609a3d5b876f538", size = 364635 }, + { url = "https://files.pythonhosted.org/packages/0d/f7/4a491c568f005553240b486f8e05c82547340572d5018ef79414b4449327/jiter-0.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cfc7d0a8e899089d11f065e289cb5b2daf3d82fbe028f49b20d7b809193958d", size = 406288 }, + { url = "https://files.pythonhosted.org/packages/d3/ca/f4263ecbce7f5e6bded8f52a9f1a66540b270c300b5c9f5353d163f9ac61/jiter-0.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e00a1a2bbfaaf237e13c3d1592356eab3e9015d7efd59359ac8b51eb56390a12", size = 397499 }, + { url = "https://files.pythonhosted.org/packages/ac/a2/522039e522a10bac2f2194f50e183a49a360d5f63ebf46f6d890ef8aa3f9/jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1d9870561eb26b11448854dce0ff27a9a27cb616b632468cafc938de25e9e51", size = 352926 }, + { url = 
"https://files.pythonhosted.org/packages/b1/67/306a5c5abc82f2e32bd47333a1c9799499c1c3a415f8dde19dbf876f00cb/jiter-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9872aeff3f21e437651df378cb75aeb7043e5297261222b6441a620218b58708", size = 384506 }, + { url = "https://files.pythonhosted.org/packages/0f/89/c12fe7b65a4fb74f6c0d7b5119576f1f16c79fc2953641f31b288fad8a04/jiter-0.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fd19112d1049bdd47f17bfbb44a2c0001061312dcf0e72765bfa8abd4aa30e5", size = 520621 }, + { url = "https://files.pythonhosted.org/packages/c4/2b/d57900c5c06e6273fbaa76a19efa74dbc6e70c7427ab421bf0095dfe5d4a/jiter-0.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6ef5da104664e526836070e4a23b5f68dec1cc673b60bf1edb1bfbe8a55d0678", size = 512613 }, + { url = "https://files.pythonhosted.org/packages/89/05/d8b90bfb21e58097d5a4e0224f2940568366f68488a079ae77d4b2653500/jiter-0.9.0-cp310-cp310-win32.whl", hash = "sha256:cb12e6d65ebbefe5518de819f3eda53b73187b7089040b2d17f5b39001ff31c4", size = 206613 }, + { url = "https://files.pythonhosted.org/packages/2c/1d/5767f23f88e4f885090d74bbd2755518050a63040c0f59aa059947035711/jiter-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:c43ca669493626d8672be3b645dbb406ef25af3f4b6384cfd306da7eb2e70322", size = 208371 }, + { url = "https://files.pythonhosted.org/packages/23/44/e241a043f114299254e44d7e777ead311da400517f179665e59611ab0ee4/jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af", size = 314654 }, + { url = "https://files.pythonhosted.org/packages/fb/1b/a7e5e42db9fa262baaa9489d8d14ca93f8663e7f164ed5e9acc9f467fc00/jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58", size = 320909 }, + { url = "https://files.pythonhosted.org/packages/60/bf/8ebdfce77bc04b81abf2ea316e9c03b4a866a7d739cf355eae4d6fd9f6fe/jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b", size = 341733 }, + { url = "https://files.pythonhosted.org/packages/a8/4e/754ebce77cff9ab34d1d0fa0fe98f5d42590fd33622509a3ba6ec37ff466/jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b", size = 365097 }, + { url = "https://files.pythonhosted.org/packages/32/2c/6019587e6f5844c612ae18ca892f4cd7b3d8bbf49461ed29e384a0f13d98/jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5", size = 406603 }, + { url = "https://files.pythonhosted.org/packages/da/e9/c9e6546c817ab75a1a7dab6dcc698e62e375e1017113e8e983fccbd56115/jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572", size = 396625 }, + { url = "https://files.pythonhosted.org/packages/be/bd/976b458add04271ebb5a255e992bd008546ea04bb4dcadc042a16279b4b4/jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15", size = 351832 }, + { url = "https://files.pythonhosted.org/packages/07/51/fe59e307aaebec9265dbad44d9d4381d030947e47b0f23531579b9a7c2df/jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419", size = 384590 }, + { url = "https://files.pythonhosted.org/packages/db/55/5dcd2693794d8e6f4889389ff66ef3be557a77f8aeeca8973a97a7c00557/jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043", size = 520690 }, + { url = "https://files.pythonhosted.org/packages/54/d5/9f51dc90985e9eb251fbbb747ab2b13b26601f16c595a7b8baba964043bd/jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965", size = 512649 }, + { url = "https://files.pythonhosted.org/packages/a6/e5/4e385945179bcf128fa10ad8dca9053d717cbe09e258110e39045c881fe5/jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2", size = 206920 }, + { url = "https://files.pythonhosted.org/packages/4c/47/5e0b94c603d8e54dd1faab439b40b832c277d3b90743e7835879ab663757/jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd", size = 210119 }, + { url = "https://files.pythonhosted.org/packages/af/d7/c55086103d6f29b694ec79156242304adf521577530d9031317ce5338c59/jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11", size = 309203 }, + { url = "https://files.pythonhosted.org/packages/b0/01/f775dfee50beb420adfd6baf58d1c4d437de41c9b666ddf127c065e5a488/jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e", size = 319678 }, + { url = "https://files.pythonhosted.org/packages/ab/b8/09b73a793714726893e5d46d5c534a63709261af3d24444ad07885ce87cb/jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2", size = 341816 }, + { url = "https://files.pythonhosted.org/packages/35/6f/b8f89ec5398b2b0d344257138182cc090302854ed63ed9c9051e9c673441/jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75", size = 364152 }, + { url = "https://files.pythonhosted.org/packages/9b/ca/978cc3183113b8e4484cc7e210a9ad3c6614396e7abd5407ea8aa1458eef/jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d", size = 406991 }, + { url = "https://files.pythonhosted.org/packages/13/3a/72861883e11a36d6aa314b4922125f6ae90bdccc225cd96d24cc78a66385/jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42", size = 395824 }, + { url = "https://files.pythonhosted.org/packages/87/67/22728a86ef53589c3720225778f7c5fdb617080e3deaed58b04789418212/jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc", size = 351318 }, + { url = "https://files.pythonhosted.org/packages/69/b9/f39728e2e2007276806d7a6609cda7fac44ffa28ca0d02c49a4f397cc0d9/jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc", size = 384591 }, + { url = 
"https://files.pythonhosted.org/packages/eb/8f/8a708bc7fd87b8a5d861f1c118a995eccbe6d672fe10c9753e67362d0dd0/jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e", size = 520746 }, + { url = "https://files.pythonhosted.org/packages/95/1e/65680c7488bd2365dbd2980adaf63c562d3d41d3faac192ebc7ef5b4ae25/jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d", size = 512754 }, + { url = "https://files.pythonhosted.org/packages/78/f3/fdc43547a9ee6e93c837685da704fb6da7dba311fc022e2766d5277dfde5/jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06", size = 207075 }, + { url = "https://files.pythonhosted.org/packages/cd/9d/742b289016d155f49028fe1bfbeb935c9bf0ffeefdf77daf4a63a42bb72b/jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0", size = 207999 }, +] + [[package]] name = "multidict" version = "6.1.0" @@ -411,6 +516,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, ] +[[package]] +name = "openai" +version = "1.66.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/77/5172104ca1df35ed2ed8fb26dbc787f721c39498fc51d666c4db07756a0c/openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9", size = 397244 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/5a/e20182f7b6171642d759c548daa0ba20a1d3ac10d2bd0a13fd75704a9ac3/openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9", size = 567400 }, +] + [[package]] name = "packaging" version = "24.2" @@ -420,6 +544,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] +[[package]] +name = "pillow" +version = "11.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/af/c097e544e7bd278333db77933e535098c259609c4eb3b85381109602fb5b/pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20", size = 46742715 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/1c/2dcea34ac3d7bc96a1fd1bd0a6e06a57c67167fec2cff8d95d88229a8817/pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8", size = 3229983 }, + { url = "https://files.pythonhosted.org/packages/14/ca/6bec3df25e4c88432681de94a3531cc738bd85dea6c7aa6ab6f81ad8bd11/pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192", size = 3101831 }, + { url = 
"https://files.pythonhosted.org/packages/d4/2c/668e18e5521e46eb9667b09e501d8e07049eb5bfe39d56be0724a43117e6/pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2", size = 4314074 }, + { url = "https://files.pythonhosted.org/packages/02/80/79f99b714f0fc25f6a8499ecfd1f810df12aec170ea1e32a4f75746051ce/pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26", size = 4394933 }, + { url = "https://files.pythonhosted.org/packages/81/aa/8d4ad25dc11fd10a2001d5b8a80fdc0e564ac33b293bdfe04ed387e0fd95/pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07", size = 4353349 }, + { url = "https://files.pythonhosted.org/packages/84/7a/cd0c3eaf4a28cb2a74bdd19129f7726277a7f30c4f8424cd27a62987d864/pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482", size = 4476532 }, + { url = "https://files.pythonhosted.org/packages/8f/8b/a907fdd3ae8f01c7670dfb1499c53c28e217c338b47a813af8d815e7ce97/pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e", size = 4279789 }, + { url = "https://files.pythonhosted.org/packages/6f/9a/9f139d9e8cccd661c3efbf6898967a9a337eb2e9be2b454ba0a09533100d/pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269", size = 4413131 }, + { url = "https://files.pythonhosted.org/packages/a8/68/0d8d461f42a3f37432203c8e6df94da10ac8081b6d35af1c203bf3111088/pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49", size = 2291213 }, + { url = "https://files.pythonhosted.org/packages/14/81/d0dff759a74ba87715509af9f6cb21fa21d93b02b3316ed43bda83664db9/pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a", size = 2625725 }, + { url = "https://files.pythonhosted.org/packages/ce/1f/8d50c096a1d58ef0584ddc37e6f602828515219e9d2428e14ce50f5ecad1/pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65", size = 2375213 }, + { url = "https://files.pythonhosted.org/packages/dd/d6/2000bfd8d5414fb70cbbe52c8332f2283ff30ed66a9cde42716c8ecbe22c/pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457", size = 3229968 }, + { url = "https://files.pythonhosted.org/packages/d9/45/3fe487010dd9ce0a06adf9b8ff4f273cc0a44536e234b0fad3532a42c15b/pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35", size = 3101806 }, + { url = "https://files.pythonhosted.org/packages/e3/72/776b3629c47d9d5f1c160113158a7a7ad177688d3a1159cd3b62ded5a33a/pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2", size = 4322283 }, + { url = "https://files.pythonhosted.org/packages/e4/c2/e25199e7e4e71d64eeb869f5b72c7ddec70e0a87926398785ab944d92375/pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070", size 
= 4402945 }, + { url = "https://files.pythonhosted.org/packages/c1/ed/51d6136c9d5911f78632b1b86c45241c712c5a80ed7fa7f9120a5dff1eba/pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6", size = 4361228 }, + { url = "https://files.pythonhosted.org/packages/48/a4/fbfe9d5581d7b111b28f1d8c2762dee92e9821bb209af9fa83c940e507a0/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1", size = 4484021 }, + { url = "https://files.pythonhosted.org/packages/39/db/0b3c1a5018117f3c1d4df671fb8e47d08937f27519e8614bbe86153b65a5/pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2", size = 4287449 }, + { url = "https://files.pythonhosted.org/packages/d9/58/bc128da7fea8c89fc85e09f773c4901e95b5936000e6f303222490c052f3/pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96", size = 4419972 }, + { url = "https://files.pythonhosted.org/packages/5f/bb/58f34379bde9fe197f51841c5bbe8830c28bbb6d3801f16a83b8f2ad37df/pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f", size = 2291201 }, + { url = "https://files.pythonhosted.org/packages/3a/c6/fce9255272bcf0c39e15abd2f8fd8429a954cf344469eaceb9d0d1366913/pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761", size = 2625686 }, + { url = "https://files.pythonhosted.org/packages/c8/52/8ba066d569d932365509054859f74f2a9abee273edcef5cd75e4bc3e831e/pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71", size = 2375194 }, + { url = "https://files.pythonhosted.org/packages/95/20/9ce6ed62c91c073fcaa23d216e68289e19d95fb8188b9fb7a63d36771db8/pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a", size = 3226818 }, + { url = "https://files.pythonhosted.org/packages/b9/d8/f6004d98579a2596c098d1e30d10b248798cceff82d2b77aa914875bfea1/pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b", size = 3101662 }, + { url = "https://files.pythonhosted.org/packages/08/d9/892e705f90051c7a2574d9f24579c9e100c828700d78a63239676f960b74/pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3", size = 4329317 }, + { url = "https://files.pythonhosted.org/packages/8c/aa/7f29711f26680eab0bcd3ecdd6d23ed6bce180d82e3f6380fb7ae35fcf3b/pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a", size = 4412999 }, + { url = "https://files.pythonhosted.org/packages/c8/c4/8f0fe3b9e0f7196f6d0bbb151f9fba323d72a41da068610c4c960b16632a/pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1", size = 4368819 }, + { url = "https://files.pythonhosted.org/packages/38/0d/84200ed6a871ce386ddc82904bfadc0c6b28b0c0ec78176871a4679e40b3/pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f", size = 4496081 }, + { 
url = "https://files.pythonhosted.org/packages/84/9c/9bcd66f714d7e25b64118e3952d52841a4babc6d97b6d28e2261c52045d4/pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91", size = 4296513 }, + { url = "https://files.pythonhosted.org/packages/db/61/ada2a226e22da011b45f7104c95ebda1b63dcbb0c378ad0f7c2a710f8fd2/pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c", size = 4431298 }, + { url = "https://files.pythonhosted.org/packages/e7/c4/fc6e86750523f367923522014b821c11ebc5ad402e659d8c9d09b3c9d70c/pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6", size = 2291630 }, + { url = "https://files.pythonhosted.org/packages/08/5c/2104299949b9d504baf3f4d35f73dbd14ef31bbd1ddc2c1b66a5b7dfda44/pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf", size = 2626369 }, + { url = "https://files.pythonhosted.org/packages/37/f3/9b18362206b244167c958984b57c7f70a0289bfb59a530dd8af5f699b910/pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5", size = 2375240 }, + { url = "https://files.pythonhosted.org/packages/fa/c5/389961578fb677b8b3244fcd934f720ed25a148b9a5cc81c91bdf59d8588/pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90", size = 3198345 }, + { url = "https://files.pythonhosted.org/packages/c4/fa/803c0e50ffee74d4b965229e816af55276eac1d5806712de86f9371858fd/pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb", size = 3072938 }, + { url = "https://files.pythonhosted.org/packages/dc/67/2a3a5f8012b5d8c63fe53958ba906c1b1d0482ebed5618057ef4d22f8076/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442", size = 3400049 }, + { url = "https://files.pythonhosted.org/packages/e5/a0/514f0d317446c98c478d1872497eb92e7cde67003fed74f696441e647446/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83", size = 3422431 }, + { url = "https://files.pythonhosted.org/packages/cd/00/20f40a935514037b7d3f87adfc87d2c538430ea625b63b3af8c3f5578e72/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f", size = 3446208 }, + { url = "https://files.pythonhosted.org/packages/28/3c/7de681727963043e093c72e6c3348411b0185eab3263100d4490234ba2f6/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73", size = 3509746 }, + { url = "https://files.pythonhosted.org/packages/41/67/936f9814bdd74b2dfd4822f1f7725ab5d8ff4103919a1664eb4874c58b2f/pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0", size = 2626353 }, +] + [[package]] name = "pluggy" version = "1.5.0" @@ -656,6 +828,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = 
"sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] + [[package]] name = "syrupy" version = "4.8.1" @@ -688,7 +869,10 @@ version = "2.0.1" source = { virtual = "." } dependencies = [ { name = "docker" }, + { name = "huggingface-hub" }, { name = "numpy" }, + { name = "openai" }, + { name = "pillow" }, { name = "pydantic" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -699,7 +883,10 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "docker", specifier = ">=7" }, + { name = "huggingface-hub", specifier = ">=0.29" }, { name = "numpy", specifier = ">=2.0" }, + { name = "openai", specifier = ">=1.65" }, + { name = "pillow", specifier = ">=11.1.0" }, { name = "pydantic", specifier = ">2,<3" }, { name = "pytest", specifier = ">=8.3.0" }, { name = "pytest-asyncio", specifier = ">=0.23.1" }, @@ -741,7 +928,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d9c41346..57dc6350 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -97,11 +97,10 @@ fn get_config( let filename = if !path.exists() { // Assume it's a hub id - let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") { + let mut builder = ApiBuilder::from_env(); + if let Ok(token) = std::env::var("HF_TOKEN") { // env variable has precedence over on file token. 
-        ApiBuilder::new().with_token(Some(token))
-    } else {
-        ApiBuilder::new()
+        builder = builder.with_token(Some(token))
     };
     if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") {
         builder = builder.with_user_agent("origin", origin.as_str());
@@ -152,7 +151,7 @@ fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<String>) ->
         "flashdecoding"
     };
-    match config.head_dim {
+    match config.get_head_dim() {
        Some(h) if h == 64 || h == 128 || h == 256 => {
            if lora_adapters.is_some() && prefix_caching.is_none() {
                tracing::info!("Disabling prefix caching because of lora adapters");
@@ -214,6 +213,7 @@ struct RawConfig {
     num_key_value_heads: Option<usize>,
     num_hidden_layers: Option<usize>,
     head_dim: Option<usize>,
+    text_config: Option<TextConfig>,
     vision_config: Option<VisionConfig>,
     is_encoder_decoder: Option<bool>,
     #[serde(rename = "num_experts_per_tok")]
@@ -233,6 +233,11 @@ struct QuantizationConfig {
 #[derive(Debug, Deserialize)]
 struct VisionConfig {}

+#[derive(Debug, Deserialize)]
+struct TextConfig {
+    head_dim: Option<usize>,
+}
+
 #[derive(Debug, Deserialize)]
 struct Config {
     max_position_embeddings: Option<usize>,
@@ -244,6 +249,7 @@ struct Config {
     intermediate_size: Option<usize>,
     hidden_size: Option<usize>,
     model_type: Option<String>,
+    text_config: Option<TextConfig>,
     vision_config: Option<VisionConfig>,
     is_encoder_decoder: bool,
     num_experts_per_token: usize,
@@ -253,6 +259,14 @@ struct Config {
 }

 impl Config {
+    fn get_head_dim(&self) -> Option<usize> {
+        self.head_dim.or_else(|| {
+            self.text_config
+                .as_ref()
+                .and_then(|text_config| text_config.head_dim)
+        })
+    }
+
     fn flop(&self) -> Option<u64> {
         if self.vision_config.is_some() {
             // VLM are much harder to predict and VRAM requirements
@@ -261,7 +275,7 @@ impl Config {
         }
         let num_heads = self.num_heads? as u64;
         let num_kv_heads = self.num_kv_heads? as u64;
-        let head_dim = self.head_dim? as u64;
+        let head_dim = self.get_head_dim()? as u64;
         let hidden_size = self.hidden_size? as u64;
         let intermediate_size = (self.intermediate_size?
             * (self.num_experts_per_token + self.num_shared_experts))
@@ -289,7 +303,7 @@ impl Config {
         }
         // 2 for key and values
         // 2 for f16 dtype?
-        Some(self.num_kv_heads? * 2 * self.head_dim? * 2 * self.num_layers?)
+        Some(self.num_kv_heads? * 2 * self.get_head_dim()? * 2 * self.num_layers?)
     }

     fn mlp_vram_per_tok(&self) -> Option<usize> {
@@ -310,8 +324,8 @@ impl Config {
     }

     fn model_vram(&self) -> Option<usize> {
-        let attn_vram = (self.num_heads? + 2 * self.num_kv_heads?) * self.head_dim?;
-        let o_vram = self.num_heads? * self.head_dim? * self.hidden_size?;
+        let attn_vram = (self.num_heads? + 2 * self.num_kv_heads?) * self.get_head_dim()?;
+        let o_vram = self.num_heads? * self.get_head_dim()? * self.hidden_size?;
         // gate + up + down = 3
         let mlp_vram = 3 * self.intermediate_size? * self.num_experts * self.hidden_size?;
         let layer_vram = mlp_vram + attn_vram + o_vram;
@@ -349,6 +363,7 @@ impl From<RawConfig> for Config {
         let num_kv_heads = other.num_key_value_heads.or(other.num_attention_heads);
         let intermediate_size = other.intermediate_size;
         let model_type = other.model_type;
+        let text_config = other.text_config;
         let vision_config = other.vision_config;
         let is_encoder_decoder = other.is_encoder_decoder.unwrap_or(false);
         let num_experts_per_token = other.num_experts_per_token.unwrap_or(1);
@@ -360,6 +375,7 @@ impl From<RawConfig> for Config {
             quantize,
             head_dim,
             model_type,
+            text_config,
             vision_config,
             is_encoder_decoder,
             hidden_size,
@@ -2067,6 +2083,7 @@ fn main() -> Result<(), LauncherError> {
     let default_optimal = match config {
         Some(ref config) => match config.model_type.as_deref() {
             Some("qwen2_vl") | Some("qwen2_5_vl") => 10_000,
+            Some("gemma3") => 8000,
             _ => 4096,
         },
         None => 4096,
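For context: `get_head_dim` exists because multimodal checkpoints such as Gemma 3 nest the text model's `head_dim` under `text_config` in config.json rather than at the top level. A minimal sketch of the fallback, assuming a Gemma 3-shaped config and that the remaining RawConfig fields are all optional (the JSON below is illustrative, not taken from the repository):

    // Illustrative only: head_dim is absent at the top level, so
    // get_head_dim() falls back to text_config.head_dim.
    let raw: RawConfig =
        serde_json::from_str(r#"{ "text_config": { "head_dim": 256 } }"#).unwrap();
    let config: Config = raw.into(); // uses the From<RawConfig> impl above
    assert_eq!(config.get_head_dim(), Some(256));
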
diff --git a/nix/docker.nix b/nix/docker.nix
index c4b1d899..ed9e56d2 100644
--- a/nix/docker.nix
+++ b/nix/docker.nix
@@ -1,4 +1,5 @@
 {
+  stdenv,
   dockerTools,
   cacert,
   text-generation-inference,
@@ -11,13 +12,25 @@ in
 build {
   name = "tgi-docker";
   tag = "latest";
+  compressor = "zstd";
   config = {
     EntryPoint = [ "${text-generation-inference}/bin/text-generation-inference" ];
     Env = [
       "HF_HOME=/data"
       "PORT=80"
+      # The CUDA container toolkit will mount the driver shim into the
+      # container. We just have to ensure that the dynamic loader finds
+      # the libraries.
+      "LD_LIBRARY_PATH=/usr/lib64"
     ];
   };
-  contents = [ cacert ];
+  extraCommands = ''
+    mkdir -p tmp
+    chmod -R 1777 tmp
+  '';
+  contents = [
+    cacert
+    stdenv.cc
+  ];
 }
diff --git a/nix/server.nix b/nix/server.nix
index 0640fe3a..1d00b978 100644
--- a/nix/server.nix
+++ b/nix/server.nix
@@ -16,8 +16,8 @@
   grpcio-reflection,
   grpcio-status,
   grpcio-tools,
-  hf-kernels,
   hf-transfer,
+  kernels,
   loguru,
   mamba-ssm,
   moe,
@@ -91,8 +91,8 @@ buildPythonPackage {
     grpcio-reflection
     grpcio-status
     grpcio-tools
-    hf-kernels
     hf-transfer
+    kernels
     loguru
     mamba-ssm
     moe
diff --git a/router/src/chat.rs b/router/src/chat.rs
new file mode 100644
index 00000000..d5824fea
--- /dev/null
+++ b/router/src/chat.rs
@@ -0,0 +1,700 @@
+use crate::{
+    infer::InferError, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionDelta,
+    ChatCompletionLogprobs, CompletionType, DeltaToolCall, Function, FunctionDefinition,
+    StreamOptions, StreamResponse, TextMessage, ToolCallDelta, Usage,
+};
+use serde::Deserialize;
+use serde_json::Value;
+
+#[derive(Debug, Deserialize)]
+struct ToolCall {
+    _name: String,
+    #[serde(flatten, default)]
+    /// Using Map to preserve order
+    arguments: serde_json::Map<String, Value>,
+}
+#[derive(Debug, Deserialize)]
+struct Call {
+    function: ToolCall,
+}
+
+#[cfg_attr(test, derive(Debug))]
+pub(crate) enum ChatEvent {
+    NoTool,
+    Events(Vec<CompletionType>),
+}
+
+#[cfg_attr(test, derive(Debug))]
+pub(crate) enum ChatChoice {
+    NoTool,
+    ToolCalls(Vec<crate::ToolCall>),
+}
+
+pub(crate) fn parse_output(generated_text: &str) -> Result<ChatChoice, InferError> {
+    let call: Call = serde_json::from_str(generated_text).map_err(|e| {
+        InferError::ToolError(format!(
+            "Failed to parse generated text: {} {:?}",
+            e, generated_text
+        ))
+    })?;
+    let name = call.function._name;
+
+    match &name[..] {
{ + "no_tool" => { + // parse the content message + Ok(ChatChoice::NoTool) + } + name => { + let tool_calls = vec![crate::ToolCall { + id: "0".to_string(), + r#type: "function".to_string(), + function: FunctionDefinition { + description: None, + name: name.to_string(), + arguments: serde_json::to_value(call.function.arguments).map_err(|err| { + InferError::ToolError(format!( + "Could not convert arguments to JSON map {err}" + )) + })?, + }, + }]; + Ok(ChatChoice::ToolCalls(tool_calls)) + } + } +} + +/// Convert a StreamResponse into an Event to be sent over SSE +fn create_event_from_stream_token( + stream_token: &StreamResponse, + logprobs: bool, + inner_using_tools: bool, + system_fingerprint: String, + model_id: String, + function_name: Option, + id: String, +) -> CompletionType { + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let logprobs = logprobs.then(|| { + ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone())) + }); + + // replace the content with the tool calls if grammar is present + let content = if !stream_token.token.special { + Some(stream_token.token.text.clone()) + } else { + None + }; + let (content, tool_calls) = if inner_using_tools { + // Cast into a vec + (None, content) + } else { + (content, None) + }; + let finish_reason = stream_token + .details + .as_ref() + .map(|details| details.finish_reason.format(true)); + let delta = match (content, tool_calls) { + (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: delta, + ..Default::default() + }), + (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { + role: "assistant".to_string(), + tool_calls: vec![DeltaToolCall { + index: 0, + id, + r#type: "function".to_string(), + function: Function { + name: function_name, + arguments: tool_calls, + }, + }], + }), + (None, None) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "".to_string(), + ..Default::default() + }), + }; + let choices = vec![ChatCompletionChoice { + index: 0, + delta, + logprobs, + finish_reason, + }]; + CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( + model_id, + system_fingerprint, + current_time, + choices, + None, + )) +} + +#[derive(Debug)] +enum StreamState { + /// Before the tools was parsed + Buffering, + /// We detected a tool call here + Tool, + /// This is without tool calling + Content, +} + +pub struct ChatState { + state: StreamState, + text: String, + options: StreamOptions, + model_id: String, + fingerprint: String, + logprobs: bool, + id: String, +} + +impl ChatState { + pub fn new( + using_tools: bool, + options: StreamOptions, + fingerprint: String, + model_id: String, + logprobs: bool, + id: String, + ) -> Self { + let state = if using_tools { + StreamState::Buffering + } else { + StreamState::Content + }; + let text = String::new(); + Self { + state, + text, + options, + fingerprint, + model_id, + logprobs, + id, + } + } + + pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent { + let mut events = vec![]; + let token_text = &stream_token.token.text; + match self.state { + StreamState::Buffering => { + self.text.push_str(token_text); + tracing::info!("Current text {:?}", self.text); + let partial = &self.text; + let partial = + partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}'); + if let Ok(call) = 
+
+/// Convert a StreamResponse into a chat completion chunk to be sent over SSE
+fn create_event_from_stream_token(
+    stream_token: &StreamResponse,
+    logprobs: bool,
+    inner_using_tools: bool,
+    system_fingerprint: String,
+    model_id: String,
+    function_name: Option<String>,
+    id: String,
+) -> CompletionType {
+    let current_time = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+        .as_secs();
+
+    let logprobs = logprobs.then(|| {
+        ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone()))
+    });
+
+    // replace the content with the tool calls if grammar is present
+    let content = if !stream_token.token.special {
+        Some(stream_token.token.text.clone())
+    } else {
+        None
+    };
+    let (content, tool_calls) = if inner_using_tools {
+        // Route the token text into the tool-call delta instead of content
+        (None, content)
+    } else {
+        (content, None)
+    };
+    let finish_reason = stream_token
+        .details
+        .as_ref()
+        .map(|details| details.finish_reason.format(true));
+    let delta = match (content, tool_calls) {
+        (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
+            role: "assistant".to_string(),
+            content: delta,
+            ..Default::default()
+        }),
+        (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
+            role: "assistant".to_string(),
+            tool_calls: vec![DeltaToolCall {
+                index: 0,
+                id,
+                r#type: "function".to_string(),
+                function: Function {
+                    name: function_name,
+                    arguments: tool_calls,
+                },
+            }],
+        }),
+        (None, None) => ChatCompletionDelta::Chat(TextMessage {
+            role: "assistant".to_string(),
+            content: "".to_string(),
+            ..Default::default()
+        }),
+    };
+    let choices = vec![ChatCompletionChoice {
+        index: 0,
+        delta,
+        logprobs,
+        finish_reason,
+    }];
+    CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
+        model_id,
+        system_fingerprint,
+        current_time,
+        choices,
+        None,
+    ))
+}
+
+#[derive(Debug)]
+enum StreamState {
+    /// Before the tool call has been parsed
+    Buffering,
+    /// We detected a tool call here
+    Tool,
+    /// Streaming content, without tool calling
+    Content,
+}
+
+pub struct ChatState {
+    state: StreamState,
+    text: String,
+    options: StreamOptions,
+    model_id: String,
+    fingerprint: String,
+    logprobs: bool,
+    id: String,
+}
+
+impl ChatState {
+    pub fn new(
+        using_tools: bool,
+        options: StreamOptions,
+        fingerprint: String,
+        model_id: String,
+        logprobs: bool,
+        id: String,
+    ) -> Self {
+        let state = if using_tools {
+            StreamState::Buffering
+        } else {
+            StreamState::Content
+        };
+        let text = String::new();
+        Self {
+            state,
+            text,
+            options,
+            fingerprint,
+            model_id,
+            logprobs,
+            id,
+        }
+    }
+
+    pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent {
+        let mut events = vec![];
+        let token_text = &stream_token.token.text;
+        match self.state {
+            StreamState::Buffering => {
+                self.text.push_str(token_text);
+                tracing::info!("Current text {:?}", self.text);
+                let partial = &self.text;
+                let partial =
+                    partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}');
+                if let Ok(call) = serde_json::from_str::<Call>(&format!("{}}}}}", partial)) {
+                    // This can be no_tool before the content has been emitted
+                    if call.function._name != "no_tool" {
+                        stream_token.token.text = "{".to_string();
+                        let chat_complete = create_event_from_stream_token(
+                            &stream_token,
+                            self.logprobs,
+                            true,
+                            self.fingerprint.clone(),
+                            self.model_id.clone(),
+                            Some(call.function._name),
+                            self.id.clone(),
+                        );
+
+                        events.push(chat_complete);
+                        self.state = StreamState::Tool;
+                    } else {
+                        return ChatEvent::NoTool;
+                    }
+                }
+            }
+            StreamState::Tool => {
+                self.text.push_str(token_text);
+                if serde_json::from_str::<Value>(&self.text).is_ok() {
+                    self.state = StreamState::Buffering;
+                    let mut text = stream_token.token.text.trim_end();
+                    // Effectively trimming only the last closing brace
+                    if text.ends_with('}') {
+                        text = &text[..text.len() - 1];
+                    }
+                    stream_token.token.text = text.to_string();
+                    let chat_complete = create_event_from_stream_token(
+                        &stream_token,
+                        self.logprobs,
+                        true,
+                        self.fingerprint.clone(),
+                        self.model_id.clone(),
+                        None,
+                        self.id.clone(),
+                    );
+                    events.push(chat_complete);
+                } else {
+                    let chat_complete = create_event_from_stream_token(
+                        &stream_token,
+                        self.logprobs,
+                        true,
+                        self.fingerprint.clone(),
+                        self.model_id.clone(),
+                        None,
+                        self.id.clone(),
+                    );
+                    events.push(chat_complete);
+                }
+            }
+            StreamState::Content => {
+                let chat_complete = create_event_from_stream_token(
+                    &stream_token,
+                    self.logprobs,
+                    false,
+                    self.fingerprint.clone(),
+                    self.model_id.clone(),
+                    None,
+                    self.id.clone(),
+                );
+
+                events.push(chat_complete);
+            }
+        }
+
+        if self.options.include_usage {
+            if let Some(details) = stream_token.details {
+                let completion_tokens = details.generated_tokens;
+                let prompt_tokens = details.input_length;
+                let total_tokens = prompt_tokens + completion_tokens;
+
+                let usage = Usage {
+                    completion_tokens,
+                    prompt_tokens,
+                    total_tokens,
+                };
+                let current_time = std::time::SystemTime::now()
+                    .duration_since(std::time::UNIX_EPOCH)
+                    .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+                    .as_secs();
+
+                let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk {
+                    id: String::new(),
+                    created: current_time,
+                    model: self.model_id.clone(),
+                    system_fingerprint: self.fingerprint.clone(),
+                    choices: vec![],
+                    usage: Some(Usage {
+                        prompt_tokens: usage.prompt_tokens,
+                        completion_tokens: usage.completion_tokens,
+                        total_tokens: usage.total_tokens,
+                    }),
+                });
+
+                events.push(chat_complete);
+            }
+        }
+        ChatEvent::Events(events)
+    }
+}
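The `format!("{}}}}}", partial)` in the `Buffering` arm is easy to misread: `{}` is the placeholder and each `}}` is one escaped brace, so two closing braces are appended to the buffered prefix. That is exactly what is needed to turn the partial `{"function": {"_name": "..."` text into parseable JSON as soon as the name is complete. A standalone illustration (hypothetical prefix, not from the tests):

    // Illustrative only: closing the two open objects makes the prefix
    // parseable, which is how the state machine detects the name early.
    let partial = r#"{"function": {"_name": "get_current_weather""#;
    let closed = format!("{}}}}}", partial); // appends `}}`
    assert_eq!(closed, r#"{"function": {"_name": "get_current_weather"}}"#);
    assert!(serde_json::from_str::<serde_json::Value>(&closed).is_ok());
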
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        ChatCompletionChoice, ChatCompletionDelta, FinishReason, StreamDetails, TextMessage, Token,
+    };
+
+    use super::*;
+
+    fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) {
+        match event {
+            CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
+                assert_eq!(choices.len(), 1);
+                if let ChatCompletionChoice {
+                    delta: ChatCompletionDelta::Tool(ToolCallDelta { tool_calls, .. }),
+                    ..
+                } = &choices[0]
+                {
+                    assert_eq!(tool_calls.len(), 1);
+                    let DeltaToolCall {
+                        index,
+                        id,
+                        r#type,
+                        function,
+                    } = &tool_calls[0];
+                    assert_eq!(*index, 0);
+                    assert_eq!(id, "0");
+                    assert_eq!(r#type, "function");
+                    (function.name.as_ref(), &function.arguments)
+                } else {
+                    panic!("Expected a tool call delta");
+                }
+            }
+            _ => panic!("Unexpected chunk"),
+        }
+    }
+
+    #[test]
+    fn test_chat_stream() {
+        let mut chat_state = ChatState::new(
+            false,
+            StreamOptions {
+                include_usage: false,
+            },
+            "fingerprint".to_string(),
+            "model_id".to_string(),
+            false,
+            "0".to_string(),
+        );
+
+        let events = chat_state.push(StreamResponse {
+            generated_text: None,
+            token: Token {
+                id: 42,
+                text: "Hi".to_string(),
+                logprob: 0.0,
+                special: false,
+            },
+            top_tokens: vec![],
+            index: 0,
+            details: None,
+        });
+        if let ChatEvent::Events(events) = events {
+            assert_eq!(events.len(), 1);
+            match &events[0] {
+                CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
+                    assert_eq!(
+                        choices,
+                        &[ChatCompletionChoice {
+                            index: 0,
+                            delta: ChatCompletionDelta::Chat(TextMessage {
+                                role: "assistant".to_string(),
+                                content: "Hi".to_string(),
+                                tool_call_id: None,
+                            }),
+                            logprobs: None,
+                            finish_reason: None,
+                        }]
+                    );
+                }
+                _ => panic!("Unexpected chunk"),
+            }
+        } else {
+            panic!("Expected chat events");
+        }
+    }
+
+    #[test]
+    fn test_chat_stream_usage() {
+        let mut chat_state = ChatState::new(
+            false,
+            StreamOptions {
+                include_usage: true,
+            },
+            "fingerprint".to_string(),
+            "model_id".to_string(),
+            false,
+            "0".to_string(),
+        );
+
+        let events = chat_state.push(StreamResponse {
+            generated_text: None,
+            token: Token {
+                id: 42,
+                text: "Hi".to_string(),
+                logprob: 0.0,
+                special: false,
+            },
+            top_tokens: vec![],
+            index: 0,
+            details: Some(StreamDetails {
+                input_length: 2,
+                generated_tokens: 10,
+                seed: None,
+                finish_reason: FinishReason::Length,
+            }),
+        });
+        if let ChatEvent::Events(events) = events {
+            assert_eq!(events.len(), 2);
+            match &events[0] {
+                CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
+                    assert_eq!(
+                        choices,
+                        &[ChatCompletionChoice {
+                            index: 0,
+                            delta: ChatCompletionDelta::Chat(TextMessage {
+                                role: "assistant".to_string(),
+                                content: "Hi".to_string(),
+                                tool_call_id: None,
+                            }),
+                            logprobs: None,
+                            // HAS A FINISH REASON
+                            finish_reason: Some("length".to_string()),
+                        }]
+                    );
+                }
+                _ => panic!("Unexpected chunk"),
+            }
+            match &events[1] {
+                CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => {
+                    assert_eq!(
+                        *usage,
+                        Some(Usage {
+                            prompt_tokens: 2,
+                            completion_tokens: 10,
+                            total_tokens: 12,
+                        })
+                    );
+                }
+                _ => panic!("Unexpected chunk"),
+            }
+        } else {
+            panic!("Expected chat events");
+        }
+    }
+
+    #[test]
+    fn test_chat_stream_tool_no_tool_simple() {
+        let mut chat_state = ChatState::new(
+            true,
+            StreamOptions {
+                include_usage: true,
+            },
+            "fingerprint".to_string(),
+            "model_id".to_string(),
+            false,
+            "0".to_string(),
+        );
+
+        let tokens = vec![
+            "{\"".to_string(),
+            "function".to_string(),
+            "\":".to_string(),
+            " {\"".to_string(),
+            "_".to_string(),
+            "name".to_string(),
+            "\":".to_string(),
+            " \"".to_string(),
+            "no".to_string(),
+            "_tool".to_string(),
+            "\",".to_string(),
+            " \"".to_string(),
+            "content".to_string(),
+            "\":".to_string(),
+            " \"".to_string(),        // Token 14
+            "I".to_string(),          // Event 1
+            " am".to_string(),        // Event 2
+            " a".to_string(),         // Event 3
+            " helpful".to_string(),   // Event 4
+            " assistant".to_string(), // Event 5
+            "!\"".to_string(),        // Event 6 (with trailing quote removed)
+            "}".to_string(),
+            "}".to_string(),
+        ];
+        let tokens: Vec<_> = tokens
+            .into_iter()
+            .map(|text| StreamResponse {
+                generated_text: None,
+                token: Token {
+                    id: 42,
+                    text: text.to_string(),
+                    logprob: 0.0,
+                    special: false,
+                },
+                top_tokens: vec![],
+                index: 0,
+                details: None,
+            })
+            .collect();
+
+        // Initial ignored output
+        for token in &tokens[..10] {
+            let events = chat_state.push(token.clone());
+            if let ChatEvent::Events(events) = events {
+                assert_eq!(events.len(), 0, "{events:?}");
+            } else {
+                panic!("Expected chat events");
+            }
+        }
+
+        // No tool output
+        let events = chat_state.push(tokens[10].clone());
+        if let ChatEvent::NoTool = events {
+            assert!(true);
+        } else {
+            panic!("Expected NoTool event");
+        }
+    }
+
+    #[test]
+    fn test_chat_stream_tool_no_tool_empty() {
+        let mut chat_state = ChatState::new(
+            true,
+            StreamOptions {
+                include_usage: true,
+            },
+            "fingerprint".to_string(),
+            "model_id".to_string(),
+            false,
+            "0".to_string(),
+        );
+
+        let tokens = vec![
+            "{\"".to_string(),
+            "function".to_string(),
+            "\":".to_string(),
+            " {\"".to_string(),
+            "_".to_string(),
+            "name".to_string(),
+            "\":".to_string(),
+            " \"".to_string(),
+            "no".to_string(),
+            "_tool".to_string(),
+            "\",".to_string(),
+            " \"".to_string(),
+            "content".to_string(),
+            "\":\"".to_string(),
+            "\"}".to_string(), // Token 13
+            "}".to_string(),   // Event 1
+        ];
+        let tokens: Vec<_> = tokens
+            .into_iter()
+            .map(|text| StreamResponse {
+                generated_text: None,
+                token: Token {
+                    id: 42,
+                    text: text.to_string(),
+                    logprob: 0.0,
+                    special: false,
+                },
+                top_tokens: vec![],
+                index: 0,
+                details: None,
+            })
+            .collect();
+
+        // Initial ignored output
+        for token in &tokens[..10] {
+            let events = chat_state.push(token.clone());
+            if let ChatEvent::Events(events) = events {
+                assert_eq!(events.len(), 0, "{events:?}");
+            } else {
+                panic!("Expected chat events");
+            }
+        }
+
+        // No tool output
+        let events = chat_state.push(tokens[10].clone());
+        if let ChatEvent::NoTool = events {
+            assert!(true);
+        } else {
+            panic!("Expected NoTool event");
+        }
+    }
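Before the tool-call test below, it may help to see what callers do with the two ChatEvent variants; the server.rs hunk later in this diff reacts roughly like this (condensed sketch, error handling omitted):

    // Condensed from the server.rs hunk further down in this diff.
    match state.push(stream_token) {
        ChatEvent::NoTool => {
            // Drop the tool constraints and re-issue the generation,
            // then rebuild the ChatState with using_tools == false.
            chat.tools = None;
            chat.response_format = None;
        }
        ChatEvent::Events(events) => {
            // Forward each CompletionType chunk as an SSE event.
        }
    }
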
"get".to_string(), + "_current".to_string(), + "_weather".to_string(), + "\",".to_string(), + // Event 1 is the function name + // Event 2 is the start of the arguments "{" + " \"".to_string(), // Event 3 + "location".to_string(), // Event 4 + "\":".to_string(), // Event 5 + " \"".to_string(), // Event 6 + "San".to_string(), // Event 7 + " Francisco".to_string(), // Event 8 + ",".to_string(), // Event 9 + " CA".to_string(), // Event 10 + "\",".to_string(), // Event 11 + " \"".to_string(), // Event 12 + "format".to_string(), // Event 13 + "\":".to_string(), // Event 14 + " \"".to_string(), // Event 15 + "c".to_string(), // Event 16 + "elsius".to_string(), // Event 17 + "\"}}".to_string(), // Event 18 retained (trailing brace removed) + ]; + let tokens: Vec<_> = tokens + .into_iter() + .map(|text| StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }) + .collect(); + + // Initial ignored output + for token in &tokens[..11] { + let events = chat_state.push(token.clone()); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } + } + + // No tool output + let mut output = String::new(); + let mut output_name = String::new(); + for token in &tokens[11..11 + 17] { + let events = chat_state.push(token.clone()); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 1); + let (name, arguments) = get_tool_call_content(&events[0]); + if let Some(name) = name { + assert_eq!(name, "get_current_weather"); + output_name.push_str(&name); + } + output.push_str(arguments); + } else { + panic!("Expected chat events"); + } + } + + assert_eq!(output_name, "get_current_weather"); + assert_eq!( + output, + "{ \"location\": \"San Francisco, CA\", \"format\": \"celsius\"}" + ); + + // No tool finish + for token in &tokens[11 + 17..] 
{ + let events = chat_state.push(token.clone()); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } + } + } +} diff --git a/router/src/config.rs b/router/src/config.rs index a0135984..4460eb00 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -216,6 +216,19 @@ impl Qwen2_5Vl { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Gemma3VisionConfig { + pub(crate) image_size: usize, + pub(crate) patch_size: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Gemma3 { + vision_config: Gemma3VisionConfig, +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] @@ -249,6 +262,8 @@ pub enum Config { Paligemma(Paligemma), Gemma, Gemma2, + Gemma3(Gemma3), + Gemma3Text, Cohere, Drbx, Falcon, diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index b179dd4d..a6687203 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -16,7 +16,7 @@ pub(crate) fn strftime_now(format_str: String) -> Result, bos_token: Option, @@ -33,7 +33,16 @@ impl ChatTemplate { let mut env = Box::new(Environment::new()); // enable things like .strip() or .capitalize() env.set_unknown_method_callback(pycompat::unknown_method_callback); - let template_str = template.into_boxed_str(); + + // TODO: replace with better solution + // hack to adjust gemma3 template for debug + // replace 'messages[0]['content'][0]['text']' with 'messages[0]['content']' + let mutated_template = template.replace( + "messages[0]['content'][0]['text']", + "messages[0]['content']", + ); + + let template_str = mutated_template.into_boxed_str(); env.add_function("raise_exception", raise_exception); env.add_function("strftime_now", strftime_now); tracing::debug!("Loading template: {}", template_str); @@ -123,8 +132,8 @@ mod tests { use crate::infer::chat_template::{raise_exception, strftime_now}; use crate::infer::ChatTemplate; use crate::{ - ChatTemplateInputs, Message, MessageBody, MessageContent, TextMessage, - TokenizerConfigToken, Tool, + ChatTemplateInputs, Message, MessageBody, MessageChunk, MessageContent, TextMessage, + TokenizerConfigToken, Tool, Url, }; use chrono::Local; use minijinja::Environment; @@ -1230,4 +1239,98 @@ TOOL CALL ID: 0 let expected = "<|start_header_id|>system<|end_header_id|>\n\nEnvironment: ipython\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYoure a helpful assistant! Answer the users question best you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.Do not use variables.\n\n{\n \"function\": {\n \"arguments\": \"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. 
Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\",\n \"description\": \"Get the current weather\",\n \"name\": \"get_current_weather\"\n },\n \"type\": \"function\"\n}\n\nWhat is the weather like in Brooklyn, New York?\n---\nThis default prompt will be used<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".to_string(); assert_eq!(result.unwrap(), expected); } + + #[test] + fn test_chat_template_with_special_system_prompt() { + // chat template from gemma3 + let ct = ChatTemplate::new( + r#"{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' + +' -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set first_user_prefix = "" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + ' +' + (first_user_prefix if loop.first else "") }} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type") }} + {%- endif -%} + {{ ' +' }} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model +'}} +{%- endif -%} +"# + .to_string(), + Some(TokenizerConfigToken::String("".to_string())), + Some(TokenizerConfigToken::String("".to_string())), + ); + let msgs: Vec = vec![ + Message { + name: None, + role: "system".to_string(), + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![MessageChunk::Text { + text: "You are a helpful assistant.".to_string(), + }]), + }, + }, + Message { + name: None, + role: "user".to_string(), + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![ + MessageChunk::Text { + text: "I'm already using this supplement ".to_string(), + }, + MessageChunk::ImageUrl { + image_url: Url { + url: "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG".to_string() + }, + }, + MessageChunk::Text { + text: "and I want to use this one too ".to_string() + }, + MessageChunk::ImageUrl { + image_url: Url { + url: "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg".to_string() + }, + }, + MessageChunk::Text { + text: " what are cautions?".to_string() + }, + ]), + }, + }, + ]; + + let result = ct.apply(msgs, None); + let expected = "user\nYou are a helpful assistant.\n\nI'm already using this supplement ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG)and I want to use this one too ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg) what are cautions?\nmodel\n".to_string(); + assert_eq!(result.unwrap(), expected); + } } diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 7eb8a41b..cdce8188 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -52,7 +52,7 @@ pub struct Infer { /// Request backend backend: Arc, /// Chat template - chat_template: Option, + pub(crate) chat_template: 
Option, /// Inference limit limit_concurrent_requests: Arc, /// Backend health diff --git a/router/src/infer/tool_grammar.rs b/router/src/infer/tool_grammar.rs index 7770cd9d..e4e20859 100644 --- a/router/src/infer/tool_grammar.rs +++ b/router/src/infer/tool_grammar.rs @@ -40,13 +40,13 @@ impl ToolGrammar { ), arguments: json!({ "type": "object", - "properties": { - "content": { - "type": "string", - "description": "The response content", - } - }, - "required": ["content"] + // "properties": { + // "content": { + // "type": "string", + // "description": "The response content", + // } + // }, + // "required": ["content"] }), }, })) diff --git a/router/src/lib.rs b/router/src/lib.rs index 08c31b64..e8b8f663 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -8,6 +8,7 @@ pub mod validation; mod kserve; pub mod logging; +mod chat; mod sagemaker; pub mod usage_stats; mod vertex; @@ -20,6 +21,7 @@ use serde::{Deserialize, Serialize}; use tokenizers::Encoding; use tracing::warn; use utoipa::ToSchema; +use uuid::Uuid; use validation::Validation; #[allow(clippy::large_enum_variant)] @@ -150,6 +152,11 @@ impl HubTokenizerConfig { } } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct ChatTemplateStandalone { + pub chat_template: ChatTemplateVersions, +} + #[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] #[serde(untagged)] pub enum TokenizerConfigToken { @@ -171,6 +178,7 @@ impl TokenizerConfigToken { pub enum HubPreprocessorConfig { Idefics2Processor(Idefics2Preprocessor), Idefics3Processor(Idefics2Preprocessor), + Gemma3Processor(Gemma3Processor), } impl HubPreprocessorConfig { @@ -186,6 +194,12 @@ pub struct Idefics2Preprocessor { do_image_splitting: bool, } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Gemma3Processor { + #[serde(default)] + do_image_splitting: bool, +} + #[derive(Debug, Clone, Deserialize, Default)] pub struct HubProcessorConfig { pub chat_template: Option, @@ -541,6 +555,7 @@ pub(crate) struct Chunk { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletion { pub id: String, #[schema(example = "1706270835")] @@ -553,6 +568,7 @@ pub(crate) struct ChatCompletion { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletionComplete { pub index: u32, pub message: OutputMessage, @@ -561,6 +577,7 @@ pub(crate) struct ChatCompletionComplete { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionLogprobs { content: Vec, } @@ -619,6 +636,7 @@ impl From<(Vec, Vec>)> for ChatCompletionLogprobs { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionLogprob { token: String, logprob: f32, @@ -626,12 +644,14 @@ pub(crate) struct ChatCompletionLogprob { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionTopLogprob { token: String, logprob: f32, } #[derive(Clone, Deserialize, Serialize, ToSchema, Default)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, @@ -640,6 +660,7 @@ pub(crate) struct Usage { #[derive(Clone, Serialize, ToSchema)] #[serde(tag = "object")] +#[cfg_attr(test, derive(Debug))] enum CompletionType { #[serde(rename = "chat.completion.chunk")] ChatCompletionChunk(ChatCompletionChunk), @@ -707,6 +728,7 @@ 
impl ChatCompletion { } } #[derive(Clone, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletionChunk { pub id: String, #[schema(example = "1706270978")] @@ -719,6 +741,7 @@ pub(crate) struct ChatCompletionChunk { } #[derive(Clone, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionChoice { pub index: u32, pub delta: ChatCompletionDelta, @@ -735,6 +758,7 @@ pub struct ToolCallDelta { #[derive(Clone, Debug, Serialize, ToSchema)] #[serde(untagged)] +#[cfg_attr(test, derive(PartialEq))] enum ChatCompletionDelta { Chat(TextMessage), Tool(ToolCallDelta), @@ -759,48 +783,17 @@ impl ChatCompletionChunk { pub(crate) fn new( model: String, system_fingerprint: String, - delta: Option, - tool_calls: Option>, created: u64, - logprobs: Option, - finish_reason: Option, + choices: Vec, + usage: Option, ) -> Self { - let delta = match (delta, tool_calls) { - (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: delta, - ..Default::default() - }), - (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { - role: "assistant".to_string(), - tool_calls: vec![DeltaToolCall { - index: 0, - id: String::new(), - r#type: "function".to_string(), - function: Function { - name: None, - arguments: tool_calls[0].to_string(), - }, - }], - }), - (None, None) => ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: "".to_string(), - ..Default::default() - }), - }; Self { id: String::new(), created, model, system_fingerprint, - choices: vec![ChatCompletionChoice { - index: 0, - delta, - logprobs, - finish_reason, - }], - usage: None, + choices, + usage, } } } @@ -915,7 +908,7 @@ pub(crate) struct ChatRequest { /// Options for streaming response. Only set this when you set stream: true. #[serde(default)] #[schema(nullable = true, example = "null")] - pub stream_options: Option, + pub stream_options: StreamOptions, } impl ChatRequest { @@ -1015,13 +1008,37 @@ impl ChatRequest { using_tools, )) } + + fn next_int_id(&self) -> Result> { + let mut id: usize = 0; + for message in &self.messages { + if let MessageBody::Tool { tool_calls } = &message.body { + for tool_call in tool_calls { + let new_id: usize = tool_call.id.parse()?; + id = std::cmp::max(id, new_id + 1); + } + } + } + Ok(id.to_string()) + } + + /// Try to have linearly increasing id + /// or resort to using Uuid if the initial + /// scheme is not understood + fn next_tool_call_id(&self) -> String { + self.next_int_id().unwrap_or_else(|_| { + let uid = Uuid::new_v4().to_string(); + uid.to_string() + }) + } } -#[derive(Clone, Deserialize, ToSchema, Serialize)] +#[derive(Clone, Deserialize, ToSchema, Serialize, Default)] #[cfg_attr(test, derive(Debug, PartialEq))] struct StreamOptions { /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value. 
#[schema(example = "true")] + #[serde(default)] include_usage: bool, } @@ -1445,7 +1462,7 @@ pub(crate) struct ChatTokenizeResponse { #[serde(transparent)] pub(crate) struct TokenizeResponse(Vec); -#[derive(Serialize, ToSchema)] +#[derive(Serialize, ToSchema, Clone)] pub(crate) struct StreamDetails { #[schema(example = "length")] pub finish_reason: FinishReason, @@ -1457,7 +1474,7 @@ pub(crate) struct StreamDetails { pub input_length: u32, } -#[derive(Serialize, ToSchema)] +#[derive(Serialize, ToSchema, Clone)] pub(crate) struct StreamResponse { pub index: u32, pub token: Token, @@ -1700,9 +1717,25 @@ mod tests { assert!(matches!( request.stream_options, - Some(StreamOptions { + StreamOptions { include_usage: true - }) + } + )); + + let json = json!({ + "model": "", + "messages": [{ + "role": "user", + "content": "Hello" + }] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert!(matches!( + request.stream_options, + StreamOptions { + include_usage: false + } )); } diff --git a/router/src/server.rs b/router/src/server.rs index df9e16ff..45d2b9f3 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1,3 +1,4 @@ +use crate::chat::{ChatChoice, ChatEvent, ChatState}; /// HTTP Server logic use crate::config::Config; use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse}; @@ -47,8 +48,6 @@ use http::header::AUTHORIZATION; use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle}; use pyo3::prelude::*; use pyo3::types::IntoPyDict; -use regex::Regex; -use serde_json::Value; use std::convert::Infallible; use std::fs::File; use std::io::BufReader; @@ -1114,62 +1113,6 @@ pub(crate) async fn completions( } } -enum StreamState { - Buffering, - BufferTrailing, - Content { skip_close_quote: bool }, -} - -/// Convert a StreamResponse into an Event to be sent over SSE -fn create_event_from_stream_token( - stream_token: &StreamResponse, - logprobs: bool, - inner_using_tools: bool, - system_fingerprint: String, - model_id: String, -) -> Event { - let event = Event::default(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - - let logprobs = logprobs.then(|| { - ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone())) - }); - - // replace the content with the tool calls if grammar is present - let (content, tool_calls) = if inner_using_tools { - (None, Some(vec![stream_token.token.text.clone()])) - } else { - let content = if !stream_token.token.special { - Some(stream_token.token.text.clone()) - } else { - None - }; - - (content, None) - }; - let finish_reason = stream_token - .details - .as_ref() - .map(|details| details.finish_reason.format(true)); - let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( - model_id.clone(), - system_fingerprint.clone(), - content, - tool_calls, - current_time, - logprobs, - finish_reason, - )); - - event.json_data(chat_complete).unwrap_or_else(|e| { - println!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - }) -} - /// Generate tokens #[utoipa::path( post, @@ -1208,7 +1151,7 @@ pub(crate) async fn chat_completions( Extension(infer): Extension, Extension(compute_type): Extension, Extension(info): Extension, - Json(chat): Json, + Json(mut chat): Json, ) -> Result)> { let span = tracing::Span::current(); metrics::counter!("tgi_request_count").increment(1); @@ -1219,8 
+1162,11 @@ pub(crate) async fn chat_completions( logprobs, .. } = chat.clone(); + + tracing::debug!("Got chat_template {:?}", infer.chat_template); + let id = chat.next_tool_call_id(); let (generate_request, using_tools): (GenerateRequest, bool) = - chat.try_into_generate(&infer)?; + chat.clone().try_into_generate(&infer)?; span.record("parameters", format!("{:?}", generate_request.parameters)); let logprobs = logprobs.unwrap_or_default(); @@ -1232,167 +1178,41 @@ pub(crate) async fn chat_completions( let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native")); // switch on stream if stream { - let (headers, response_stream) = - generate_stream_internal(infer, compute_type, Json(generate_request), span).await; - - // regex to match any function name - let function_regex = match Regex::new(r#"\{"function":\{"_name":"([^"]+)""#) { - Ok(regex) => regex, - Err(e) => { - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to compile regex: {}", e), - error_type: "regex".to_string(), - }), - )) - } - }; + let (headers, response_stream) = generate_stream_internal( + infer.clone(), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await; let response_stream = async_stream::stream! { let mut response_stream = Box::pin(response_stream); - let mut buffer = Vec::new(); - let mut json_buffer = String::new(); - let mut state = if using_tools { - StreamState::Buffering - } else { - StreamState::Content { - skip_close_quote: false, - } - }; - let mut response_as_tool = using_tools; + let mut state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); while let Some(result) = response_stream.next().await { match result{ Ok(stream_token) => { - let token_text = &stream_token.token.text.clone(); - let usage = stream_token.details.as_ref().map(|details| { - let completion_tokens = details.generated_tokens; - let prompt_tokens = details.input_length; - let total_tokens = prompt_tokens + completion_tokens; - - Usage { - completion_tokens, - prompt_tokens, - total_tokens, + let events = state.push(stream_token); + match events{ + ChatEvent::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer).unwrap(); + assert!(!using_tools); + let (_headers, response_stream2) = + generate_stream_internal(infer.clone(), compute_type.clone(), Json(generate_request), span.clone()).await; + state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); + response_stream = Box::pin(response_stream2); } - }); - match state { - StreamState::Buffering => { - json_buffer.push_str(&token_text.replace(" ", "")); - buffer.push(stream_token); - if let Some(captures) = function_regex.captures(&json_buffer) { - let function_name = captures[1].to_string(); - if function_name == "no_tool" { - state = StreamState::BufferTrailing; - response_as_tool = false; - buffer.clear(); - json_buffer.clear(); - } else { - state = StreamState::Content { - skip_close_quote: false, - }; - // send all the buffered messages - for stream_token in &buffer { - let event = create_event_from_stream_token( - stream_token, - logprobs, - response_as_tool, - system_fingerprint.clone(), - model_id.clone(), - ); - yield Ok::(event); - } - } - } - } - // if we skipped sending the buffer we need to avoid sending the 
following json key and quotes - StreamState::BufferTrailing => { - let infix_text = "\"content\":\""; - json_buffer.push_str(&token_text.replace(" ", "")); - // keep capturing until we find the infix text - match json_buffer.find(infix_text) { - Some(content_key_index) => { - json_buffer = - json_buffer[content_key_index + infix_text.len()..].to_string(); - } - None => { - continue; - } - } - // if there is leftover text after removing the infix text, we need to send it - if !json_buffer.is_empty() { - let event = Event::default(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - let chat_complete = - CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( - model_id.clone(), - system_fingerprint.clone(), - Some(json_buffer.clone()), - None, - current_time, - None, - None, - )); - yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| { - InferError::StreamSerializationError(e.to_string()).into() + ChatEvent::Events(events) => { + for chat_complete in events{ + yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { + tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() })); } - // cleanup the buffers - buffer.clear(); - json_buffer.clear(); - state = StreamState::Content { - skip_close_quote: true, - }; } - StreamState::Content { skip_close_quote } => { - if skip_close_quote && token_text.contains('"') { - break; - } - - // send the content - let event = create_event_from_stream_token( - &stream_token, - logprobs, - response_as_tool, - system_fingerprint.clone(), - model_id.clone(), - ); - - yield Ok::(event); - } - } - - let should_send_usage = usage.is_some() - && stream_options - .as_ref() - .is_some_and(|opts| opts.include_usage); - - if should_send_usage { - let usage_data = usage.unwrap(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - - let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { - id: String::new(), - created: current_time, - model: model_id.clone(), - system_fingerprint: system_fingerprint.clone(), - choices: vec![], - usage: Some(Usage { - prompt_tokens: usage_data.prompt_tokens, - completion_tokens: usage_data.completion_tokens, - total_tokens: usage_data.total_tokens, - }), - }); - - yield Ok(Event::default() - .json_data(chat_complete) - .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())); } } Err(err) => yield Ok(err.into_openai_event()) @@ -1404,8 +1224,13 @@ pub(crate) async fn chat_completions( let sse = Sse::new(response_stream).keep_alive(KeepAlive::default()); Ok((headers, sse).into_response()) } else { - let (headers, input_length, Json(generation)) = - generate_internal(Extension(infer), compute_type, Json(generate_request), span).await?; + let (mut headers, mut input_length, Json(generation)) = generate_internal( + Extension(infer.clone()), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await?; let current_time = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -1413,55 +1238,25 @@ pub(crate) async fn chat_completions( .as_secs(); let (tool_calls, output) = if using_tools { - let gen_text_value: Value = - serde_json::from_str(&generation.generated_text).map_err(|e| { - InferError::ToolError(format!( - "Failed to parse generated text: {} {:?}", - e, 
generation.generated_text - )) - })?; - let function = gen_text_value.get("function").ok_or(InferError::ToolError( - "No function found in generated text".to_string(), - ))?; - - let name = function - .get("_name") - .and_then(Value::as_str) - .ok_or(InferError::ToolError( - "No _name found in generated text".to_string(), - ))? - .to_string(); - - let mut arguments = function.clone(); - if let Value::Object(ref mut props) = arguments { - props.remove("_name"); - } - match name.as_str() { - "no_tool" => { - // parse the content message - let content_message = arguments - .get("content") - .and_then(Value::as_str) - .ok_or_else(|| { - InferError::ToolError( - "No `content` found in generated text".to_string(), - ) - })? - .to_string(); - (None, Some(content_message)) - } - _ => { - let tool_calls = vec![ToolCall { - id: "0".to_string(), - r#type: "function".to_string(), - function: FunctionDefinition { - description: None, - name, - arguments, - }, - }]; - (Some(tool_calls), None) + match crate::chat::parse_output(&generation.generated_text)? { + ChatChoice::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer)?; + assert!(!using_tools); + let (headers_final, input_length_final, Json(generation)) = generate_internal( + Extension(infer), + compute_type, + Json(generate_request), + span, + ) + .await?; + headers = headers_final; + input_length = input_length_final; + (None, Some(generation.generated_text)) } + ChatChoice::ToolCalls(tool_calls) => (Some(tool_calls), None), } } else { (None, Some(generation.generated_text)) @@ -1727,7 +1522,7 @@ pub async fn run( // Shared API builder initialization let api_builder = || { - let mut builder = ApiBuilder::new().with_progress(false); + let mut builder = ApiBuilder::from_env().with_progress(false); if let Some(token) = authorization_token { builder = builder.with_token(Some(token)); } @@ -1781,6 +1576,7 @@ pub async fn run( tokenizer_config_filename, preprocessor_config_filename, processor_config_filename, + chat_template_filename, model_info, ) = match api { Type::None => ( @@ -1788,6 +1584,7 @@ pub async fn run( Some(local_path.join("tokenizer_config.json")), Some(local_path.join("preprocessor_config.json")), Some(local_path.join("processor_config.json")), + Some(local_path.join("chat_template.json")), None, ), Type::Api(api) => { @@ -1801,6 +1598,7 @@ pub async fn run( let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); let processor_config_filename = api_repo.get("processor_config.json").await.ok(); + let chat_template_filename = api_repo.get("chat_template.json").await.ok(); let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await { Some(model_info) @@ -1813,10 +1611,12 @@ pub async fn run( tokenizer_config_filename, preprocessor_config_filename, processor_config_filename, + chat_template_filename, model_info, ) } Type::Cache(cache) => { + tracing::info!("Cache {cache:?}"); let repo = cache.repo(Repo::with_revision( tokenizer_name.to_string(), RepoType::Model, @@ -1827,23 +1627,41 @@ pub async fn run( repo.get("tokenizer_config.json"), repo.get("preprocessor_config.json"), repo.get("processor_config.json"), + repo.get("chat_template.json"), None, ) } }; + // if chat_template_filename is present, load the chat template + let chat_template: Option = chat_template_filename + .and_then(|f| 
@@ -1727,7 +1522,7 @@ pub async fn run(

     // Shared API builder initialization
     let api_builder = || {
-        let mut builder = ApiBuilder::new().with_progress(false);
+        let mut builder = ApiBuilder::from_env().with_progress(false);
         if let Some(token) = authorization_token {
             builder = builder.with_token(Some(token));
         }
@@ -1781,6 +1576,7 @@ pub async fn run(
         tokenizer_config_filename,
         preprocessor_config_filename,
         processor_config_filename,
+        chat_template_filename,
         model_info,
     ) = match api {
         Type::None => (
@@ -1788,6 +1584,7 @@
             Some(local_path.join("tokenizer_config.json")),
             Some(local_path.join("preprocessor_config.json")),
             Some(local_path.join("processor_config.json")),
+            Some(local_path.join("chat_template.json")),
             None,
         ),
         Type::Api(api) => {
@@ -1801,6 +1598,7 @@ pub async fn run(
             let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
             let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
             let processor_config_filename = api_repo.get("processor_config.json").await.ok();
+            let chat_template_filename = api_repo.get("chat_template.json").await.ok();

             let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
                 Some(model_info)
@@ -1813,10 +1611,12 @@ pub async fn run(
                 tokenizer_config_filename,
                 preprocessor_config_filename,
                 processor_config_filename,
+                chat_template_filename,
                 model_info,
             )
         }
         Type::Cache(cache) => {
+            tracing::info!("Cache {cache:?}");
             let repo = cache.repo(Repo::with_revision(
                 tokenizer_name.to_string(),
                 RepoType::Model,
@@ -1827,23 +1627,41 @@ pub async fn run(
                 repo.get("tokenizer_config.json"),
                 repo.get("preprocessor_config.json"),
                 repo.get("processor_config.json"),
+                repo.get("chat_template.json"),
                 None,
             )
         }
     };

+    // if chat_template_filename is present, load the chat template
+    let chat_template: Option<ChatTemplateVersions> = chat_template_filename
+        .and_then(|f| std::fs::read_to_string(f).ok())
+        .and_then(|c| {
+            let res = serde_json::from_str::<ChatTemplateStandalone>(&c);
+            if let Err(e) = &res {
+                tracing::warn!("Could not parse chat template {e:?}");
+            }
+            res.ok().map(|t| t.chat_template)
+        });
+
     // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
+    tracing::warn!("Tokenizer_config {tokenizer_config_path:?} - {tokenizer_config_filename:?}");
     let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path {
         HubTokenizerConfig::from_file(filename)
     } else {
         tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
     };
-    let tokenizer_config = tokenizer_config.unwrap_or_else(|| {
+    let mut tokenizer_config = tokenizer_config.unwrap_or_else(|| {
         tracing::warn!("Could not find tokenizer config locally and no API specified");
         HubTokenizerConfig::default()
     });
+    if chat_template.is_some() {
+        tracing::info!("Using chat template from chat_template.json");
+        tokenizer_config.chat_template = chat_template;
+    }
+
     let tokenizer: Result<Tokenizer, WebServerError> = {
         use pyo3::prelude::*;
         Python::with_gil(|py| -> PyResult<()> {
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 320e7f03..1119347d 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -18,6 +18,7 @@ use std::sync::Arc;
 use thiserror::Error;
 use tokio::sync::mpsc;
 use tokio::sync::oneshot;
+use tracing::warn;
 use tracing::{instrument, Span};
 use {once_cell::sync::Lazy, regex::Regex};

@@ -694,6 +695,14 @@ fn image_tokens(
             "<|vision_start|>{:?}<|vision_end|>",
             "<|image_pad|>".repeat(config.get_number_of_features(height, width))
         ),
+        Gemma3(_config) => {
+            // TODO: prefer using the config to determine the number of features
+            let num_mm_soft_tokens_per_image = 256;
+            format!(
+                "\n\n<start_of_image>{}<end_of_image>\n\n",
+                "<image_soft_token>".repeat(num_mm_soft_tokens_per_image)
+            )
+        }
         _ => unimplemented!("Images tokens are not supported for this model configuration"),
     }
 }
@@ -721,8 +730,8 @@ fn prepare_input(
     static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
     let (tokenizer_query, input_chunks) = match config {
         Some(
-            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_)
-            | Qwen2Vl(_) | Qwen2_5Vl(_)),
+            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Paligemma(_)
+            | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
         ) => {
             let mut input_chunks = Vec::new();
             let mut tokenizer_query = String::with_capacity(inputs.len());
diff --git a/server/Makefile b/server/Makefile
index 0db6f89b..3abc917e 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -39,6 +39,7 @@ install: install-cuda
 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
	uv pip install -e ".[attention,bnb,marlin,moe]"
	uv pip install nvidia-nccl-cu12==2.22.3
+	kernels download .
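Taken together, the run() changes above fetch chat_template.json next to the tokenizer configs and let it override the template embedded in tokenizer_config.json. A minimal sketch of that precedence, with the template simplified to a plain String (the real field is an enum of chat template versions, and the struct name here is illustrative):

    use serde::Deserialize;

    // Shape of a standalone chat_template.json file, template simplified to String.
    #[derive(Deserialize)]
    struct StandaloneChatTemplate {
        chat_template: String,
    }

    // A template from chat_template.json, when present and parseable, wins over
    // whatever tokenizer_config.json carried.
    fn resolve_chat_template(
        standalone_json: Option<&str>,
        from_tokenizer_config: Option<String>,
    ) -> Option<String> {
        standalone_json
            .and_then(|raw| serde_json::from_str::<StandaloneChatTemplate>(raw).ok())
            .map(|t| t.chat_template)
            .or(from_tokenizer_config)
    }

    // e.g. resolve_chat_template(Some(r#"{"chat_template":"A"}"#), Some("B".into()))
    // yields Some("A"); with no standalone file it falls back to Some("B").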
install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock deleted file mode 100644 index 7dc75943..00000000 --- a/server/hf-kernels.lock +++ /dev/null @@ -1,7088 +0,0 @@ -[ - { - "repo_id": "kernels-community/paged-attention", - "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "609570440c63122010e6254ac2f92d4e4e52ec02" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_fao6f4gjjrpl6.abi3.so", - "blob_id": "a4e60f2c567eb63c84430e9b80acaa0aa6974b1e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_ops.py", - "blob_id": "9e52382b912b4e2d07f84982f762345debdbbfc8" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_paged_attention_eo7ts45r6k64y.abi3.so", - "blob_id": "c20f9501a41daa820dfda27434674d032931b51e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "5f01e3f8c4ae3a031f109f78e010014d34347647" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_5odgyxqhwqtv2.abi3.so", - "blob_id": "74f9714690337f49661c641a4f60f6e1e1f56cfa" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "a3016a6b1cd7ae051012084bbd39d6f2e0913ace" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_uy2moinaww2jc.abi3.so", - "blob_id": "445652acd4719542710cda86a2d08c70a56c8094" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/__init__.py", - 
"blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_ops.py", - "blob_id": "e2cd992a80d4b938f243f0e6060e863278aca7f6" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_paged_attention_35dt23tewn2p2.abi3.so", - "blob_id": "1f6414c382a753edb7512927ac5f3e31b196531d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "150412d67365be8ae5668f83d1939148bb576050" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_fhq57q56w3m5o.abi3.so", - "blob_id": "ee97eee26a4de8d14d7ccdadaf406eed8405de39" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "2bfef111c96308e595eb628bc88ab660a443089c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_xvepb4loq5mm2.abi3.so", - "blob_id": "1ea51bd49f8ec76bbe306a261021da52fe6a980f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "8928daeec47128544cef187bf18f214fc2238019" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_uyfdujhnc2xoe.abi3.so", - "blob_id": "cf8ebe40f27db0fa87c46d7b4066494e65843820" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_ops.py", - "blob_id": "dff8537df63e1ef37769a6b7ba6b8c58192d7faa" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_paged_attention_pervvqmod6pi4.abi3.so", - "blob_id": "77eb42e3471e9aa84d1f5d9854995c9737ed6bf3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "543c64d1589cb1747d7dc1ac29bd8f2cbeb61ab7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_24rowhxd5ebcc.abi3.so", - "blob_id": "43ec3529d8eac816c31cc1eaad4cc2baa3cbd3d6" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "1d62b9bb1cfb040d7f68cd108ac9067100b4cf2d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_5yleoqr3zje4w.abi3.so", - "blob_id": "ffed60cc0a3948bdea6aa7fb4d486d9b943215ec" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_ops.py", - "blob_id": "ee817d13be64b46e3cb44ad192af4a5f3817bbf7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_paged_attention_3rbp7xipfucgo.abi3.so", - "blob_id": "5d5b3ffda2fd6a830d12341bab26dc5ec03f4a86" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - } - ] - }, - { - "repo_id": "kernels-community/moe", - "sha": "605a216f507b9a97b543140dee8937a4622069a8", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2f2wzwk42r5t2.abi3.so", - "blob_id": "b1f0ac7d52d2cbb7b49dd4e3e23eaf0b6acd3364" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "83a6a6a42d633c9b40e263b40b028086d2609b80" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - 
"filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - 
"blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_qx7m4hiw6tx7s.abi3.so", - "blob_id": "cfdb823fabc296c258f58ce8e03a347be7eb558f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py", - "blob_id": "0d77c5d3e29106cf62a45153770fafbff59b2932" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": 
"55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8.py", - "blob_id": 
"23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_ctvji3e7dq64w.abi3.so", - "blob_id": "db5c3be15bc329bc0aa5b87d34223b747751484e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "b22bd5b27938464e6c7359b1974db9b472effa6b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - 
}, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2dhx2xkm6c5wu.abi3.so", - "blob_id": "56dc9c91ddc02b281dcf7c996071bee341ef026c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "97f4f6344f4a61a52c8077cdc7884400e56c558b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - 
"filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_wfwoejktckaue.abi3.so", - "blob_id": "a679d38667a74beffaca30bb9c6628c6b7d0b1c0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py", - "blob_id": "9aebf2be86d230b6de5510163c7b53dcc3aa7c51" - }, - 
{ - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", 
- "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": 
"e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_plhnk6yxrdq3c.abi3.so", - "blob_id": "a16cd30c7ff53b3d73fa081369b6443efa5fb184" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "5b5d43b13c586ead5f177bcb71ba17c078eb016d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": 
"0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_mrhezivmofzdg.abi3.so", - "blob_id": "5423719c9ac75f96528fc0b7386a108aedc996b1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "f78c4d78eceaa86a9d245eea4d5562167db8f59b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": 
"200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - 
"filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" 
- }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_lvqy7x44edhqo.abi3.so", - "blob_id": "51f9c5792c1d7bcb03a8906e9bc60e779ba1b343" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "510c892dcc2479877c6c2fc5c20f6a534dc90d51" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": 
"5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_gasoel7noy6kw.abi3.so", - "blob_id": "cd914e2830fbe3fbdcf31c5fa2f37c384d2c36d5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py", - "blob_id": "c69fb498baf329dda803ec0f90dc4b7756fb1ff0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_cobe53r755p6a.abi3.so", - "blob_id": "0082e984366e264ad72eb429f4e138d45f5cbcaf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "153d250d92d9f1bf4b6e318287370706aa5cd385" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - 
"blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_iqmfy23diekyw.abi3.so", - "blob_id": "25f80aad80865365eae32a1609be0219f7e6582e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "73e0213d234c5717aea5d708cff8e0938f14bce9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": 
"4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, 
- { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": 
"af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_6xluzhr5x6fw4.abi3.so", - "blob_id": "63a2723397b7c031719534dd6c23e3eaa1a85a23" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py", - "blob_id": "82fd79cf07706341ad17f8ea5d841b6d018cd676" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - } - ] - }, - { - "repo_id": "kernels-community/quantization", - "sha": "95272c71ca71b1ddbacb0105dab54e5d5240bd5c", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "07486f19bf899de0eef6e7de9a2a1b08f48e4530" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_diycyzqnjjd5k.abi3.so", - "blob_id": "4f729314a4b4a86bc21b05b5e50f6677e7a83a06" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py", - "blob_id": "5430a686ea7ef8be7217b48a28b5429606cf93d6" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_elb4wso45znfy.abi3.so", - "blob_id": "831303548db0b8341f67b8c281c5dff35d68c5e3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "f5217e75a63c8f07110478117396d92226c0c0ca" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_unicgkq3a7la6.abi3.so", - "blob_id": "c16ea0b257847bdac8068a18921e0a3b5dce6b92" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, 
- { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "1e16b3f78f15e21f94d1241338a5ad6fcf9fa6dc" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_f4o2yj2oj7kni.abi3.so", - "blob_id": "bda41c03212d0ca513c15046c8b4ca07edf23544" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py", - "blob_id": "0d812eef750061481eba1b7ed5fa708cfec31f42" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_6nd6n6ctlfohq.abi3.so", - "blob_id": "3a69017602930aa71d55f179769d10bcba21b444" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - 
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "e5bdaf0f73a5c870ed0d8ae6345cbec989274e16" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cxckebwxmlb3i.abi3.so", - "blob_id": "b51fa64aa95165e295a32926f8ca1e9ecea61ed3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "fd057ea2d2b103efdf01641c9767641e093ed947" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_quantization_vsrdj55erbiei.abi3.so", - "blob_id": "eb6e2055b9b9330bcb04c316d2fb4570918d24c3" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": 
"c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "0527a08325accf310054df153fbcaf26151b921e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_quantization_gsd2xjzq76rwy.abi3.so", - "blob_id": "46d5dc19eb270598aecb4654446449097da3ad8f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_ops.py", - "blob_id": "83eb9dbc2c91b6b305544ea9f84d41ef73f5ea01" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_quantization_hbfrcozzte6aq.abi3.so", - "blob_id": "d8feec129f933b3eecb28862aec4167876fd5ca8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "3b54f0bfc1030429b27a48d4e574778ebe86f820" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_quantization_womr3pvjbirhe.abi3.so", - "blob_id": "f799e8bfc915d482daef4890ee6f8d6b342e0da1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "c83bf352ab1ca3283391b3dd8c209c1bf6a60eb1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_55qjuxe2uqrp6.abi3.so", - "blob_id": "4d1b45a1c98552ca7e4ba4c0e583e266e9f70060" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/_ops.py", - "blob_id": "7552c079934b6f76bb1c221358e0ac2f1ca449be" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_cyhrq7sx4uskw.abi3.so", - "blob_id": 
"0a4c95a688fcacd6a10c04e101e1deec5a03ffc9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - } - ] - }, - { - "repo_id": "kernels-community/quantization-eetq", - "sha": "a80ce846d6270ddddeee109523ed947f594f246b", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so", - "blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so", - "blob_id": "ea27fb040515267ec631cec5545b878da680e7cc" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "bb409419898138ffa9ade9ba505a167a067ea378" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so", - "blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so", - "blob_id": "c98d156835e442b039d38a82e9f111036750329c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so", - "blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so", - "blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so", - "blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so", - "blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so", - "blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so", - "blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so", - "blob_id": "1946e4c2fab63243d051012cb12e19895828145f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so", - "blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - } - ] - }, - { - "repo_id": "kernels-community/rotary", - "sha": "4db658e027ec752840bb3f557ee076413b8db03f", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "4fe035c87ea1300ffedcfce17338167dd946e0e8" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_rotary_5yzc45v7kk3yu.abi3.so", - "blob_id": "f315754ccb3e8b9dfb4d8954aefaac61b2a4e8bc" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_ops.py", - "blob_id": "d45359065a7cdc43e2d512d38fc0bfcd88138835" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_rotary_tbiepw2a2ep3e.abi3.so", - "blob_id": "9bc986ca760b6a57d05e891c3def1769341a2c29" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "7421978682125139f1169f3f71789e0cb44d3b45" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_rotary_6w5syhrhmerj6.abi3.so", - "blob_id": "35caf7d755000ca2afac33cc7d4f344112751f9f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "d6f569b471eae628e738b3504c8a9a18b4973d97" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_rotary_joujmbgvsytzg.abi3.so", - "blob_id": "30c4aaa83f2549f7363631a51b3341cdf0612f15" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/_ops.py", - "blob_id": "f1f71f34bf0f3c5dffb7c147b48f6396f8054310" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/_rotary_mi2o7e7sishyw.abi3.so", - "blob_id": "e022e9c1101bdb89d43913979107f7a56717ea6d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "a46c19bd5adfb85d5b7795b3b9277e416f31d8ce" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_rotary_rngiohfhfwuge.abi3.so", - "blob_id": "1621cba0150465f67aa931fe3a55e38928b48bcb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "3296d23431d1ec084e8644ff5d3d203a74d82ea1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_rotary_alv7mzltcxxpq.abi3.so", - "blob_id": "2f8b3b93bb7c8fae22c8e08c67771683f549f170" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "0bae33b64c71d6a6ad748be66a410a662bb5b28a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_rotary_c4eyapeep6gty.abi3.so", - "blob_id": "3a15076dbd1f9a05f1089cc2cb13c03e5838f2b9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_ops.py", - "blob_id": "5c5d7e4497962e0a3e9531ec6a5fdb18e995e0f8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_rotary_lodp6xeztste6.abi3.so", - "blob_id": "efc4a4a0001fba7b0743d1d4a9774d1fc9089ee5" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "3cdda1ceda90e76b30b08cae6ad718aa2c2ec3ef" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_rotary_z27mls7mz4e7m.abi3.so", - "blob_id": "43b0f08ea035cd2fadb6e119802e7d841a523246" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": 
"eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "914f1bb6f9499d0245c3f47345f3b95582be28b2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_rotary_3bktke4p3hz3a.abi3.so", - "blob_id": "29c0bba399e6f348ad8baf93daa5166a6ec6994a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_ops.py", - "blob_id": "11056cd0ed09530830b614b8207cbb7fa7ef3288" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_rotary_fvednlzeqgg5s.abi3.so", - "blob_id": "8c7c34d9e603640576ba522dcbed341c0d780a9c" - } - ] - } -] diff --git a/server/kernels.lock b/server/kernels.lock new file mode 100644 index 00000000..9e11de68 --- /dev/null +++ b/server/kernels.lock @@ -0,0 +1,272 @@ +[ + { + "repo_id": "kernels-community/paged-attention", + "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-8e0aa39abab82f1d21b661d35e0470a24c3ebbdda38532ded805c18037a1ad1e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-b0c3aef6c4c9aac627975cb1a2bfc46a70390763c8165575b89d1651d007c38a", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-960fbc8998439d779adb47fb2a37cce68c7dc075d8a49893bd487be9ca2d1389", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-9d6d60c411c55aa2f9d7c681c2be96f4262d56c96f592f3d4fb35ce4f4f1e18e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-98c0a305b2cc9b7be757fab923d9aa406c686dcd0460e462926f87d051ef3d19", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-71e586416213c96ffbdeae0d077ba97bfde5b00005f2746d4cba2320cb53bf87", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-2f559312c54d558b33a4082ffc3fcf923f51da40ced19bfc8920e998ba2b71bf", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-6033b41a0f8a9509887c6171f0b42d9aa738490903b3fd5ea2c52703c5fb8fc3", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-3139f66a53f2bf0c314b4d309893095746bdc9c3914c904fc31adfdf553ed219", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-2173d77e384d8e2881fc38603992c09e8be7bcd9da4cafdd4f2a5ce0ce22caf4", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-7b1aaef81e01ecce83e03c50872910680ff2953f7c6ffd3ff15e8d9497ca9239", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-818b160a88b12b8e871099e40f76aa436ee828e2e060ecc35502dbe34a6ebd3b", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/moe", + "sha": "605a216f507b9a97b543140dee8937a4622069a8", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-855d92f02be3bfba0758161fa1266159d76c172e7c5d43d30816d22cfba76074", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-e6e780230477bbbc26fc40cc7fcff50298155998af4fc77a026c9f815ec984b1", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": 
"sha256-52c1fb337033c4d1d7a279c5cb28aebbc7389976f21dc5803aeb16b2f7aeb94c", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-1fb654e8d02dda2a2382d1fb3a3ca9738d292eea674b30b80030cdcdfb6a0035", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-0cf235f1de85d4ce7490c79aa64220f608f886f313b676d91c331a6a2fd67bbb", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-3def11fee9bf1ea9b1579206fd5f5ecbcaad47ac478e2c3aa7b2c9c7fd5db934", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-3a49ee03f675190a79c7c74a45cc403d491eceb63a943f47d52064a11ca6ef6f", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-dbf20cb11db7d53e11147ab13641eefaa235f9ac2fde1beaf8f56f850c11bd54", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-8a07232ab316e8eab74747662cb7b86aac03f44ff158f275768fd59390df2525", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-cdd46301af997eeace5e016d8590969981b3a3f8647828d04baa5fa10c696746", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-c865188e9d2c17f3358f3d343fb40340232457572744bf85efd6b20af545d5f3", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-2a8b09f3272ea80491e78a39ff886680471d99f7ba571581809adfe918013898", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/quantization", + "sha": "95272c71ca71b1ddbacb0105dab54e5d5240bd5c", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-2d0a274cf0117bf7880d6040adafa1b70fe8bff3a00ef2834ed5435a6b525a49", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-116458beac63ea5eeb1e7fba7edc68d160cd8ac28f55b926d79035551aac7d5f", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-cace644c6fb04470384796c18987135cb051dfb90a14e902c51a3786fc07c599", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-104c6961cd3e1a74efdf14ea2172acc6647846852fccafe3698a27a6cf37941d", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-cdc95b41aa91a803f11f8cd53001895c2b69550b5af2fb278d6f124381229d0b", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-d5388469cb6074f196f20b1e1e4805bb3c967a8147b31ca2c0461aa87b50604e", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-70c4bb3792c4c3207d4963173d8d0ef3b2bda677151aef140662dd87bfa1b69f", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-bcacbb2232f49345f27e07fa821b48a7e3df643c01af37281fcafc74c471f682", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-344d20964f7eb133e5ec6fda976fa5ee62807b739a4361f236aca5ae53beb9ac", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-dfaec226550254fbce1a5c7e2f547e85700958a1a4087e1c873d22e6f71a5ceb", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-0abe6460d0a2202b0086e3663092595e5b93b9a9cbb85c10034180cc9bfebc6e", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": 
"sha256-68e156f94c3c0c9523773b62eaeced93766e0d9ee67d8191fb9570fb5af30d5b", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/quantization-eetq", + "sha": "a80ce846d6270ddddeee109523ed947f594f246b", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-e06beb00799b1e656583eb0496f09fc0bf1b26f75e9864a2fe19ebd5b62c3671", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-c128d3ef6558cfedf045c4a713891792708851b7f6f027de835d9083cb3b297d", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-c7e2e14fc114788634b34a4f670f7bf4d27321e5ed40ff446f5a25eef70222c7", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-58dad53cfbf1315af464f9d8ba7be9012089c839d4f06a8d2cf8ce0deaf5949a", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-6519af49c0f689744a7b49497ad2bea1524b69e4095446087d7ab622b898aa30", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-94e0731b58a9ba0e5e2f37b100c8d987c80b5d349008ef625917d020b6c52d25", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-e5b04475538f49d7b4ffded080e4c9c86a658abc12667e3838ebcc410ab1eef4", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-783c02db737a6ec9958b3090f164b87888d3b26e30a4fb6e1cd0c1a635753fab", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-a3d81f82f9cfe9d8a6d46758758b3a1b3055d902f41917b4ef2976373db843d6", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-f1de67e17944a9816f778c72ae73bbbc90d795cb4885c2f9ee5e0b9a3c57583b", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-789b50d767a5121a7e5a52eaf0c8e897bf1787f049ca08faffb220e5053a5f10", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-7c7fe57fea7b9be253085d506f01b2487b2306f22bdffe1de44397fc9f8a3613", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/rotary", + "sha": "4db658e027ec752840bb3f557ee076413b8db03f", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-907df2035267a65793985bb7f69fb2a975955fb08c2bbc78c58def43d02801da", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-b614735ae61ee2c1825a3c823fa0cdd3aa07d0bb3f4106001b9e1a557c0ca9b9", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-f2e98ec72faaebc1cae25f83ccdbb151868b6902fb5a0623e09d700a514c2a7e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-421214c5a576fac2e0b7998395dccd7f66010f65a6fc647ce06b106ea91105d2", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-9d1c464cf7f391975afa48f2254a639f41582155ad1b50c25bb122418ce8db58", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-82f8012d78304efaa7318f106907630294d10c8b5c9f56923c71df0b03e09f14", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-a3247919dcc392efc7e54725dfbce9ee8a796fe4ee53d113048b313de074d3da", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": 
"sha256-a21c9734d15946f4cc967d0555d45d7effc6624990c6889fc49162af744fbbe9", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-01cdda160425b29db0d9bb084874ade4ac081735f9717f272aaefe5bcb379ae1", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-17be5b770418ad47101c49d8945b5aa32af9eb5a840bdffb0514d0e264edd860", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-3cd4b9f63cc903e01325b7e5b204e40fc6600c0685f2e19e6f1fa604a599d82d", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-c569f4a4f9b64792507c58d7cfa31dde1285b52125ef07cc98d9f23636af09ca", + "hash_type": "git_lfs_concat" + } + } + } +] diff --git a/server/pyproject.toml b/server/pyproject.toml index 07ea1048..e3ec734a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "grpcio>=1.67.0", "grpcio-reflection>=1.67.0", "grpcio-status>=1.67.0", - "hf-kernels>=0.1.5", + "kernels>=0.2.1", "hf-transfer>=0.1.8", "loguru>=0.7.3", "numpy>=1.26,<3", @@ -36,7 +36,7 @@ dependencies = [ ] [build-system] -requires = ["hf-kernels>=0.1.2", "setuptools"] +requires = ["kernels>=0.1.7", "setuptools"] build-backend = "setuptools.build_meta" [tool.kernels.dependencies] diff --git a/server/text_generation_server/adapters/lora.py b/server/text_generation_server/adapters/lora.py index cdcfe91b..782d66e4 100644 --- a/server/text_generation_server/adapters/lora.py +++ b/server/text_generation_server/adapters/lora.py @@ -205,7 +205,6 @@ class LoraWeights(AdapterWeights): lora_a_list = [None] * nlayers lora_b_list = [None] * nlayers - # import ipdb; ipdb.set_trace() for layer_id in range(nlayers): key = (layer_id, layer_type) if key not in target_to_layer: diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 4f25cc19..fb50dda6 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -38,6 +38,7 @@ def paged_attention( *, kv_scales: KVScales, softcap: Optional[float] = None, + window_size_left: Optional[int] = -1, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py # Copyright 2023 The vLLM team. All rights @@ -79,12 +80,15 @@ def paged_attention( sm_scale=softmax_scale, k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, + window_left=window_size_left, ) elif ATTENTION == "flashdecoding": max_q = 1 max_k = max_s import flash_attn_2_cuda + window_size_right = -1 if window_size_left == -1 else 0 + # TODO fixme when flash contains the fix. 
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index 4f25cc19..fb50dda6 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -38,6 +38,7 @@ def paged_attention(
     *,
     kv_scales: KVScales,
     softcap: Optional[float] = None,
+    window_size_left: Optional[int] = -1,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
     # Copyright 2023 The vLLM team. All rights
@@ -79,12 +80,15 @@ def paged_attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
+            window_left=window_size_left,
         )
     elif ATTENTION == "flashdecoding":
         max_q = 1
         max_k = max_s
 
         import flash_attn_2_cuda
 
+        window_size_right = -1 if window_size_left == -1 else 0
+
         # TODO fixme when flash contains the fix.
         # Number of splits is not correctly handled
         # by the current path
@@ -109,8 +113,8 @@ def paged_attention(
             softmax_scale,
             False,  # zero_tensors
             True,  # causal
-            -1,  # Window_left
-            -1,  # Window right
+            window_size_left,  # Window_left
+            window_size_right,  # Window right
             softcap,
             False,  # return softmax
             None,  # generator
@@ -253,6 +257,7 @@ def attention(
             sm_scale=softmax_scale,
             k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
             v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
+            window_left=window_size_left,
         )
 
     # If we are using flashdecoding or paged, we always use flash-attn for
diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py
index d2345184..9479b606 100644
--- a/server/text_generation_server/layers/attention/flashinfer.py
+++ b/server/text_generation_server/layers/attention/flashinfer.py
@@ -52,7 +52,6 @@ def use_prefill_with_paged_kv_state(
     page_size: int,
     kv_dtype: torch.dtype,
     q_dtype: torch.dtype,
-    window_left: int,
 ):
     """
     Context manager to set the active flashinfer prefill state to the given
@@ -95,7 +94,6 @@
             kv_data_type=kv_dtype,
             q_data_type=q_dtype,
             page_size=page_size,
-            window_left=-1 if window_left is None else window_left,
         )
         yield
     finally:
@@ -172,7 +170,6 @@ def use_decode_state(
     page_size: int,
    kv_cache_dtype: torch.dtype,
     q_dtype: torch.dtype,
-    window_left: int,
 ):
     """
     Context manager to set the active flashinfer decoding state to the given
@@ -209,7 +206,6 @@
             page_size=page_size,
             data_type=kv_cache_dtype,
             q_data_type=q_dtype,
-            window_left=-1 if window_left is None else window_left,
         )
         yield
     finally:
diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py
index 54422308..2b89060e 100644
--- a/server/text_generation_server/layers/attention/ipex.py
+++ b/server/text_generation_server/layers/attention/ipex.py
@@ -78,6 +78,7 @@ def paged_attention(
     *,
     kv_scales: KVScales,
     softcap: Optional[float] = None,
+    window_size_left: Optional[int] = -1,
 ):
     if softcap is not None:
         raise NotImplementedError("softcap is not available in IPEX")
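Aside (editor note, not part of the patch): every `paged_attention` backend now takes `window_size_left`, with `-1` meaning "no limit", and derives `window_size_right = -1 if window_size_left == -1 else 0` so a sliding window stays causal. For flashinfer the window moves out of the cached prefill/decode plan state (the removals above) and is instead passed per call as `window_left=window_size_left`. A self-contained sketch of the boolean mask these two arguments imply; the real kernels never materialize this mask, and the function name is hypothetical:

```python
# Illustration of flash-attn style (window_left, window_right) semantics
# for a causal model: key position k is visible from query position q iff
# q - k <= window_left and k - q <= window_right (here window_right = 0).
import torch

def sliding_window_mask(seq_len: int, window_left: int) -> torch.Tensor:
    q = torch.arange(seq_len).unsqueeze(1)  # query positions, shape (L, 1)
    k = torch.arange(seq_len).unsqueeze(0)  # key positions, shape (1, L)
    causal = k <= q                         # window_right = 0
    if window_left == -1:                   # -1 disables the left bound
        return causal
    return causal & (q - k <= window_left)

# Full causal attention vs. a 4-token sliding window:
print(sliding_window_mask(6, -1).int())
print(sliding_window_mask(6, 4).int())
```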
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
index 65f3ea41..518e55ee 100644
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -59,6 +59,7 @@ def paged_attention(
     *,
     kv_scales: KVScales,
     softcap: Optional[float] = None,
+    window_size_left: Optional[int] = -1,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
     # Copyright 2023 The vLLM team. All rights
@@ -82,6 +83,8 @@ def paged_attention(
         max_k = max_s
 
         import flash_attn_2_cuda
 
+        window_size_right = -1 if window_size_left == -1 else 0
+
         if softcap is None:
             softcap = 0.0
         out = flash_attn_2_cuda.varlen_fwd(
@@ -101,8 +104,8 @@ def paged_attention(
             softmax_scale,
             False,  # zero_tensors
             True,  # causal
-            -1,  # Window_left
-            -1,  # Window right
+            window_size_left,  # Window_left
+            window_size_right,  # Window right
             softcap,
             False,  # return softmax
             None,  # generator
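Aside (editor note, not part of the patch): the next hunks register two new model types. Dispatch keys come from the checkpoint's `config.json`: Gemma 3 multimodal checkpoints ship with `model_type: "gemma3"` (routed below to `VlmCausalLM`) and the text-only ones with `model_type: "gemma3_text"` (routed to `FlashCausalLM`). A hedged sketch of inspecting that field, assuming a `transformers` release that already knows Gemma 3; the model ids are illustrative:

```python
# Peek at the dispatch key that get_model() switches on.
from transformers import AutoConfig

vlm_cfg = AutoConfig.from_pretrained("google/gemma-3-4b-it")
print(vlm_cfg.model_type)  # expected: "gemma3" -> Gemma3ForConditionalGeneration

txt_cfg = AutoConfig.from_pretrained("google/gemma-3-1b-it")
print(txt_cfg.model_type)  # expected: "gemma3_text" -> FlashGemma3ForCausalLM
```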
+ config_class=Gemma3Config, + processor_class=Gemma3Processor, + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + elif sharded: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma3")) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) if model_type == COHERE: if FLASH_ATTENTION: diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index ebf1b80e..2554bd26 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -287,6 +287,7 @@ class FlashGemma2Attention(torch.nn.Module): max_s, softcap=self.softcap, kv_scales=self.kv_scales, + window_size_left=self.window_size, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py new file mode 100644 index 00000000..70fe9a3d --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py @@ -0,0 +1,902 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
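+#
+# Attention layout note: Gemma3 interleaves local sliding-window layers with
+# periodic global-attention layers. With the default sliding_window_pattern of 6
+# (see configuration_gemma3.py), every sixth layer is global and the rest are
+# local. A minimal sketch of the is_sliding flag computed in FlashGemma3Model
+# below, assuming the default pattern value:
+#
+#     >>> pattern = 6
+#     >>> [bool((layer_id + 1) % pattern) for layer_id in range(8)]
+#     [True, True, True, True, True, False, True, True]
+#
+# True marks a local sliding-window layer, False a global-attention layer.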
+ +import torch +import torch.distributed +from torch import nn +from typing import Optional, List, Tuple +import copy + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, + # + SpeculativeHead, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) + +import torch +import torch.nn.functional as F + + +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) + + +from text_generation_server.layers.attention.kv_cache import get_kv_scales +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.utils.weights import UnquantizedWeight +from transformers.activations import ACT2FN +from text_generation_server.layers.attention import ( + paged_attention, + attention, + Seqlen, +) + + +ATTENTION_TYPE_GLOBAL = "global" +ATTENTION_TYPE_LOCAL = "local_sliding" + + +class Gemma3FastRMSNorm(FastRMSNorm): + @classmethod + def load(cls, prefix: str, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 + weight = weights.get_tensor(f"{prefix}.weight") + 1 + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + return new + + # perform the multiplication in full precision and downcast after + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = hidden_states * self.weight + return hidden_states.to(self.dtype), residual + + +def load_attention(config, prefix: str, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.head_dim + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias=None)) + + +class FlashGemma3Attention(torch.nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.head_size = config.head_dim + self.causal = causal + if is_sliding: + self.window_size = config.sliding_window + # TODO: remove this hack to support local sliding window + config = copy.deepcopy(config) + config.rope_scaling = dict(rope_type="default") + self.rotary_emb = 
PositionRotaryEmbedding.static( + config=config, + dim=config.head_dim, + base=config.rope_local_base_freq, + device=weights.device, + ) + else: + self.window_size = -1 + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=config.head_dim, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = ( + config.query_pre_attn_scalar**-0.5 + if config.query_pre_attn_scalar is not None + else None + ) + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + self.softcap = None # config.attn_logit_softcapping + + query_key_value = load_attention(config, prefix, weights) + self.query_key_value = TensorParallelMultiAdapterLinear.load( + query_key_value, + layer_id, + ["q_proj", "k_proj", "v_proj"], + sizes=[ + self.head_size * config.num_attention_heads, + self.head_size * config.num_key_value_heads, + self.head_size * config.num_key_value_heads, + ], + process_group=weights.process_group, + ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + + o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + layer_id, + "o_proj", + process_group=weights.process_group, + ) + + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + self.q_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps + ) + self.k_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps + ) + self.enable_gqa = self.num_heads != self.num_key_value_heads + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ): + + qkv = self.query_key_value(hidden_states, adapter_data) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + + kv = kv.view(-1, 2, self.num_key_value_heads * self.head_size) + key = kv[:, 0] + value = kv[:, 1] + + query = query.reshape(-1, self.head_size) + key = key.reshape(-1, self.head_size) + + query, _ = self.q_norm(query.contiguous()) + key, _ = self.k_norm(key.contiguous()) + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_key_value_heads, self.head_size) + value = value.view(-1, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, key, cos, sin) + + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) + + # Prefill + if cu_seqlen_prefill is not None: + if attention_mask is None: + # flash attention + attn_output = attention( + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, + window_size_left=self.window_size, + softcap=self.softcap, + ) + else: + lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1] + + # Split tensors using vectorized split + query_list = torch.split(query, 
lengths.tolist(), dim=0) + key_list = torch.split(key, lengths.tolist(), dim=0) + value_list = torch.split(value, lengths.tolist(), dim=0) + + padded_query = torch.nn.utils.rnn.pad_sequence( + query_list, batch_first=True + ) + padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True) + padded_value = torch.nn.utils.rnn.pad_sequence( + value_list, batch_first=True + ) + + padded_query = padded_query.transpose(1, 2).contiguous() + padded_key = padded_key.transpose(1, 2).contiguous() + padded_value = padded_value.transpose(1, 2).contiguous() + + # Compute attention + attn_output = F.scaled_dot_product_attention( + padded_query, + padded_key, + padded_value, + attn_mask=attention_mask, + scale=self.softmax_scale, + enable_gqa=self.enable_gqa, + ) + + attn_output = attn_output.transpose( + 1, 2 + ) # [batch_size, seq_len, num_heads, head_dim] + max_seq_len = padded_query.size(2) + seq_range = torch.arange( + max_seq_len, device=padded_query.device + ).unsqueeze(0) + lengths_tensor = torch.tensor( + lengths, device=padded_query.device + ).unsqueeze(1) + mask = seq_range < lengths_tensor # [batch, max_seq_len] + attn_output = attn_output[mask] # [total_seq_len, num_heads, head_dim] + + # Decode + else: + attn_output = paged_attention( + query, + kv_cache, + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + softcap=self.softcap, + kv_scales=self.kv_scales, + window_size_left=self.window_size, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) + + +class Gemma3MLP(nn.Module): + def __init__(self, prefix, config, weights, layer_id): + super().__init__() + act = config.hidden_activation + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + layer_id, + ["gate_proj", "up_proj"], + sizes=[ + config.intermediate_size, + config.intermediate_size, + ], + process_group=weights.process_group, + ) + + down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + layer_id, + "down_proj", + process_group=weights.process_group, + ) + + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) + + +class FlashGemma3Layer(nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.self_attn = FlashGemma3Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=is_sliding, + ) + self.mlp = Gemma3MLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id + ) + + self.input_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + 
self.post_attention_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.pre_feedforward_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.pre_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.post_feedforward_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.post_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ) + + # faster post attention rms norm + normed_attn_res_output, _ = self.post_attention_layernorm(attn_output) + normed_attn_res_output = normed_attn_res_output + res + res = normed_attn_res_output + + pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output) + mlp_output = self.mlp(pre_normed, adapter_data) + post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output) + + return post_hidden_states, normed_attn_res_output + + +class FlashGemma3Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + + self.layers = nn.ModuleList( + [ + FlashGemma3Layer( + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=bool((layer_id + 1) % config.sliding_window_pattern), + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + adapter_data: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + attention_mask_local: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + + residual = None + for i, layer in enumerate(self.layers): + cos, sin = self.layers[i].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ( + attention_mask + if self.layers[i].self_attn.window_size == -1 + else attention_mask_local + ), + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashGemma3ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights, *, causal: bool = True): + super().__init__() + + embed_norm = config.hidden_size**0.5 + if not prefix: + prefix = "model" + else: 
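+            # When wrapped (e.g. by Gemma3ForConditionalGeneration, which loads the
+            # text backbone with prefix "language_model"), the text weights resolve
+            # under "<prefix>.model" in the checkpoint.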
+ prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemma3Model( + prefix=prefix, config=config, weights=weights, causal=causal + ) + self.lm_head = SpeculativeHead.load( + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, + weights=weights, + ) + # self.softcap = config.attn_logit_softcapping + # assert isinstance(self.softcap, float) + self.softcap = None + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) + + hidden_states = self.model( + input_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + + return logits, speculative_logits + + +class Gemma3MultimodalInputProjection(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.mm_input_projection_weight = weights.get_tensor( + "multi_modal_projector.mm_input_projection_weight" + ) + + self.mm_soft_emb_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.mm_soft_emb_norm", + weights=weights, + eps=config.vision_config.layer_norm_eps, + ) + + self.patches_per_image = int( + config.vision_config.image_size // config.vision_config.patch_size + ) + self.tokens_per_side = int(config.mm_tokens_per_image**0.5) + self.kernel_size = self.patches_per_image // self.tokens_per_side + self.avg_pool = nn.AvgPool2d( + kernel_size=self.kernel_size, stride=self.kernel_size + ) + + def forward(self, vision_outputs: torch.Tensor): + batch_size, _, seq_length = vision_outputs.shape + + reshaped_vision_outputs = vision_outputs.transpose(1, 2) + reshaped_vision_outputs = reshaped_vision_outputs.reshape( + batch_size, seq_length, self.patches_per_image, self.patches_per_image + ) + reshaped_vision_outputs = reshaped_vision_outputs.contiguous() + + pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs) + pooled_vision_outputs = pooled_vision_outputs.flatten(2) + pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2) + + normed_vision_outputs, _ = self.mm_soft_emb_norm(pooled_vision_outputs) + + projected_vision_outputs = torch.matmul( + normed_vision_outputs, self.mm_input_projection_weight + ) + return projected_vision_outputs.type_as(vision_outputs) + + +class Gemma3ForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.config = config + + if config.vision_config is not None: + + config.vision_config.quantize = config.quantize + + self.post_vision_model_layernorm = nn.LayerNorm.load( + prefix="vision_tower.vision_model.post_layernorm", + weights=weights, + eps=config.vision_config.layer_norm_eps, + ) + + self.multimodal_projector = Gemma3MultimodalInputProjection( + prefix="multi_modal_projector", + config=config, + weights=weights, + ) + + text_config = config.text_config + 
text_config.speculator = config.speculator + text_config.quantize = config.quantize + + self.vision_model = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + else: + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + self.text_model = load_text_model( + prefix=prefix, + config=config.text_config, + weights=weights, + ) + + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def get_attention_mask( + self, input_ids, max_s, cu_seqlen_prefill, dtype, image_token_mask + ): + device = input_ids.device + min_dtype = torch.finfo(dtype).min + + lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist() + batch_size = len(lengths) + + sequence_length = max(lengths) + target_length = sequence_length + # Create the padding mask from the computed lengths. + # pad_mask: [batch, sequence_length] where True indicates valid tokens. + seq_range = torch.arange(sequence_length, device=device).unsqueeze(0) + lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1) + pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length] + + # Build the base causal mask (for non-image tokens): + causal_mask = torch.tril( + torch.ones( + (sequence_length, sequence_length), dtype=torch.bool, device=device + ) + ) + base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze( + 1 + ) # [batch, sequence_length, sequence_length] + base_mask = base_mask & causal_mask.unsqueeze(0) # apply causal constraint + + image_token_mask = torch.nn.utils.rnn.pad_sequence( + torch.split(image_token_mask, lengths), batch_first=True, padding_value=0 + ) + bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze( + 1 + ) + + # Combine the causal base mask and the bidirectional mask. 
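+        # A position may attend wherever either mask allows: text tokens keep the
+        # causal constraint, while image tokens attend to one another bidirectionally.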
+ combined_mask = torch.logical_or( + base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1) + ).to(device) + # combined_mask now has shape [batch, 1, sequence_length, sequence_length] + + full_attention_mask = torch.zeros( + (batch_size, 1, sequence_length, target_length), + device=device, + dtype=torch.bool, + ) + full_attention_mask[:, :, :, :sequence_length] = combined_mask + + final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device) + + return final_attention_mask + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused here + attention_mask: Optional[torch.BoolTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.text_model.embed_tokens(input_ids) + if cu_seqlen_prefill is not None: + max_s += 1 + position_ids += 1 + + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype) + image_outputs = self.vision_model(pixel_values) + vision_outputs = self.post_vision_model_layernorm( + image_outputs.last_hidden_state + ) + image_features = self.multimodal_projector(vision_outputs) + + image_token_mask = (input_ids == self.config.image_token_index).to( + input_ids.device + ) + inputs_embeds[image_token_mask] = image_features.view( + -1, image_features.shape[-1] + ) + attention_mask = self.get_attention_mask( + input_ids, + max_s, + cu_seqlen_prefill, + inputs_embeds.dtype, + image_token_mask, + ) + # Use flash attention for text-only input + # else: + # if cu_seqlen_prefill is not None: + # min_dtype = torch.finfo(inputs_embeds.dtype).min + # lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist() + + # # Determine the maximum sequence length (after padding) from query. + # sequence_length = max(lengths) + # target_length = sequence_length + + # # Create the padding mask from the computed lengths. + # # pad_mask: [batch, sequence_length] where True indicates valid tokens. 
+ # seq_range = torch.arange( + # sequence_length, device=input_ids.device + # ).unsqueeze(0) + # lengths_tensor = torch.tensor( + # lengths, device=input_ids.device + # ).unsqueeze(1) + # pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length] + + # # Build the base causal mask (for non-image tokens): + # causal_mask = torch.tril( + # torch.ones( + # (sequence_length, sequence_length), + # dtype=torch.bool, + # device=input_ids.device, + # ) + # ) + # base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze( + # 1 + # ) # [batch, sequence_length, sequence_length] + # base_mask = base_mask & causal_mask.unsqueeze(0) + # attention_mask = base_mask.unsqueeze( + # 1 + # ) # [batch, 1, sequence_length, sequence_length] + # full_attention_mask = torch.zeros( + # (len(lengths), 1, sequence_length, target_length), + # device=input_ids.device, + # dtype=torch.bool, + # ) + # full_attention_mask[:, :, :, :sequence_length] = attention_mask + + # attention_mask = torch.where(full_attention_mask, 0, min_dtype).to( + # input_ids.device + # ) + + if attention_mask is not None: + min_dtype = torch.finfo(inputs_embeds.dtype).min + # prefill may be larger than sliding window + effective_seq_len = max( + position_ids.shape[0], self.config.text_config.sliding_window + ) + sliding_window_mask = torch.tril( + torch.ones_like(attention_mask, dtype=torch.bool), + diagonal=-self.config.text_config.sliding_window, + ) + attention_mask_local = torch.where( + sliding_window_mask, min_dtype, attention_mask + ) + offset = max(0, position_ids.shape[0] - effective_seq_len) + attention_mask_local = attention_mask_local[ + :, :, :, offset : offset + effective_seq_len + ] + else: + attention_mask_local = None + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + attention_mask=attention_mask, + attention_mask_local=attention_mask_local, + ) + + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + + # pad logit with 1 zero logit for the image token + if pixel_values is not None: + logits = torch.cat( + [logits, torch.zeros(logits.size(0), 1, device=logits.device)], dim=1 + ) + if speculative_logits is not None: + speculative_logits = torch.cat( + [ + speculative_logits, + torch.zeros( + speculative_logits.size(0), + 1, + device=speculative_logits.device, + ), + ], + dim=1, + ) + + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 0fa172d0..7ad294f4 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -242,6 +242,7 @@ class MistralAttention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index a45dd1e6..e2a3e586 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -290,6 +290,7 @@ 
class MixtralAttention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 9d956222..f5e4e15c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -74,7 +74,7 @@ class Qwen2Attention(torch.nn.Module): weights, ): super().__init__() - self.max_past = ( + self.window_size = ( config.sliding_window if config.sliding_window is not None else -1 ) self.num_heads = config.num_attention_heads @@ -172,7 +172,7 @@ class Qwen2Attention(torch.nn.Module): seqlen=seqlen, block_tables=block_tables, softmax_scale=self.softmax_scale, - window_size_left=self.max_past, + window_size_left=self.window_size, ) # Decode else: @@ -185,6 +185,7 @@ class Qwen2Attention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.window_size, ) return self.o_proj( @@ -405,10 +406,10 @@ class Qwen2ForCausalLM(torch.nn.Module): weights=weights, ) - self.max_past = config.sliding_window - self.max_past_tensor = ( + self.window_size = config.sliding_window + self.window_size_tensor = ( torch.tensor(config.sliding_window, device=weights.device) - if self.max_past is not None + if self.window_size is not None else None ) @@ -430,10 +431,10 @@ class Qwen2ForCausalLM(torch.nn.Module): if prefill_cache_indices is not None: # Slots also need to be sliced as it has the same size as the whole kv tensor slots = slots[prefill_cache_indices] - elif self.max_past is not None: + elif self.window_size is not None: # Clamp in decode mode as paged attention requires clamped values whereas the flash attention # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) + seqlen = seqlen.clamp(max=self.window_size_tensor) inputs_embeds = self.embed_tokens(input_ids) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 5e090369..9508cc4f 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -291,6 +291,7 @@ class Starcoder2Attention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py new file mode 100644 index 00000000..08b468e2 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py @@ -0,0 +1,313 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. 
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+from transformers import SiglipVisionConfig
+
+logger = logging.get_logger(__name__)
+
+
+class Gemma3TextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Gemma3Model`]. It is used to instantiate a Gemma3
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma3-4B.
+    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262144):
+            Vocabulary size of the Gemma3 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`Gemma3Model`].
+        hidden_size (`int`, *optional*, defaults to 2304):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 26):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            In Gemma3, most layers use sliding window attention; only every `sliding_window_pattern`-th layer uses
+            global attention. This is the size of the sliding window.
+        query_pre_attn_scalar (`float`, *optional*):
+            The scaling factor used on the attention scores.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings used for global attention.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+        rope_local_base_freq (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings for local attention.
+        sliding_window_pattern (`int`, *optional*, defaults to 6):
+            Pattern for the sliding window attention: every `sliding_window_pattern`-th layer uses global attention,
+            while the remaining layers use local sliding window attention.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder. Will default to
+            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
+            activation function.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        final_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh soft-capping on the final logits; disabled when `None`.
+        attn_logit_softcapping (`float`, *optional*):
+            Scaling factor when applying tanh soft-capping on the attention scores; disabled when `None`.
+        cache_implementation (`str`, *optional*, defaults to `"hybrid"`):
+            The cache type to be used with `generate`.
+
+    ```python
+    >>> from transformers import Gemma3Model, Gemma3TextConfig
+    >>> # Initializing a Gemma3 gemma3-4b style configuration
+    >>> configuration = Gemma3TextConfig()
+    >>> # Initializing a model from the gemma3-4b style configuration
+    >>> model = Gemma3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3_text"
+
+    def __init__(
+        self,
+        vocab_size: int = 262_144,
+        hidden_size: int = 2304,
+        intermediate_size: int = 9216,
+        num_hidden_layers: int = 26,
+        num_attention_heads: int = 8,
+        num_key_value_heads: int = 4,
+        head_dim: int = 256,
+        sliding_window: int = 4096,
+        query_pre_attn_scalar: Optional[float] = 256,
+        rope_theta: float = 1_000_000.0,
+        rope_scaling=None,
+        rope_local_base_freq: float = 10_000.0,
+        sliding_window_pattern: int = 6,
+        rms_norm_eps: float = 1e-6,
+        hidden_activation: str = "gelu_pytorch_tanh",
+        pad_token_id: int = 0,
+        eos_token_id: int = 1,
+        bos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        max_position_embeddings: int = 131_072,
+        initializer_range: float = 0.02,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.0,
+        use_cache: bool = True,
+        final_logit_softcapping=None,
+        attn_logit_softcapping=None,
+        cache_implementation: str = "hybrid",
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.rope_local_base_freq = rope_local_base_freq
+        # For configuring HybridCache to work with 5:1 attention pattern
+        self.sliding_window_pattern = sliding_window_pattern
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.hidden_activation = hidden_activation
+        self.query_pre_attn_scalar = query_pre_attn_scalar
+        self.sliding_window = sliding_window
+        self.final_logit_softcapping = final_logit_softcapping
+        self.attn_logit_softcapping = attn_logit_softcapping
+        self.cache_implementation = cache_implementation
+        rope_config_validation(self)
+
+
+class Gemma3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
[`Gemma3ForConditionalGeneration`]. It is used to instantiate a
+    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
+
+    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`, *optional*):
+            Custom vision config or dict.
+        mm_tokens_per_image (`int`, *optional*, defaults to 256):
+            The number of tokens per image embedding.
+        boi_token_index (`int`, *optional*, defaults to 255999):
+            The begin-of-image token index to wrap the image prompt.
+        eoi_token_index (`int`, *optional*, defaults to 256000):
+            The end-of-image token index to wrap the image prompt.
+        image_token_index (`int`, *optional*, defaults to 262144):
+            The image token index to encode the image prompt.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig
+
+    >>> # Initializing a Siglip-like vision config
+    >>> vision_config = SiglipVisionConfig()
+
+    >>> # Initializing a Gemma3 Text config
+    >>> text_config = Gemma3TextConfig()
+
+    >>> # Initializing a Gemma3 gemma-3-4b style configuration
+    >>> configuration = Gemma3Config(text_config=text_config, vision_config=vision_config)
+
+    >>> # Initializing a model from the gemma-3-4b style configuration
+    >>> model = Gemma3ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gemma3"
+    sub_configs = {
+        "text_config": Gemma3TextConfig,
+        "vision_config": SiglipVisionConfig,
+    }
+
+    def __init__(
+        self,
+        text_config: Optional[Gemma3TextConfig] = None,
+        vision_config: Optional[SiglipVisionConfig] = None,
+        mm_tokens_per_image: int = 256,
+        boi_token_index: int = 255_999,
+        eoi_token_index: int = 256_000,
+        image_token_index: int = 262_144,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        if text_config is None:
+            text_config = Gemma3TextConfig()
+            logger.info("text_config is None, using default Gemma3TextConfig.")
+        elif isinstance(text_config, dict):
+            text_config = Gemma3TextConfig(**text_config)
+
+        if isinstance(vision_config, dict):
+            vision_config = SiglipVisionConfig(**vision_config)
+        elif vision_config is None:
+            # An explicitly provided SiglipVisionConfig instance is kept as-is.
+            vision_config = SiglipVisionConfig()
+            logger.info(
+                "vision_config is None, using default SiglipVisionConfig initialization. Gemma3 will be limited "
+                "to text tasks."
+            )
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.mm_tokens_per_image = mm_tokens_per_image
+        self.boi_token_index = boi_token_index
+        self.eoi_token_index = eoi_token_index
+        self.image_token_index = image_token_index
+        self.initializer_range = initializer_range
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["Gemma3Config", "Gemma3TextConfig"]
diff --git a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
new file mode 100644
index 00000000..2972abea
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
@@ -0,0 +1,463 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Gemma3."""
+
+import itertools
+import math
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from transformers.image_processing_utils import (
+    BaseImageProcessor,
+    BatchFeature,
+    get_size_dict,
+)
+from transformers.image_transforms import (
+    convert_to_rgb,
+    resize,
+    to_channel_dimension_format,
+)
+from transformers.image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from transformers.utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_vision_available,
+    logging,
+)
+
+from .utils import make_nested_list_of_images
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class Gemma3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Gemma3 (SigLIP-style) image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `False`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
+            `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        do_pan_and_scan (`bool`, *optional*):
+            Whether to apply `pan_and_scan` to images.
+    """
+
+    model_input_names = ["pixel_values", "num_crops"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = False,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        do_pan_and_scan: bool = None,
+        pan_and_scan_min_crop_size: int = None,
+        pan_and_scan_max_num_crops: int = None,
+        pan_and_scan_min_ratio_to_activate: float = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+        self.do_pan_and_scan = do_pan_and_scan
+        self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size
+        self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops
+        self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate
+
+    def pan_and_scan(
+        self,
+        image: np.ndarray,
+        pan_and_scan_min_crop_size: int,
+        pan_and_scan_max_num_crops: int,
+        pan_and_scan_min_ratio_to_activate: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Pan and scan an image: if its aspect ratio is sufficiently exaggerated, split it into several near-square
+        crops so elongated images are seen at a usable resolution. Returns the list of crops, or an empty list when
+        pan and scan is not applied.
+
+        Args:
+            image (`np.ndarray`):
+                Image to crop.
+            pan_and_scan_min_crop_size (`int`):
+                Minimum size of each crop; pan and scan is skipped when the resulting crops would be smaller.
+            pan_and_scan_max_num_crops (`int`):
+                Maximum number of crops to produce for the image.
+            pan_and_scan_min_ratio_to_activate (`float`):
+                Minimum aspect ratio (long side over short side) required to activate pan and scan.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        height, width = get_image_size(image)
+
+        # Square or landscape image.
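+        # In both branches below, the crop count along the long side is the aspect
+        # ratio rounded half up, limited so each crop stays at least
+        # pan_and_scan_min_crop_size pixels along that side, then clamped to
+        # [2, pan_and_scan_max_num_crops].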
+ if width >= height: + # Only apply PaS if the image is sufficiently exaggerated + if width / height < pan_and_scan_min_ratio_to_activate: + return [] + + # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. + num_crops_w = int( + math.floor(width / height + 0.5) + ) # Half round up rounding. + num_crops_w = min( + int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w + ) + + # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. + num_crops_w = max(2, num_crops_w) + num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w) + num_crops_h = 1 + + # Portrait image. + else: + # Only apply PaS if the image is sufficiently exaggerated + if height / width < pan_and_scan_min_ratio_to_activate: + return [] + + # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. + num_crops_h = int(math.floor(height / width + 0.5)) + num_crops_h = min( + int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h + ) + + # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. + num_crops_h = max(2, num_crops_h) + num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h) + num_crops_w = 1 + + crop_size_w = int(math.ceil(width / num_crops_w)) + crop_size_h = int(math.ceil(height / num_crops_h)) + + # Don't apply PaS if crop size is too small. + if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size: + return [] + + crop_positions_w = [crop_size_w * i for i in range(num_crops_w)] + crop_positions_h = [crop_size_h * i for i in range(num_crops_h)] + + if input_data_format == ChannelDimension.LAST: + image_crops = [ + image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] + for pos_h, pos_w in itertools.product( + crop_positions_h, crop_positions_w + ) + ] + else: + image_crops = [ + image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] + for pos_h, pos_w in itertools.product( + crop_positions_h, crop_positions_w + ) + ] + + return image_crops + + def _process_images_for_pas( + self, + images: List[np.ndarray], + do_pan_and_scan: bool, + pan_and_scan_min_crop_size: int, + pan_and_scan_max_num_crops: int, + pan_and_scan_min_ratio_to_activate: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + pas_images_list = [] + num_crops = [] + for image in images: + pas_images = self.pan_and_scan( + image=image, + pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, + pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, + pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, + data_format=data_format, + input_data_format=input_data_format, + ) + pas_images_list.extend([image] + pas_images) + num_crops.append(len(pas_images)) + return pas_images_list, num_crops + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: bool = True, + do_pan_and_scan: bool = None, + pan_and_scan_min_crop_size: int = 
None,
+        pan_and_scan_max_num_crops: int = None,
+        pan_and_scan_min_ratio_to_activate: float = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_pan_and_scan (`bool`, *optional*, defaults to `self.do_pan_and_scan`):
+                Whether to apply `pan_and_scan` to images.
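+
+        Returns:
+            [`BatchFeature`]: A batch feature holding `pixel_values` and `num_crops`; the exact tensor types depend
+            on `return_tensors`.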
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + do_pan_and_scan = ( + do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan + ) + pan_and_scan_min_crop_size = ( + pan_and_scan_min_crop_size + if pan_and_scan_min_crop_size is not None + else self.pan_and_scan_min_crop_size + ) + pan_and_scan_max_num_crops = ( + pan_and_scan_max_num_crops + if pan_and_scan_max_num_crops is not None + else self.pan_and_scan_max_num_crops + ) + pan_and_scan_min_ratio_to_activate = ( + pan_and_scan_min_ratio_to_activate + if pan_and_scan_min_ratio_to_activate is not None + else self.pan_and_scan_min_ratio_to_activate + ) + + images_list = make_nested_list_of_images(images) + + if not valid_images(images_list[0]): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + if do_convert_rgb: + images_list = [ + [convert_to_rgb(image) for image in images] for images in images_list + ] + + # All transformations expect numpy arrays. + images_list = [ + [to_numpy_array(image) for image in images] for images in images_list + ] + + if do_rescale and is_scaled_image(images_list[0][0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images_list[0][0]) + + if do_pan_and_scan: + images_list_and_num_crops = [ + self._process_images_for_pas( + images=images, + do_pan_and_scan=do_pan_and_scan, + pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, + pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, + pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, + data_format=data_format, + input_data_format=input_data_format, + ) + for images in images_list + ] + images_list = [images for images, _ in images_list_and_num_crops] + num_crops = [num_crops for _, num_crops in images_list_and_num_crops] + else: + num_crops = [[0] for images in images_list] + + if do_resize: + height, width = size["height"], size["width"] + images_list = [ + [ + resize( + image=image, + size=(height, width), + resample=resample, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + if do_rescale: + images_list = [ + [ + self.rescale( + image=image, + scale=rescale_factor, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + if do_normalize: + images_list = [ + [ + self.normalize( + image=image, + mean=image_mean, + std=image_std, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + images = [ + to_channel_dimension_format( + image, data_format, input_channel_dim=input_data_format + ) + for images in images_list + for image in images + ] + + data = {"pixel_values": images, "num_crops": num_crops} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["Gemma3ImageProcessor"] diff --git a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py new file mode 100644 index 00000000..6bdf35c6 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
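+
+# Gemma3 processor vendored from `transformers` for TGI; initialization is
+# TGI-specific (see the notes in `Gemma3Processor.__init__` below).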
+import re
+from typing import List, Optional, Union
+
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, PILImageResampling
+from transformers.processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import to_py_obj
+from text_generation_server.models.custom_modeling.gemma3.image_processing_gemma3 import (
+    Gemma3ImageProcessor,
+)
+
+from .utils import make_nested_list_of_images
+
+
+class Gemma3ImagesKwargs(ImagesKwargs):
+    do_pan_and_scan: Optional[bool]
+    pan_and_scan_min_crop_size: Optional[int]
+    pan_and_scan_max_num_crops: Optional[int]
+    pan_and_scan_min_ratio_to_activate: Optional[float]
+    do_convert_rgb: Optional[bool]
+
+
+class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Gemma3ImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {
+            "do_pan_and_scan": False,
+            "pan_and_scan_min_crop_size": 256,
+            "pan_and_scan_max_num_crops": 4,
+            "pan_and_scan_min_ratio_to_activate": 1.2,
+        },
+    }
+
+
+class Gemma3Processor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
+    # The vendored Gemma3ImageProcessor is instantiated directly in __init__.
+    image_processor_class = "AutoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        num_mm_soft_tokens_per_image: int = 256,
+        **kwargs,
+    ):
+        # TGI pins the Gemma3 defaults here, regardless of what the caller
+        # passes in; the chat template is handled outside of this processor.
+        num_mm_soft_tokens_per_image = 256
+        chat_template = None
+
+        image_processor = Gemma3ImageProcessor(
+            image_mean=(127.5,) * 3,
+            image_std=(127.5,) * 3,
+            size={"height": 896, "width": 896},
+            do_rescale=False,
+            resample=PILImageResampling.BILINEAR,
+        )
+
+        self.image_token_id = tokenizer.image_token_id
+        image_tokens_expanded = "".join(
+            [tokenizer.image_token] * num_mm_soft_tokens_per_image
+        )
+        self.full_image_sequence = (
+            f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
+        )
+
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.chat_template = chat_template
+        # Note: ProcessorMixin.__init__ is intentionally not called; the
+        # attributes it would set are assigned directly above.
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[
+            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+        ] = None,
+        videos=None,
+        audio=None,
+        **kwargs: Unpack[Gemma3ProcessorKwargs],
+    ) -> BatchFeature:
+        if text is None and images is None:
+            raise ValueError("Provide at least one of `text` or `images`.")
+
+        output_kwargs = self._merge_kwargs(
+            Gemma3ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
+
+        image_inputs = {}
+        if images is not None:
+            batched_images = make_nested_list_of_images(images)
+            image_inputs = self.image_processor(
+                batched_images, **output_kwargs["images_kwargs"]
+            )
+
+            # Create empty text to be replaced with placeholders
+            if not text:
+                text = [
+                    " ".join(["<start_of_image>"] * len(images))
+                    for images in batched_images
+                ]
+
+            if len(batched_images) != len(text):
+                raise ValueError(
+                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
+                )
+
+            # Replace image tokens by the full expanded sequence
+            batch_num_crops = to_py_obj(image_inputs.pop("num_crops"))
+            for batch_idx, (prompt, images, num_crops) in enumerate(
+                zip(text, batched_images, batch_num_crops)
+            ):
+                image_indexes = [
+                    m.start() for m in re.finditer("<start_of_image>", prompt)
+                ]
+
+                if len(images) != len(image_indexes):
+                    raise ValueError(
+                        f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images."
+                    )
+
+                # Insert additional image tokens for Pan-and-Scan crops
+                for num, idx in reversed(list(zip(num_crops, image_indexes))):
+                    if num:
+                        formatted_image_text = (
+                            "Here is the original image <start_of_image> and here are some crops to help you see better "
+                            + " ".join(["<start_of_image>"] * num)
+                        )
+                        prompt = (
+                            prompt[:idx]
+                            + formatted_image_text
+                            + prompt[idx + len("<start_of_image>") :]
+                        )
+                # Write the updated prompt back so the expansion below sees the
+                # inserted crop tokens.
+                text[batch_idx] = prompt
+
+            # Expand placeholder image tokens to the full image token sequence
+            text = [
+                prompt.replace("<start_of_image>", self.full_image_sequence)
+                for prompt in text
+            ]
+
+        text_input = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_input, **image_inputs})
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = ["Gemma3Processor"]
diff --git a/server/text_generation_server/models/custom_modeling/gemma3/utils.py b/server/text_generation_server/models/custom_modeling/gemma3/utils.py
new file mode 100644
index 00000000..8d431fb2
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/gemma3/utils.py
@@ -0,0 +1,61 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Union
+
+from transformers.image_utils import ImageInput, is_valid_image, is_pil_image
+
+
+def is_valid_list_of_images(images: List):
+    return images and all(is_valid_image(image) for image in images)
+
+
+def make_nested_list_of_images(
+    images: Union[List[ImageInput], ImageInput],
+) -> ImageInput:
+    """
+    Ensure that the output is a nested list of images.
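+
+    Illustrative examples (assuming PIL images):
+
+        >>> make_nested_list_of_images(image)               # -> [[image]]
+        >>> make_nested_list_of_images([img_a, img_b])      # -> [[img_a, img_b]]
+        >>> make_nested_list_of_images([[img_a], [img_b]])  # returned unchanged
+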
+ Args: + images (`Union[List[ImageInput], ImageInput]`): + The input image. + Returns: + list: A list of list of images or a list of 4d array of images. + """ + # If it's a list of batches, it's already in the right format + if ( + isinstance(images, (list, tuple)) + and all(isinstance(images_i, (list, tuple)) for images_i in images) + and all(is_valid_list_of_images(images_i) for images_i in images) + ): + return images + + # If it's a list of images, it's a single batch, so convert it to a list of lists + if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): + if is_pil_image(images[0]) or images[0].ndim == 3: + return [images] + if images[0].ndim == 4: + return [list(image) for image in images] + + # If it's a single image, convert it to a list of lists + if is_valid_image(images): + if is_pil_image(images) or images.ndim == 3: + return [[images]] + if images.ndim == 4: + return [list(images)] + + raise ValueError( + "Invalid input type. Must be a single image, a list of images, or a list of batches of images." + ) diff --git a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py index e317c5b5..066de6a2 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py @@ -633,7 +633,7 @@ class Qwen2_5VisionModel(nn.Module): config=config, weights=weights, ) - # import ipdb; ipdb.set_trace() + self.temporal_patch_size = config.temporal_patch_size self.spatial_patch_size = config.spatial_patch_size self.in_channels = config.in_channels diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index a72e0e55..26e6fede 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -542,6 +542,7 @@ class Qwen2VLForConditionalGeneration(nn.Module): max_s=max_s, true_max_s=max_s, prefill_cache_indices=prefill_cache_indices, + adapter_data=adapter_data, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py index 94b8522d..4447a73f 100644 --- a/server/text_generation_server/models/custom_modeling/vlm.py +++ b/server/text_generation_server/models/custom_modeling/vlm.py @@ -23,6 +23,13 @@ def load_text_model(prefix, config, weights, name=None): ) return FlashGemma2ForCausalLM(prefix, config, weights) + + elif config.model_type == "gemma3" or config.model_type == "gemma3_text": + from text_generation_server.models.custom_modeling.flash_gemma3_modeling import ( + FlashGemma3ForCausalLM, + ) + + return FlashGemma3ForCausalLM(prefix, config, weights) elif config.model_type == "paligemma": from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( FlashGemmaForCausalLM, @@ -42,13 +49,21 @@ def load_vision_model(prefix, config, weights): return CLIPVisionTransformer( prefix=f"{prefix}.vision_model", config=config, weights=weights ) - if config.model_type == "siglip_vision_model": + if ( + config.model_type == "siglip_vision_model" + or config.model_type == "gemma3_vision" + ): from text_generation_server.models.custom_modeling.siglip import ( SiglipVisionTransformer, ) + # TODO: ensure that using the prefix doesn't break any existing models + # that rely on the 
old prefix (update the old models if necessary) return SiglipVisionTransformer( - prefix="vision_tower.vision_model", config=config, weights=weights + # prefix="vision_model.vision_model", config=config, weights=weights + prefix=f"{prefix}.vision_model", + config=config, + weights=weights, ) else: raise RuntimeError(f"Unsupported model type {config.model_type}") diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index e268af8b..d3a83e27 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -83,24 +83,11 @@ from text_generation_server.models.metadata_kernels import ( tracer = trace.get_tracer(__name__) -# Will be set in init -SLIDING_WINDOW: Optional[int] = None - def small_power_of_2(n: int): return 1 << ((n - 1).bit_length() - 1) -def set_sliding_window(sliding_window: int): - global SLIDING_WINDOW - SLIDING_WINDOW = sliding_window - - -def get_sliding_windows() -> int: - global SLIDING_WINDOW - return SLIDING_WINDOW - - def init_cpu_threads_env(rank_id: int, world_size: int): import importlib.util @@ -1002,10 +989,8 @@ class FlashCausalLMBatch(Batch): self.slot_indices, ) - sliding_window = get_sliding_windows() position_ids = [] slot_indices = [] - prefill_cache_indices = [] all_prefill_logprobs = True no_prefill_logprobs = True prefill_cu_outlens = [0] @@ -1064,14 +1049,6 @@ class FlashCausalLMBatch(Batch): # Update cumulative_slot_tokens += len(request_slots) - # Create tensor to slice into the kv tensor in prefill - if sliding_window is not None: - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - sliding_window), - cumulative_length + input_length, - dtype=torch.int64, - ) - # Prefill logprobs is ignored if the request is done prefilling prefill_logprobs = r.prefill_logprobs and request_prefilling @@ -1085,9 +1062,6 @@ class FlashCausalLMBatch(Batch): prefill_cu_outlens.append(prefill_out_cumulative_length + 1) prefill_out_cumulative_length += 1 - if sliding_window is not None: - prefill_cache_indices.append(request_prefill_cache_indices) - ADAPTER_TO_INDEX = get_adapter_to_index() if ADAPTER_TO_INDEX: adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) @@ -1151,24 +1125,18 @@ class FlashCausalLMBatch(Batch): position_ids = torch.cat(position_ids) if slot_indices: slot_indices = torch.cat(slot_indices) - if sliding_window is not None: - prefill_cache_indices = torch.cat(prefill_cache_indices) else: if position_ids: position_ids = position_ids[0] if slot_indices: slot_indices = slot_indices[0] - if sliding_window is not None: - prefill_cache_indices = prefill_cache_indices[0] if not has_triton(): self.position_ids = position_ids.to(device) self.slot_indices = slot_indices.to(device) self.prefill_cu_outlens = prefill_cu_outlens - self.prefill_cache_indices = ( - prefill_cache_indices.to(device) if sliding_window is not None else None - ) + self.prefill_cache_indices = None if all_prefill_logprobs: prefill_head_indices = None @@ -1306,9 +1274,7 @@ class FlashCausalLM(Model): if text_config is not None: config = text_config - if getattr(config, "sliding_window", None) is not None: - set_sliding_window(config.sliding_window) - else: + if getattr(config, "sliding_window", None) is None: config.sliding_window = None self.num_layers = config.num_hidden_layers @@ -2500,7 +2466,6 @@ class FlashCausalLM(Model): page_size=BLOCK_SIZE, kv_dtype=self.kv_cache_dtype, q_dtype=self.dtype, - 
window_left=self.sliding_window,
             )
         else:
             assert input_lengths_tensor is not None
@@ -2514,5 +2479,4 @@
             page_size=BLOCK_SIZE,
             kv_cache_dtype=self.kv_cache_dtype,
             q_dtype=self.dtype,
-            window_left=self.sliding_window,
         )
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index af4d1f08..da317a62 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -110,7 +110,7 @@ class Model(ABC):
             requires_padding=self.requires_padding,
             dtype=str(self.dtype),
             device_type=self.device.type,
-            window_size=self.sliding_window,
+            window_size=None,  # Setting window_size to None disables the sliding-window block logic.
             speculate=self.speculate,
             support_chunking=self.support_chunking,
             use_prefix_caching=PREFIX_CACHING,
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 39046f2a..9111fdc0 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -128,6 +128,12 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
         num_pads = grid_t * grid_h * grid_w // 4
         padding = "<|image_pad|>" * num_pads
         return f"<|vision_start|>{padding}<|vision_end|>"
+    elif config.model_type == "gemma3":
+        # TODO: get the correct number of features by reviewing the Gemma3
+        # architecture and calculating the number of image tokens
+        num_pads = 256
+        padding = "<image_soft_token>" * num_pads
+        return f"\n\n<start_of_image>{padding}<end_of_image>\n\n"
     else:
         raise RuntimeError(f"Unknown config {config.model_type} for multimodal")
@@ -244,6 +250,8 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
                     if config.model_type == "llava_next":
                         images.append(image)
+                    elif config.model_type == "gemma3":
+                        images.append(image)
                     else:
                         images.append([image])
                 else:
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index b693258c..730ac6cb 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -18,14 +18,10 @@ def get_cuda_free_memory(device, memory_fraction):
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_memory = torch.xpu.get_device_properties(device).total_memory
-    device_id = device.index
-    memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "1.0"))
+    total_free_memory, total_xpu_memory = torch.xpu.mem_get_info(device)
+    memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "0.9"))
     free_memory = max(
-        0,
-        int(
-            total_memory * 0.9 * memory_fraction - torch.xpu.memory_reserved(device_id)
-        ),
+        0, int(total_free_memory - (1 - memory_fraction) * total_xpu_memory)
     )
     return free_memory
diff --git a/server/text_generation_server/utils/kernels.py b/server/text_generation_server/utils/kernels.py
index 42745c71..3943d9a5 100644
--- a/server/text_generation_server/utils/kernels.py
+++ b/server/text_generation_server/utils/kernels.py
@@ -1,7 +1,7 @@
 import importlib
 
 from loguru import logger
 
-from hf_kernels import load_kernel as hf_load_kernel
+from kernels import load_kernel as hf_load_kernel
 
 from text_generation_server.utils.log import log_once
diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py
index e460361a..92111b19 100644
--- a/server/text_generation_server/utils/quantization.py
+++ b/server/text_generation_server/utils/quantization.py
@@ -79,6 +79,8 @@ def 
_get_quantizer_config(model_id, revision): modules_to_not_convert = data["quantization_config"].get( "modules_to_not_convert", [] ) + if modules_to_not_convert is None: + modules_to_not_convert = [] except Exception: filename = "quantize_config.json" try: diff --git a/server/uv.lock b/server/uv.lock index a2293a5c..d4bbb955 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", @@ -674,18 +675,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/66/7c1a552545a9597fbd33d77c817f1f0cc56736ca64aa0821948f945118d6/grpcio_tools-1.70.0-cp39-cp39-win_amd64.whl", hash = "sha256:840ec536ab933db2ef8d5acaa6b712d0e9e8f397f62907c852ec50a3f69cdb78", size = 1119339 }, ] -[[package]] -name = "hf-kernels" -version = "0.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "packaging" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "torch" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/01/fe/5aa3ea1b66bcc7d81aff19683ea04d4a9cd414c8d4ff05b150fc1f196ccd/hf_kernels-0.1.6.tar.gz", hash = "sha256:5effee5046552ce226ff86d3870a799f4ecae399bcb2beb4046c28c2dd736d2f", size = 8704 } - [[package]] name = "hf-transfer" version = "0.1.9" @@ -814,6 +803,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, ] +[[package]] +name = "kernels" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/26/99/41af9dce502bb1682977fee1bc487a73fa8418cebbce16b8d27733947375/kernels-0.2.1.tar.gz", hash = "sha256:918942332819b28377b9d07070daddecfd8a5e7bab574dd3dc64a209ca6008b2", size = 9395 } + [[package]] name = "lark" version = "1.2.2" @@ -2561,9 +2562,9 @@ dependencies = [ { name = "grpcio" }, { name = "grpcio-reflection" }, { name = "grpcio-status" }, - { name = "hf-kernels" }, { name = "hf-transfer" }, { name = "huggingface-hub" }, + { name = "kernels" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2626,9 +2627,9 @@ requires-dist = [ { name = "grpcio-status", specifier = ">=1.67.0" }, { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, - { name = "hf-kernels", specifier = ">=0.1.5" }, { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "huggingface-hub", specifier = ">=0.29.0" }, + { name = "kernels", specifier = ">=0.2.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, @@ -2651,6 +2652,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.49.0" }, { name = "typer", specifier = ">=0.15.1" }, ] +provides-extras = ["accelerate", "bnb", "compressed-tensors", 
"peft", "outlines", "dev", "quantize", "gen"] [[package]] name = "texttable" diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index cd551ed5..32eccea5 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,5 +2,5 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' -source ./.venv/bin/activate +source /usr/src/.venv/bin/activate exec text-generation-launcher $@