diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 1bd7163fe..720a13cb3 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -202,8 +202,8 @@ jobs:
           target: ${{ env.TARGET }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
-          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
+          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
+          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
       - name: Final
         id: final
         run: |
diff --git a/Dockerfile b/Dockerfile
index 720053330..a963db2f0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -195,50 +195,56 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         git \
         && rm -rf /var/lib/apt/lists/*
 
-# Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
-
-# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
-# Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
-COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
-
 # Install flash-attention dependencies
 # RUN pip install einops --no-cache-dir
 
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
+
 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
-    make gen-server && \
-    python -c "from text_generation_server.pb import generate_pb2" && \
     pip install -U pip uv && \
-    uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir # && \
-    # uv pip install nvidia-nccl-cu12==2.22.3
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \
+    . ./.venv/bin/activate && \
+    make gen-server-raw
 
-ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
+RUN cd server && \
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
+    . ./.venv/bin/activate && \
+    pwd && \
+    text-generation-server --help
+
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from lorax punica kernels builder
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
+
+
+# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 # This is needed because exl2 tries to load flash-attn
diff --git a/Dockerfile_amd b/Dockerfile_amd
index 205e46d93..f95f3b6d6 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -320,9 +320,16 @@ COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
-    make gen-server && \
     pip install -U pip uv && \
-    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \
+    . ./.venv/bin/activate && \
+    make gen-server-raw
+
+RUN cd server && \
+    uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \
+    . ./.venv/bin/activate && \
+    pwd && \
+    text-generation-server --help
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
diff --git a/Dockerfile_intel b/Dockerfile_intel
index 0f0d43835..3bd697fd7 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -215,9 +215,16 @@ COPY server server
 COPY server/Makefile server/Makefile
 ENV UV_SYSTEM_PYTHON=1
 RUN cd server && \
-    make gen-server && \
     pip install -U pip uv && \
-    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+    uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \
+    . ./.venv/bin/activate && \
+    make gen-server-raw
+
+RUN cd server && \
+    uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \
+    . ./.venv/bin/activate && \
+    pwd && \
+    text-generation-server --help
 
 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -231,5 +238,8 @@ ENV ATTENTION=flashdecoding-ipex
 ENV PREFIX_CACHING=1
 ENV PREFILL_CHUNKING=1
 ENV CUDA_GRAPHS=0
-ENTRYPOINT ["text-generation-launcher"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
 CMD ["--json-output"]
diff --git a/server/Makefile b/server/Makefile
index 252e355d8..746b7faa2 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -16,7 +16,14 @@ unit-tests:
 gen-server:
 	# Compile protos
 	pip install -U pip uv
-	uv pip install ".[gen]"
+	uv pip install -r requirements_gen.txt
+	mkdir text_generation_server/pb || true
+	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
+		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
+	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
+	touch text_generation_server/pb/__init__.py
+
+gen-server-raw:
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
@@ -35,3 +42,9 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten
 	uv pip install nvidia-nccl-cu12==2.22.3
 
 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
+
+export-requirements:
+	uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
+	uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
+	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
+	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11
diff --git a/server/pyproject.toml b/server/pyproject.toml
index da3ba820a..d64a143fd 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "sentencepiece>=0.2.0",
     "tokenizers>=0.20.3",
     "typer>=0.15.1",
+    "transformers>=4.48.0"
 ]
 
 [project.scripts]
@@ -83,5 +84,8 @@ markers = ["private: marks tests as requiring an admin hf token (deselect with '
 [tool.isort]
 profile = "black"
 
+[tool.uv]
+package = true
+
 [tool.setuptools.packages.find]
 include = ["text_generation_server*"]
diff --git a/server/req.txt b/server/req.txt
new file mode 100644
index 000000000..f653999e3
--- /dev/null
+++ b/server/req.txt
@@ -0,0 +1,212 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra attention --extra bnb -o req.txt
+attention-kernels @ https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl
+    # via text-generation-server (pyproject.toml)
+bitsandbytes==0.45.1
+    # via text-generation-server (pyproject.toml)
+certifi==2025.1.31
+    # via requests
+charset-normalizer==3.4.1
+    # via requests
+click==8.1.8
+    # via typer
+deprecated==1.2.18
+    #
via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.17.0 + # via + # huggingface-hub + # torch +fsspec==2025.2.0 + # via + # huggingface-hub + # torch +googleapis-common-protos==1.66.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.70.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.70.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.9 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via tokenizers +idna==3.10 + # via requests +importlib-metadata==8.5.0 + # via opentelemetry-api +jinja2==3.1.5 + # via torch +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +networkx==3.4.2 + # via torch +numpy==2.2.2 + # via + # text-generation-server (pyproject.toml) + # bitsandbytes + # scipy +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +packaging==24.2 + # via + # huggingface-hub + # opentelemetry-instrumentation +pillow==11.1.0 + # via 
text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pygments==2.19.1 + # via rich +pyyaml==6.0.2 + # via huggingface-hub +requests==2.32.3 + # via + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +safetensors==0.5.2 + # via text-generation-server (pyproject.toml) +scipy==1.15.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +setuptools==75.8.0 + # via torch +shellingham==1.5.4 + # via typer +sympy==1.13.1 + # via torch +tokenizers==0.21.0 + # via text-generation-server (pyproject.toml) +torch==2.6.0 + # via + # attention-kernels + # bitsandbytes +tqdm==4.67.1 + # via huggingface-hub +triton==3.2.0 + # via torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # torch + # typer +urllib3==2.3.0 + # via requests +wrapt==1.17.2 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +zipp==3.21.0 + # via importlib-metadata diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index ee75b2b5a..051045bc9 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -1,55 +1,384 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" -markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" 
-opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attention-kernels @ https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +bitsandbytes==0.45.1 + # via text-generation-server (pyproject.toml) +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server 
(pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +marlin-kernels @ https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +mdurl==0.1.2 + # via markdown-it-py +moe-kernels @ https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # bitsandbytes + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-ml-py==12.570.86 + # via moe-kernels +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # 
opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets +peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines +pydantic-core==2.27.2 + # via pydantic +pygments==2.18.0 + # via rich +python-dateutil==2.9.0.post0 + # via pandas +pytz==2025.1 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # peft + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # datasets + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # outlines + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # text-generation-server (pyproject.toml) + # accelerate + # peft + # transformers +scipy==1.13.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +texttable==1.7.0 + # via text-generation-server (pyproject.toml) +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +torch==2.6.0 + # via + # accelerate + # attention-kernels + # bitsandbytes + # compressed-tensors + # marlin-kernels + # moe-kernels + # outlines + # peft +tqdm==4.66.5 + # via + # datasets + # huggingface-hub + # outlines + # peft + # transformers +transformers==4.48.2 + # via + # text-generation-server (pyproject.toml) + # compressed-tensors + # peft 
+triton==3.2.0 + # via + # moe-kernels + # torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # outlines + # pydantic + # pydantic-core + # referencing + # torch + # typer +tzdata==2025.1 + # via pandas +urllib3==2.2.3 + # via requests +wrapt==1.16.0 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +xxhash==3.5.0 + # via datasets +yarl==1.18.3 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata diff --git a/server/requirements_gen.txt b/server/requirements_gen.txt new file mode 100644 index 000000000..d9836ad71 --- /dev/null +++ b/server/requirements_gen.txt @@ -0,0 +1,180 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11 +certifi==2025.1.31 + # via requests +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via typer +deprecated==1.2.18 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.17.0 + # via + # huggingface-hub + # transformers +fsspec==2025.2.0 + # via huggingface-hub +googleapis-common-protos==1.66.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.70.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # grpcio-tools + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-tools==1.70.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.9 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # tokenizers + # transformers +idna==3.10 + # via requests +importlib-metadata==8.5.0 + # via opentelemetry-api +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +mypy-protobuf==3.6.0 + # via text-generation-server (pyproject.toml) +numpy==2.2.2 + # via + # text-generation-server (pyproject.toml) + # scipy + # transformers +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # 
opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +packaging==24.2 + # via + # huggingface-hub + # opentelemetry-instrumentation + # transformers +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # grpcio-tools + # mypy-protobuf + # opentelemetry-proto +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pygments==2.19.1 + # via rich +pyyaml==6.0.2 + # via + # huggingface-hub + # transformers +regex==2024.11.6 + # via transformers +requests==2.32.3 + # via + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +safetensors==0.5.2 + # via + # text-generation-server (pyproject.toml) + # transformers +scipy==1.15.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +setuptools==75.8.0 + # via grpcio-tools +shellingham==1.5.4 + # via typer +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +tqdm==4.67.1 + # via + # huggingface-hub + # transformers +transformers==4.48.2 + # via text-generation-server (pyproject.toml) +typer==0.15.1 + # via text-generation-server (pyproject.toml) +types-protobuf==5.29.1.20241207 + # via mypy-protobuf +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # typer +urllib3==2.3.0 + # via requests +wrapt==1.17.2 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +zipp==3.21.0 + # via importlib-metadata diff --git a/server/requirements_intel.txt b/server/requirements_intel.txt index ee75b2b5a..778d892ec 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -1,55 +1,367 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" 
-markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +certifi==2024.8.30 + # via requests 
+charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server (pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # 
opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets +peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines +pydantic-core==2.27.2 + # via pydantic +pygments==2.18.0 + # via rich +python-dateutil==2.9.0.post0 + # via pandas +pytz==2025.1 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # peft + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # datasets + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # outlines + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # text-generation-server (pyproject.toml) + # accelerate + # peft + # transformers +scipy==1.13.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +texttable==1.7.0 + # via text-generation-server (pyproject.toml) +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +torch==2.6.0 + # via + # accelerate + # compressed-tensors + # outlines + # peft +tqdm==4.66.5 + # via + # datasets + # huggingface-hub + # outlines + # peft + # transformers +transformers==4.48.2 + # via + # text-generation-server (pyproject.toml) + # compressed-tensors + # peft +triton==3.2.0 + # via torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # outlines + # pydantic + # pydantic-core + # referencing + # torch + # typer +tzdata==2025.1 + # via pandas +urllib3==2.2.3 + # via requests +wrapt==1.16.0 + # via + # deprecated 
+ # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +xxhash==3.5.0 + # via datasets +yarl==1.18.3 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index ee75b2b5a..65eb998b7 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -1,55 +1,367 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" -markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and 
python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server (pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core 
+jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets +peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines 
+pydantic-core==2.27.2
+    # via pydantic
+pygments==2.18.0
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+pytz==2025.1
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   accelerate
+    #   datasets
+    #   huggingface-hub
+    #   peft
+    #   transformers
+referencing==0.36.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+    #   outlines
+regex==2024.9.11
+    # via transformers
+requests==2.32.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   opentelemetry-exporter-otlp-proto-http
+    #   outlines
+    #   transformers
+rich==13.9.4
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   typer
+rpds-py==0.22.3
+    # via
+    #   jsonschema
+    #   referencing
+safetensors==0.4.5
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   accelerate
+    #   peft
+    #   transformers
+scipy==1.13.1
+    # via text-generation-server (pyproject.toml)
+sentencepiece==0.2.0
+    # via text-generation-server (pyproject.toml)
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sympy==1.13.1
+    # via torch
+texttable==1.7.0
+    # via text-generation-server (pyproject.toml)
+tokenizers==0.21.0
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   transformers
+torch==2.6.0
+    # via
+    #   accelerate
+    #   compressed-tensors
+    #   outlines
+    #   peft
+tqdm==4.66.5
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   outlines
+    #   peft
+    #   transformers
+transformers==4.48.2
+    # via
+    #   text-generation-server (pyproject.toml)
+    #   compressed-tensors
+    #   peft
+triton==3.2.0
+    # via torch
+typer==0.15.1
+    # via text-generation-server (pyproject.toml)
+typing-extensions==4.12.2
+    # via
+    #   huggingface-hub
+    #   opentelemetry-sdk
+    #   outlines
+    #   pydantic
+    #   pydantic-core
+    #   referencing
+    #   torch
+    #   typer
+tzdata==2025.1
+    # via pandas
+urllib3==2.2.3
+    # via requests
+wrapt==1.16.0
+    # via
+    #   deprecated
+    #   opentelemetry-instrumentation
+    #   opentelemetry-instrumentation-grpc
+xxhash==3.5.0
+    # via datasets
+yarl==1.18.3
+    # via aiohttp
+zipp==3.20.2
+    # via importlib-metadata
diff --git a/server/uv.lock b/server/uv.lock
index 5684d581e..5410a58cb 100644
--- a/server/uv.lock
+++ b/server/uv.lock
@@ -997,15 +997,15 @@ wheels = [
 
 [[package]]
 name = "moe-kernels"
-version = "0.8.0"
-source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }
+version = "0.8.2"
+source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }
 dependencies = [
     { name = "nvidia-ml-py" },
     { name = "torch" },
     { name = "triton" },
 ]
 wheels = [
-    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:92c4e083c037a325458e731dda6770790495cab273c9bbf5f50fb8e262c099de" },
+    { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:1ed5b26f52339d25ea2513e99e8b6239cf1921af3eac54e03a46bb8f8efb380b" },
 ]
 
 [package.metadata]
@@ -1308,7 +1308,6 @@ name = "nvidia-cublas-cu12"
 version = "12.4.5.8"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 },
     { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 },
 ]
 
@@ -1317,7 +1316,6 @@ name = "nvidia-cuda-cupti-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 },
     { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 },
 ]
 
@@ -1326,7 +1324,6 @@ name = "nvidia-cuda-nvrtc-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 },
     { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 },
 ]
 
@@ -1335,7 +1332,6 @@ name = "nvidia-cuda-runtime-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 },
     { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 },
 ]
 
@@ -1358,7 +1354,6 @@ dependencies = [
     { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 },
     { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
 ]
 
@@ -1367,7 +1362,6 @@ name = "nvidia-curand-cu12"
 version = "10.3.5.147"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 },
     { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 },
 ]
 
@@ -1381,7 +1375,6 @@ dependencies = [
     { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 },
     { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
 ]
 
@@ -1393,7 +1386,6 @@ dependencies = [
     { name = "nvidia-nvjitlink-cu12" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 },
     { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
 ]
 
@@ -1419,7 +1411,6 @@ name = "nvidia-nvjitlink-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 },
     { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 },
 ]
 
@@ -1428,7 +1419,6 @@ name = "nvidia-nvtx-cu12"
 version = "12.4.127"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 },
     { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 },
 ]
 
@@ -2656,7 +2646,7 @@ wheels = [
 [[package]]
 name = "text-generation-server"
 version = "2.0.5.dev0"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
     { name = "einops" },
     { name = "grpc-interceptor" },
@@ -2680,6 +2670,7 @@ dependencies = [
     { name = "scipy", version = "1.15.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "sentencepiece" },
     { name = "tokenizers" },
+    { name = "transformers" },
     { name = "typer" },
 ]
 
@@ -2746,7 +2737,7 @@ requires-dist = [
     { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" },
     { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" },
     { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" },
-    { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" },
+    { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" },
     { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
     { name = "numpy", specifier = ">=1.26,<3" },
     { name = "opentelemetry-api", specifier = ">=1.27.0" },
@@ -2765,6 +2756,7 @@ requires-dist = [
     { name = "sentencepiece", specifier = ">=0.2.0" },
     { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" },
     { name = "tokenizers", specifier = ">=0.20.3" },
+    { name = "transformers", specifier = ">=4.48.0" },
     { name = "typer", specifier = ">=0.15.1" },
 ]
 
@@ -2902,7 +2894,7 @@ wheels = [
 
 [[package]]
 name = "transformers"
-version = "4.48.0"
+version = "4.48.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -2917,9 +2909,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ea/71/93a6331682d6f15adf7d646956db0c43e5f1759bbbd05f2ef53029bae107/transformers-4.48.0.tar.gz", hash = "sha256:03fdfcbfb8b0367fb6c9fbe9d1c9aa54dfd847618be9b52400b2811d22799cb1", size = 8372101 }
+sdist = { url = "https://files.pythonhosted.org/packages/c5/cf/1093586e09c8d889d2f6b8ffe6a1369e1e179eb7b8e732fc0f348a8fe58f/transformers-4.48.2.tar.gz", hash = "sha256:dcfb73473e61f22fb3366fe2471ed2e42779ecdd49527a1bdf1937574855d516", size = 8370945 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/45/d6/a69764e89fc5c2c957aa473881527c8c35521108d553df703e9ba703daeb/transformers-4.48.0-py3-none-any.whl", hash = "sha256:6d3de6d71cb5f2a10f9775ccc17abce9620195caaf32ec96542bd2a6937f25b0", size = 9673380 },
+    { url = "https://files.pythonhosted.org/packages/bd/40/902c95a2a6f5d2d120c940ac4bd1f937c01035af529803c13d65ca33c2d1/transformers-4.48.2-py3-none-any.whl", hash = "sha256:493bc5b0268b116eff305edf6656367fc89cf570e7a9d5891369e04751db698a", size = 9667774 },
 ]
 
 [[package]]
diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh
index 278c7d961..94ea9436c 100755
--- a/tgi-entrypoint.sh
+++ b/tgi-entrypoint.sh
@@ -2,4 +2,5 @@
 
 ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
 
+source ./server/.venv/bin/activate
 exec text-generation-launcher $@