Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 13:52:07 +00:00)
fix: lint backend and doc files (#2850)
commit a72f339c79 (parent 11ab329883)
@@ -72,4 +72,4 @@ RUN cargo install cargo-chef
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi

ENV MPI_HOME=/usr/local/mpi
@@ -228,4 +228,4 @@ struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : f
}
};

#endif
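The hunk above only touches the closing braces of the backend's `fmt::formatter` specialization for `sampling_params_t`. For orientation, here is a minimal sketch of the general pattern such a specialization follows; the struct fields and the output text below are assumptions for illustration, not the backend's actual definitions.

```cpp
#include <cstdint>
#include <string_view>

#include <fmt/format.h>

// Hypothetical stand-in for the backend's sampling_params_t;
// the real struct lives in the TRTLLM backend headers and may differ.
struct sampling_params_t {
    float temperature;
    float top_p;
    std::uint64_t seed;
};

// Common pattern: inherit parse() from the string_view formatter and
// only provide format(), rendering the struct as readable text.
template <>
struct fmt::formatter<sampling_params_t> : fmt::formatter<std::string_view> {
    auto format(const sampling_params_t& p, fmt::format_context& ctx) const
        -> fmt::format_context::iterator {
        return fmt::format_to(
            ctx.out(),
            "sampling_params_t{{temperature={}, top_p={}, seed={}}}",
            p.temperature, p.top_p, p.seed);
    }
};
```

With a specialization like this in place, `fmt::format("{}", params)` yields a one-line description of the parameters, which is convenient for logging.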
@@ -159,4 +159,4 @@ namespace huggingface::tgi::backends::trtllm {
);
}
}
#endif
@@ -78,4 +78,4 @@ namespace huggingface::tgi::hardware::cuda {
[[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
};
}
#endif
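The `is_at_least_hopper()` helper in this hunk builds on an `is_at_least(...)` ordering check between CUDA compute capabilities. A rough sketch of that idea is below; the type and constant names are placeholders rather than TGI's actual declarations, but the capability numbers 8.0, 8.9 and 9.0 are NVIDIA's published values for Ampere, Ada Lovelace and Hopper.

```cpp
#include <tuple>

// Placeholder sketch of a compute-capability type with an ordering check;
// TGI's real implementation in huggingface::tgi::hardware::cuda may differ.
struct compute_capability_t {
    int major;
    int minor;

    [[nodiscard]] constexpr bool is_at_least(compute_capability_t other) const {
        // Lexicographic comparison: (9, 0) >= (8, 9), etc.
        return std::tie(major, minor) >= std::tie(other.major, other.minor);
    }

    [[nodiscard]] constexpr bool is_at_least_hopper() const;
};

inline constexpr compute_capability_t AMPERE{8, 0};        // sm_80
inline constexpr compute_capability_t ADA_LOVELACE{8, 9};  // sm_89
inline constexpr compute_capability_t HOPPER{9, 0};        // sm_90

constexpr bool compute_capability_t::is_at_least_hopper() const {
    return is_at_least(HOPPER);
}

// Mirrors the checks exercised by the "is_at_least" test case later in this diff.
static_assert(HOPPER.is_at_least(AMPERE));
static_assert(HOPPER.is_at_least(ADA_LOVELACE));
static_assert(HOPPER.is_at_least(HOPPER));
static_assert(!AMPERE.is_at_least(HOPPER));
```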
@ -149,4 +149,4 @@ TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]")
|
|||||||
|
|
||||||
REQUIRE(config.getTemperature().has_value());
|
REQUIRE(config.getTemperature().has_value());
|
||||||
REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
|
REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
|
||||||
}
|
}
|
||||||
|
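The `REQUIRE_THAT` line above uses Catch2's `WithinAbs` matcher, which compares floating-point values within an absolute tolerance instead of with `==`. A self-contained illustration of the same pattern, using Catch2 v3 headers; the test name and numbers are purely illustrative:

```cpp
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>

// Illustrative only: floating-point results should be compared with a
// tolerance, since exact equality is fragile after conversions.
TEST_CASE("floating-point comparison with WithinAbs", "[example]") {
    const float expected = 0.7f;
    const float computed = 0.7000004f;  // differs by roughly 4e-7, within the 1e-6 margin

    REQUIRE_THAT(computed, Catch::Matchers::WithinAbs(expected, 1e-6f));
}
```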
@ -79,4 +79,4 @@ TEST_CASE("is_at_least") {
|
|||||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
|
REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
|
||||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
|
||||||
REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
|
REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
|
||||||
}
|
}
|
||||||
|
@@ -17,7 +17,7 @@ supported.
You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you
want to use.

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"

# Install huggingface_cli
@@ -32,7 +32,7 @@ mkdir -p /tmp/models/$MODEL_NAME
# Create a directory to store the compiled engine
mkdir -p /tmp/engines/$MODEL_NAME

# Download the model
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME

# Compile the engine using Optimum-NVIDIA
@@ -69,7 +69,7 @@ docker run \
-e MODEL=$MODEL_NAME \
-e PORT=3000 \
-e HF_TOKEN='hf_XXX' \
-v /tmp/engines/$MODEL_NAME:/data \
ghcr.io/huggingface/text-generation-inference:latest-trtllm \
--executor-worker executorWorker \
--model-id /data/$MODEL_NAME
@@ -78,4 +78,4 @@ docker run \
## Development

To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in
`.devcontainer` directory.
@ -1,13 +1,13 @@
|
|||||||
# Multi-backend support
|
# Multi-backend support
|
||||||
|
|
||||||
TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
|
TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
|
||||||
With multi-backend support, you can choose the backend that best suits your needs,
|
With multi-backend support, you can choose the backend that best suits your needs,
|
||||||
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
|
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
|
||||||
TGI remains consistent across backends, allowing you to switch between them seamlessly.
|
TGI remains consistent across backends, allowing you to switch between them seamlessly.
|
||||||
|
|
||||||
**Supported backends:**
|
**Supported backends:**
|
||||||
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
|
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
|
||||||
within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
|
within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
|
||||||
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
|
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
|
||||||
It utilizes specialized optimizations and custom kernels for enhanced performance.
|
It utilizes specialized optimizations and custom kernels for enhanced performance.
|
||||||
However, it requires a model-specific compilation step for each GPU architecture.
|
However, it requires a model-specific compilation step for each GPU architecture.
|
||||||
|
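The overview above stresses that HTTP interaction with TGI stays the same whichever backend serves the model. As a rough, hedged illustration (not part of the files changed in this commit), here is a minimal C++/libcurl client for the container launched in the docs above, assuming its port 3000 is published on localhost; TGI's `/generate` route accepts a JSON body with `inputs` and `parameters`:

```cpp
#include <curl/curl.h>
#include <string>

// Hedged sketch: the endpoint and JSON schema follow TGI's documented
// /generate API; the host, port and prompt are placeholders.
int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL* curl = curl_easy_init();
    if (!curl) return 1;

    const std::string body =
        R"({"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 20}})";

    curl_slist* headers = curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:3000/generate");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());

    // With no write callback set, libcurl prints the JSON response to stdout.
    const CURLcode res = curl_easy_perform(curl);

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}
```

Switching from the TRTLLM backend to the default CUDA backend changes the `docker run` image and flags, but not this request.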