Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-06-10 03:12:07 +00:00
feat(backend): add guard to multiple header definitions
parent 16ba2f5a2b
commit c94b9de445
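The change is the classic preprocessor include-guard idiom, applied to the two headers touched by this diff (one implementation file and two headers are affected). A minimal sketch of the pattern; the guard macro name is taken from the diff below, the body is purely illustrative:

// Sketch of the include-guard pattern this commit adds. On the first #include
// the macro is undefined, so the body is compiled and the macro gets defined;
// any later #include of the same header in the translation unit is skipped,
// preventing duplicate definitions.
#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM

// ... header contents (generation_config_t, backend_workspace_t, ...) ...

#endif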
@@ -1,9 +1,11 @@
 #include <ranges>
-#include "backend.hpp"
 
 #include <nlohmann/json.hpp>
 #include <spdlog/spdlog.h>
 
+#include "backend.hpp"
+#include "hardware.hpp"
+
 namespace huggingface::tgi::backends::trtllm {
     constexpr tle::ParallelConfig backend_workspace_t::parallel_config() const {
         // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
@@ -23,7 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
         return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
     }
 
-    constexpr tle::ExecutorConfig backend_workspace_t::executor_config() const {
+
+    tle::ExecutorConfig backend_workspace_t::executor_config() const {
         // Retrieve the compute capabilities to enable some options at runtime
         const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
 
@@ -73,4 +76,4 @@ namespace huggingface::tgi::backends::trtllm {
         SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
         executor_.cancelRequest(request_id);
     }
-}
+}
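The hunks above belong to the backend implementation file; the remaining hunks touch the two headers it includes, backend.hpp and hardware.hpp. Besides the guard work, this file also drops constexpr from backend_workspace_t::executor_config(), presumably because, as the in-code comment says, it retrieves the compute capabilities at runtime, and a constant expression cannot perform such a query. A hypothetical, minimal illustration of that distinction (not TGI code):

#include <cstdint>

struct workspace {
    // Fine as constexpr: the result depends only on compile-time-known members.
    constexpr int32_t world_size() const { return tp_ * pp_; }

    // Left as a plain runtime function: it has to ask the driver/device,
    // which cannot happen during constant evaluation.
    int32_t compute_capability() const;

    int32_t tp_ = 1;
    int32_t pp_ = 1;
};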
@@ -1,7 +1,8 @@
-#pragma once
+#ifndef TGI_BACKEND_TRTLLM
+#define TGI_BACKEND_TRTLLM
 
 #include <cmath>
 #include <cstdint>
 #include <exception>
 #include <expected>
 #include <fstream>
 #include <list>
@@ -13,8 +14,6 @@
 
 #include <tensorrt_llm/executor/executor.h>
 
-#include <hardware.hpp>
-
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
@@ -68,7 +67,7 @@ namespace huggingface::tgi::backends::trtllm {
         float_t temperature;
         std::list<std::vector<int32_t>> stop_words;
 
-        explicit generation_config_t(const json &config):
+        constexpr explicit generation_config_t(const json &config):
             top_p(config.value("top_p", 1.0f)), temperature( config.value("temperature", 1.0f)), stop_words(0) {
             if(config.contains("/eos_token_id"_json) && config["/eos_token_id"_json].is_array()) {
                 const auto& eos_token_id = config["eos_token_id"];
@@ -121,7 +120,7 @@ namespace huggingface::tgi::backends::trtllm {
         * `generation_config.json` holding default generation parameters.
         * @return `generation_config_t`
         */
-        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }
+        [[nodiscard]] constexpr const generation_config_t& generation_config() const { return generation_config_; }
 
        /**
         * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
@@ -135,7 +134,7 @@ namespace huggingface::tgi::backends::trtllm {
         * to initialize `tensorrt_llm::executor::Executor`
         * @return `tensorrt_llm::executor::ExecutorConfig` instance
         */
-        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
+        [[nodiscard]] tle::ExecutorConfig executor_config() const;
    };
 
    /**
@@ -220,4 +219,5 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_p
            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
-};
+};
+#endif
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef TGI_HARDWARE_CUDA
+#define TGI_HARDWARE_CUDA
 #include <cstdint>
 #include <optional>
 
@@ -77,4 +78,5 @@ namespace huggingface::tgi::hardware::cuda {
         */
        [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
    };
-}
+}
+#endif
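With the guards in place, including either header more than once in a translation unit is harmless, which is exactly what the commit title is guarding against. A tiny, hypothetical check (file names and the main function are illustrative only):

// Both headers are pulled in twice; the second expansion of each is empty
// because TGI_BACKEND_TRTLLM / TGI_HARDWARE_CUDA are already defined, so no
// redefinition errors occur.
#include "backend.hpp"
#include "hardware.hpp"
#include "backend.hpp"   // skipped by the guard
#include "hardware.hpp"  // skipped by the guard

int main() { return 0; }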