Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-11 03:42:08 +00:00)
feat(backend): add guard to multiple header definitions

commit c94b9de445 (parent 16ba2f5a2b)
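The commit swaps `#pragma once` for classic include guards in the TensorRT-LLM backend headers. As a minimal sketch of the pattern being introduced (the macro name is taken from the diff below):

// First inclusion: TGI_BACKEND_TRTLLM is undefined, so the body is
// compiled and the macro gets defined. Any later inclusion in the same
// translation unit sees the macro and skips straight to the #endif,
// preventing duplicate definitions of the header's contents.
#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM

// ... header contents (includes, types, declarations) ...

#endif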
backend.cpp

@@ -1,9 +1,11 @@
 #include <ranges>
-#include "backend.hpp"
 
 #include <nlohmann/json.hpp>
 #include <spdlog/spdlog.h>
 
+#include "backend.hpp"
+#include "hardware.hpp"
+
 namespace huggingface::tgi::backends::trtllm {
     constexpr tle::ParallelConfig backend_workspace_t::parallel_config() const {
         // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
@@ -23,7 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
         return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
     }
 
-    constexpr tle::ExecutorConfig backend_workspace_t::executor_config() const {
+
+    tle::ExecutorConfig backend_workspace_t::executor_config() const {
         // Retrieve the compute capabilities to enable some options at runtime
         const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
 
@@ -73,4 +76,4 @@ namespace huggingface::tgi::backends::trtllm {
         SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
         executor_.cancelRequest(request_id);
     }
 }
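Aside from the include reordering, `executor_config()` loses its `constexpr` qualifier here (matched by the declaration change in `backend.hpp` below). This is consistent with its body: `hardware::cuda::compute_capabilities_t()` inspects the GPU at runtime, and a function that can never be evaluated at compile time should not be `constexpr`. A hedged illustration of the distinction, with hypothetical names not taken from the repo:

#include <cstdint>

// Fine as constexpr: the result is a compile-time constant.
constexpr int32_t default_beam_width() { return 1; }

// Must stay a plain runtime function: the answer depends on the machine
// the program runs on (e.g. a cudaGetDeviceProperties query underneath).
int32_t detected_compute_capability();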
backend.hpp

@@ -1,7 +1,8 @@
-#pragma once
+#ifndef TGI_BACKEND_TRTLLM
+#define TGI_BACKEND_TRTLLM
+
 #include <cmath>
 #include <cstdint>
-#include <exception>
 #include <expected>
 #include <fstream>
 #include <list>
@@ -13,8 +14,6 @@
 
 #include <tensorrt_llm/executor/executor.h>
 
-#include <hardware.hpp>
-
 namespace huggingface::tgi::backends::trtllm {
     namespace tle = tensorrt_llm::executor;
     using json = nlohmann::json;
@@ -68,7 +67,7 @@
         float_t temperature;
         std::list<std::vector<int32_t>> stop_words;
 
-        explicit generation_config_t(const json &config):
+        constexpr explicit generation_config_t(const json &config):
             top_p(config.value("top_p", 1.0f)), temperature( config.value("temperature", 1.0f)), stop_words(0) {
             if(config.contains("/eos_token_id"_json) && config["/eos_token_id"_json].is_array()) {
                 const auto& eos_token_id = config["eos_token_id"];
@@ -121,7 +120,7 @@
          * `generation_config.json` holding default generation parameters.
          * @return `generation_config_t`
          */
-        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }
+        [[nodiscard]] constexpr const generation_config_t& generation_config() const { return generation_config_; }
 
         /**
          * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
@@ -135,7 +134,7 @@
          * to initialize `tensorrt_llm::executor::Executor`
          * @return `tensorrt_llm::executor::ExecutorConfig` instance
          */
-        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
+        [[nodiscard]] tle::ExecutorConfig executor_config() const;
     };
 
     /**
@@ -220,4 +219,5 @@ template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_p
             c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
         );
     }
 };
+#endif
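With the guard in place, `backend.hpp` can safely be reached through several include paths inside one translation unit; the second and later expansions are empty, so nothing is defined twice. A hypothetical sketch (file and include names illustrative, not from the repo) of the failure mode the guard removes:

// some_file.cpp
#include "backend.hpp"          // first expansion defines the types
#include "other_header.hpp"     // hypothetically also includes backend.hpp;
                                // the guard makes that second expansion a
                                // no-op instead of a redefinition error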
hardware.hpp

@@ -1,4 +1,5 @@
-#pragma once
+#ifndef TGI_HARDWARE_CUDA
+#define TGI_HARDWARE_CUDA
 #include <cstdint>
 #include <optional>
 
@@ -77,4 +78,5 @@ namespace huggingface::tgi::hardware::cuda {
          */
         [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
     };
 }
+#endif
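Functionally, an include guard and `#pragma once` achieve the same thing; the guard spelling is standard C++, while `#pragma once` is a (near-universally supported) compiler extension. The cost of guards is a naming obligation: each header needs a project-unique macro, which is why the two headers here use distinct names, `TGI_BACKEND_TRTLLM` and `TGI_HARDWARE_CUDA`.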