Mirror of https://github.com/huggingface/text-generation-inference.git

chore(backend): minor fixes mostly format

Commit 7eec0f704f (parent a1154b17ec)
@@ -99,11 +99,11 @@ fn main() {
     println!("cargo:rustc-link-search=native={}", out_dir.display());

     if is_debug {
-        println!("cargo:rustc-link-lib=static=fmtd");
-        println!("cargo:rustc-link-lib=static=spdlogd");
+        println!("cargo:rustc-link-lib=dylib=fmtd");
+        println!("cargo:rustc-link-lib=dylib=spdlogd");
     } else {
-        println!("cargo:rustc-link-lib=fmt");
-        println!("cargo:rustc-link-lib=spdlog");
+        println!("cargo:rustc-link-lib=dylib=fmt");
+        println!("cargo:rustc-link-lib=dylib=spdlog");
     }

     println!("cargo:rustc-link-lib=static=common");
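As background for the hunk above: cargo's rustc-link-lib directive takes an optional kind, where static= bundles the archive into the produced binary and dylib= records a shared-library dependency resolved at load time, so this change switches fmt/spdlog to dynamic linking while the project-local common library stays static. A minimal standalone build-script sketch along the same lines (deriving is_debug from cargo's PROFILE variable is an assumption for illustration, not necessarily how this crate does it):

// build.rs sketch: emit the same kind of linker directives as the hunk above.
// dylib= asks rustc to link the library dynamically; static= embeds it.
use std::env;
use std::path::PathBuf;

fn main() {
    // OUT_DIR is set by cargo for every build script.
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    println!("cargo:rustc-link-search=native={}", out_dir.display());

    // Assumption: detect debug builds via cargo's PROFILE variable; debug
    // builds of fmt/spdlog carry a trailing "d" in the library name.
    let is_debug = env::var("PROFILE").map(|p| p == "debug").unwrap_or(false);
    if is_debug {
        println!("cargo:rustc-link-lib=dylib=fmtd");
        println!("cargo:rustc-link-lib=dylib=spdlogd");
    } else {
        println!("cargo:rustc-link-lib=dylib=fmt");
        println!("cargo:rustc-link-lib=dylib=spdlog");
    }

    // The project-local library stays statically linked.
    println!("cargo:rustc-link-lib=static=common");
}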
@@ -170,7 +170,7 @@ namespace huggingface::tgi::backends::llamacpp {
               mContext_(llama_context_factory(model)),
               mWorker_(mModel_, params.value_or(llama_context_default_params())) {
         llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
-    }
+    };

     std::expected<size_t, backend_error_t>
     single_worker_backend_t::stream(
@@ -157,10 +157,11 @@ namespace huggingface::tgi::backends::llamacpp {

     class single_worker_backend_t : backend_base_t {
     private:
-        constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr {
+        constexpr static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr {
             auto llParams = llama_context_default_params();
             llParams.flash_attn = true;
             llParams.n_batch = 1;
+            llParams.n_threads = 1;
             llParams.no_perf = true;
             llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL;

@@ -173,6 +174,8 @@ namespace huggingface::tgi::backends::llamacpp {
     public:
         explicit single_worker_backend_t(llama_model *pModel, const std::optional<llama_context_params> &);

+        using backend_base_t::generate;
+
         std::expected<size_t, backend_error_t> stream(
                 std::span<const llama_token> tokens,
                 const generation_params_t &generation_params,
@@ -185,6 +188,8 @@ namespace huggingface::tgi::backends::llamacpp {
         llama_context_ptr mContext_;

     public:
+        using backend_base_t::generate;
+
         std::expected<size_t, backend_error_t> stream(
                 std::span<const llama_token> tokens,
                 const generation_params_t &generation_params,
@@ -70,7 +70,7 @@ pub enum LlamaCppBackendError {

 pub struct LlamaCppBackend {
     backlog: Sender<(GenerationContext, UnboundedSender<InferResult>)>,
-    scheduler_handle: JoinHandle<()>,
+    _scheduler_handle: JoinHandle<()>,
 }

 impl LlamaCppBackend {
@@ -101,7 +101,7 @@ impl LlamaCppBackend {
         let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) };
         Ok(Self {
             backlog: submitter,
-            scheduler_handle: handle,
+            _scheduler_handle: handle,
         })
     }
 }
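In the two hunks above, the handle field gains a leading underscore: it is stored only so the spawned scheduler's JoinHandle stays owned by the backend and is never read afterwards, and the underscore marks it as intentionally unused so rustc's dead_code lint stays quiet. A minimal standalone sketch of the pattern (Backend and start are placeholder names, not the TGI types):

use std::thread::{self, JoinHandle};

// Placeholder types for illustration only. The field is written once at
// construction and never read, which would normally trigger the dead_code
// lint; the leading underscore declares that this is intentional while the
// handle itself still lives as long as the struct does.
struct Backend {
    _scheduler_handle: JoinHandle<()>,
}

fn start() -> Backend {
    // Stand-in for spawning the scheduler loop.
    let handle = thread::spawn(|| { /* scheduler work would run here */ });
    Backend {
        _scheduler_handle: handle,
    }
}

fn main() {
    let backend = start();
    // The thread handle is kept alive for exactly as long as the backend.
    drop(backend);
}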