Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 06:42:10 +00:00)
commit 3ece76392b (parent cdbf73eef8)

Apply suggestions from code review

Co-authored-by: drbh <david.richard.holtz@gmail.com>
@@ -1498,7 +1498,6 @@ fn main() -> Result<(), LauncherError> {
     let config: Config = config.into();
     match config.head_dim {
         Some(h) if h == 64 || h == 128 || h == 256 => {
-            // std::env::set_var("ATTENTION", "flashdecoding");
             if args.lora_adapters.is_some() {
                 tracing::info!("Disabling prefix caching because of lora adapters");
                 std::env::set_var("USE_PREFIX_CACHING", "0");
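The `USE_PREFIX_CACHING` variable set by the launcher above is consumed by the Python server as a plain environment flag. A minimal sketch of that consumer side, using only the standard-library `os` module (the helper name `prefix_caching_enabled` is illustrative, not TGI's actual API):

import os

def prefix_caching_enabled() -> bool:
    # Hypothetical helper: the launcher exports USE_PREFIX_CACHING=0 when
    # LoRA adapters are active; treat anything but an explicit "0" as on.
    return os.environ.get("USE_PREFIX_CACHING", "1") != "0"

if __name__ == "__main__":
    os.environ["USE_PREFIX_CACHING"] = "0"
    assert not prefix_caching_enabled()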
@@ -233,7 +233,6 @@ if ATTENTION == "flashinfer":
         causal=True,
         softcap=0.0,
     ):
-        # assert window_size_left == -1, "Windowing is not supported with flash infer"
         from text_generation_server.layers.attention.flashinfer import (
            prefill_with_paged_kv_state,
        )
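The Python hunk above sits under an `if ATTENTION == "flashinfer":` branch, with the flashinfer import deferred into the function body so the dependency is only loaded when that backend is selected. A minimal sketch of the same lazy-import dispatch, assuming the real `text_generation_server` package is importable (the `resolve_prefill_state` helper is hypothetical):

import importlib
import os

# Mirrors the module-level backend selector in the hunk above
# (the default value here is an assumption).
ATTENTION = os.environ.get("ATTENTION", "flashinfer")

def resolve_prefill_state():
    # Hypothetical helper: defer the flashinfer import until the backend
    # is actually selected, so other backends need not install it.
    if ATTENTION != "flashinfer":
        raise RuntimeError(f"flashinfer backend not selected: {ATTENTION}")
    mod = importlib.import_module(
        "text_generation_server.layers.attention.flashinfer"
    )
    return mod.prefill_with_paged_kv_state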