diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 28534d0f..de2c0651 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -517,14 +517,13 @@ class CausalLM(Model):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            device = torch.device(f"xpu:{rank}")
+            dtype = default_dtype if dtype is None else dtype
         elif SYSTEM == "ipex":
-            if hasattr(torch, "xpu") and torch.xpu.is_available():
-                device = torch.device(f"xpu:{rank}")
-                dtype = default_dtype if dtype is None else dtype
-            else:
-                device = torch.device("cpu")
-                # Float16 doesn't exist on target.
-                dtype = torch.bfloat16 if dtype is None else dtype
+            device = torch.device("cpu")
+            # Float16 doesn't exist on target.
+            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
             dtype = torch.float32 if dtype is None else dtype
@@ -593,8 +592,14 @@ class CausalLM(Model):
         if speculator:
             raise RuntimeError("Speculator decoding is not enabled for AutoModel")
 
+        device_count = 0
         if torch.cuda.is_available():
             device = torch.device("cuda")
+            device_count = torch.cuda.device_count()
+            dtype = torch.float16 if dtype is None else dtype
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            device = torch.device("xpu")
+            device_count = torch.xpu.device_count()
             dtype = torch.float16 if dtype is None else dtype
         else:
             if quantize:
@@ -616,18 +621,17 @@ class CausalLM(Model):
             torch_dtype=dtype,
             device_map=(
                 "auto"
-                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                if device_count > 1
                 else None
             ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
         if (
-            torch.cuda.is_available()
-            and torch.cuda.device_count() == 1
+            device_count == 1
             and quantize != "bitsandbytes"
         ):
-            model = model.cuda()
+            model = model.to(device)
 
         if tokenizer.pad_token_id is None:
             if model.config.pad_token_id is not None:
diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
index 04d4c28b..94f87d02 100644
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -558,14 +558,13 @@ class Seq2SeqLM(Model):
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = default_dtype if dtype is None else dtype
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            device = torch.device(f"xpu:{rank}")
+            dtype = default_dtype if dtype is None else dtype
         elif SYSTEM == "ipex":
-            if hasattr(torch, "xpu") and torch.xpu.is_available():
-                device = torch.device(f"xpu:{rank}")
-                dtype = default_dtype if dtype is None else dtype
-            else:
-                device = torch.device("cpu")
-                # Float16 doesn't exist on target.
-                dtype = torch.bfloat16 if dtype is None else dtype
+            device = torch.device("cpu")
+            # Float16 doesn't exist on target.
+            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
             dtype = torch.float32 if dtype is None else dtype
@@ -630,8 +629,14 @@ class Seq2SeqLM(Model):
         if speculator:
             raise RuntimeError("Speculator decoding is not enabled for AutoModel")
 
+        device_count = 0
         if torch.cuda.is_available():
             device = torch.device("cuda")
+            device_count = torch.cuda.device_count()
+            dtype = torch.float16 if dtype is None else dtype
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            device = torch.device("xpu")
+            device_count = torch.xpu.device_count()
             dtype = torch.float16 if dtype is None else dtype
         else:
             if quantize:
@@ -646,14 +651,14 @@ class Seq2SeqLM(Model):
             torch_dtype=dtype,
             device_map=(
                 "auto"
-                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                if device_count > 1
                 else None
             ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
-            model = model.cuda()
+        if device_count == 1:
+            model = model.to(device)
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index 782b4f15..b693258c 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -66,6 +66,11 @@ elif is_ipex_available():
     empty_cache = noop
     synchronize = noop
     get_free_memory = get_cpu_free_memory
+elif hasattr(torch, "xpu") and torch.xpu.is_available():
+    SYSTEM = "xpu"
+    empty_cache = torch.xpu.empty_cache
+    synchronize = torch.xpu.synchronize
+    get_free_memory = get_xpu_free_memory
 else:
     SYSTEM = "cpu"