mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-25 20:12:07 +00:00
fix TP in pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in: parent 201dc6294f, commit b7fea6fc2f
@@ -112,7 +112,7 @@ def serve(
     logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))

-    if sharded:
+    if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
         tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
         num_shard = int(os.getenv("WORLD_SIZE", "1"))
         logger.info("CLI SHARDED = {}".format(num_shard))
|
@@ -1532,7 +1532,10 @@ fn spawn_shards(
 ) -> Result<(), LauncherError> {
     // Start shard processes
     for rank in 0..num_shard {
-        if rank != 0 && env_runtime::Env::new().is_hpu_device() {
+        if rank != 0
+            && env_runtime::Env::new().is_hpu_device()
+            && std::env::var("ATTENTION").as_deref() != Ok("paged")
+        {
             tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server");
             break;
         }
Loading…
Reference in New Issue
Block a user