Fix tensor parallelism (TP) in paged attention

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
Wang, Yi A 2025-03-14 18:01:58 -07:00
parent 201dc6294f
commit b7fea6fc2f
2 changed files with 5 additions and 2 deletions

View File

@@ -112,7 +112,7 @@ def serve(
logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype)) logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))
if sharded: if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
tgi_file = Path(__file__).resolve().parent / "tgi_service.py" tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
num_shard = int(os.getenv("WORLD_SIZE", "1")) num_shard = int(os.getenv("WORLD_SIZE", "1"))
logger.info("CLI SHARDED = {}".format(num_shard)) logger.info("CLI SHARDED = {}".format(num_shard))

View File

@@ -1532,7 +1532,10 @@ fn spawn_shards(
) -> Result<(), LauncherError> { ) -> Result<(), LauncherError> {
// Start shard processes // Start shard processes
for rank in 0..num_shard { for rank in 0..num_shard {
if rank != 0 && env_runtime::Env::new().is_hpu_device() { if rank != 0
&& env_runtime::Env::new().is_hpu_device()
&& std::env::var("ATTENTION").as_deref() != Ok("paged")
{
tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server"); tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server");
break; break;
} }