mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-25 20:12:07 +00:00
fix TP in pageattn
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in: parent 201dc6294f, commit b7fea6fc2f
@@ -112,7 +112,7 @@ def serve(
     logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))

-    if sharded:
+    if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
         tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
         num_shard = int(os.getenv("WORLD_SIZE", "1"))
         logger.info("CLI SHARDED = {}".format(num_shard))
|
@@ -1532,7 +1532,10 @@ fn spawn_shards(
 ) -> Result<(), LauncherError> {
     // Start shard processes
     for rank in 0..num_shard {
-        if rank != 0 && env_runtime::Env::new().is_hpu_device() {
+        if rank != 0
+            && env_runtime::Env::new().is_hpu_device()
+            && std::env::var("ATTENTION").as_deref() != Ok("paged")
+        {
             tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server");
             break;
         }
Loading…
Reference in New Issue
Block a user