Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-04-21 23:12:07 +00:00

Commit 17f5c3078b: "working & cached tunableop"
Parent: 193dbb683e
@@ -672,9 +672,7 @@ fn shard_manager(
         // We received a shutdown signal
         if shutdown.load(Ordering::SeqCst) {
-            p.kill().unwrap();
-            let _ = p.wait();
-            tracing::info!("Shard terminated");
+            terminate("shard", p, Duration::from_secs(90)).unwrap();
             return;
         }
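
Note: the terminate helper is defined outside this hunk, so only its call site is visible here. A minimal Python sketch of the same pattern (ask the shard to exit cleanly, wait up to a grace period, then force-kill); the polling interval and messages are illustrative, not taken from the launcher:

    import subprocess
    import time

    def terminate(name: str, p: subprocess.Popen, timeout_s: float) -> None:
        # Ask the child process to exit cleanly first (SIGTERM on POSIX).
        p.terminate()
        deadline = time.monotonic() + timeout_s
        while time.monotonic() < deadline:
            if p.poll() is not None:
                print(f"{name} terminated gracefully")
                return
            time.sleep(0.1)
        # The process did not exit within the grace period; force-kill it.
        p.kill()
        p.wait()
        print(f"{name} killed after {timeout_s}s grace period")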
@@ -766,6 +766,9 @@ class FlashCausalLM(Model):
             )
             max_bt = batch.max_blocks
             max_s = max_bt * get_cache_manager().block_size
+
+            if IS_ROCM_SYSTEM and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
+                logger.info("PyTorch TunableOp (https://github.com/pytorch/pytorch/tree/v2.3.0/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes.")
             _, batch, _ = self.generate_token(batch)
         except torch.cuda.OutOfMemoryError as e:
             raise RuntimeError(
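
Note on the toggle: os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False) returns the raw string, so even PYTORCH_TUNABLEOP_ENABLED=0 counts as enabled here. A stricter parse, together with the results-file variable that (per the upstream TunableOp README, if I read it correctly) lets tuning results be cached between runs, could look like this sketch; the file name is an assumption, not taken from the commit:

    import os

    def tunableop_enabled() -> bool:
        # Strict boolean parse: unset, "0" and "false" all disable TunableOp.
        return os.environ.get("PYTORCH_TUNABLEOP_ENABLED", "0").lower() in ("1", "true", "yes")

    # Persist tuning results so later runs can skip the lengthy warmup.
    os.environ.setdefault("PYTORCH_TUNABLEOP_FILENAME", "tunableop_results.csv")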
@@ -820,10 +823,10 @@ class FlashCausalLM(Model):
         else:
             logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).")

-        # TODO: fix
-        if IS_ROCM_SYSTEM and False:
-            total_seqlens = list(range(16))
+        if IS_ROCM_SYSTEM and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
+            total_seqlens = list(range(2))
             for seqlen in total_seqlens:
+                logger.info(f"Warming up TunableOp for seqlen={seqlen}")
                 self.tunableop_warmup(seqlen, max_s, max_bt)

         return int(num_blocks * BLOCK_SIZE)
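
tunableop_warmup itself is not shown in this diff. The reason for warming up each seqlen is that TunableOp tunes and caches a GEMM solution per problem shape, so every decode shape exercised up front has its winner recorded before serving starts. A standalone sketch of that idea (hidden size and dtype are placeholders, not the model's real dimensions):

    import torch

    def warmup_gemm(seqlen: int, hidden_size: int = 4096) -> None:
        # Running a matmul at the shape seen during decoding lets TunableOp
        # benchmark candidate kernels once and reuse the winner afterwards.
        m = max(seqlen, 1)
        x = torch.randn(m, hidden_size, device="cuda", dtype=torch.float16)
        w = torch.randn(hidden_size, hidden_size, device="cuda", dtype=torch.float16)
        torch.matmul(x, w)
        torch.cuda.synchronize()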
@@ -166,6 +166,21 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             total_ns=time.time_ns() - start,
         )

+import signal
+
+
+class SignalHandler:
+    KEEP_PROCESSING = True
+
+    def __init__(self):
+        signal.signal(signal.SIGINT, self.exit_gracefully)
+        signal.signal(signal.SIGTERM, self.exit_gracefully)
+
+    def exit_gracefully(self, signum, frame):
+        print(f"Exiting gracefully: Signal {signum}")
+        self.KEEP_PROCESSING = False
+
+
+signal_handler = SignalHandler()
+
+
 def serve(
     model_id: str,
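
One subtlety worth noting: KEEP_PROCESSING starts life as a class attribute, and exit_gracefully then assigns an instance attribute that shadows it. That is fine here because the server polls the shared signal_handler instance, but the class-level value itself never changes. A tiny standalone demo of that behaviour (not part of the commit):

    class Flag:
        KEEP_PROCESSING = True

        def stop(self):
            # Creates an instance attribute shadowing the class attribute.
            self.KEEP_PROCESSING = False

    flag = Flag()
    flag.stop()
    print(flag.KEEP_PROCESSING)   # False (instance attribute)
    print(Flag.KEEP_PROCESSING)   # True (class attribute untouched)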
@@ -231,11 +246,8 @@ def serve(

         logger.info("Server started at {}".format(local_url))

-        try:
-            await server.wait_for_termination()
-        except KeyboardInterrupt:
-            logger.info("Signal received. Shutting down")
-            await server.stop(0)
+        while signal_handler.KEEP_PROCESSING:
+            await asyncio.sleep(0.5)

     asyncio.run(
         serve_inner(
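
With wait_for_termination() removed, shutdown now hinges entirely on the polled flag: the launcher's SIGTERM flips KEEP_PROCESSING and the coroutine simply falls out of the loop. If a graceful gRPC stop were still wanted after the loop, a sketch might look like the following; the helper name and the grace value are assumptions, not taken from the commit:

    import asyncio

    async def run_until_signal(server, signal_handler, poll_interval: float = 0.5):
        # Poll the flag instead of blocking forever in wait_for_termination().
        while signal_handler.KEEP_PROCESSING:
            await asyncio.sleep(poll_interval)
        # Optionally drain in-flight RPCs before exiting (grace in seconds).
        await server.stop(grace=30)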
@@ -4,6 +4,11 @@ from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM

 _PARTITION_SIZE = 512

+try:
+    from vllm._C import cache_ops
+except Exception as e:
+    raise ImportError(f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}")
+

 def reshape_and_cache(
     key: torch.Tensor,
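
The branches removed in the next hunk show that CUDA builds expose cache_ops under vllm._C while the older ROCm fork exposed it under vllm, so a build that only provides the latter would now fail at import time. If both layouts need to be tolerated, a fallback import along these lines would cover it (a sketch, not part of the commit):

    try:
        from vllm._C import cache_ops  # layout used by CUDA wheels
    except ImportError:
        try:
            from vllm import cache_ops  # layout used by the older ROCm fork
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
            )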
@@ -12,18 +17,9 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    if IS_CUDA_SYSTEM:
-        from vllm._C import cache_ops
-
-        cache_ops.reshape_and_cache(
-            key, value, key_cache, value_cache, slots, "auto", 1.0
-        )
-    elif IS_ROCM_SYSTEM:
-        from vllm import cache_ops
-
-        cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
-    else:
-        raise ValueError("vllm is not supported on your system")
+    cache_ops.reshape_and_cache(
+        key, value, key_cache, value_cache, slots, "auto", 1.0
+    )


 def attention(
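
Two remarks on the simplification: the extra "auto" and 1.0 arguments appear to be the kv-cache dtype selector and scaling factor of the newer kernel signature (hedged; the vllm version is not pinned in this diff), and the old else branch that rejected unsupported systems is gone entirely. If that guard is still wanted, it could sit next to the module-level import, for example:

    # Sketch only: reinstate the old unsupported-system guard at import time.
    if not (IS_CUDA_SYSTEM or IS_ROCM_SYSTEM):
        raise ValueError("vllm is not supported on your system")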