remove profiling

2025-06-19 15:52:08 +00:00 · 2023-04-06 17:58:54 +02:00 · 2023-04-06 17:58:54 +02:00 · c3779fa859
commit c3779fa859
parent 26fc232afb
1 changed file with 9 additions and 24 deletions
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -39,18 +39,11 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
        return generate_pb2.ClearCacheResponse()

    async def Prefill(self, request, context):
-        from torch.profiler import profile, ProfilerActivity
-
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]
-        ) as prefill_prof:
-            batch = self.model.batch_type.from_pb(
-                request.batch, self.model.tokenizer, self.model.device
-            )
-
-            generations, next_batch = self.model.generate_token(batch)
-        prefill_prof.export_chrome_trace("prefill.json")
+        batch = self.model.batch_type.from_pb(
+            request.batch, self.model.tokenizer, self.model.device
+        )

+        generations, next_batch = self.model.generate_token(batch)
        self.cache.set(next_batch)

        return generate_pb2.PrefillResponse(
@@ -69,20 +62,12 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
                raise ValueError(f"Batch ID {batch_pb.id} not found in cache.")
            batches.append(batch)

-        from torch.profiler import profile, ProfilerActivity
-
-        with profile(
-            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]
-        ) as decode_prof:
-
-            if len(batches) > 1:
-                batch = self.model.batch_type.concatenate(batches)
-            else:
-                batch = batches[0]
-
-            generations, next_batch = self.model.generate_token(batch)
-        decode_prof.export_chrome_trace("decode.json")
+        if len(batches) > 1:
+            batch = self.model.batch_type.concatenate(batches)
+        else:
+            batch = batches[0]

+        generations, next_batch = self.model.generate_token(batch)
        self.cache.set(next_batch)

        return generate_pb2.DecodeResponse(