Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-04-20 06:12:07 +00:00
optimize code
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 705cc0b619
commit a84da5b698
@@ -328,6 +328,8 @@ class FlashCausalLMBatch(Batch):
            ### Deactivating it by default seems like the best course.
            if not REQUEST_LOGPROBS:
                r.prefill_logprobs = False
            else:
                assert False, "prefill_logprobs not supported yet"
            # request id -> idx in list mapping
            requests_idx_mapping[r.id] = i

@@ -1847,10 +1849,6 @@ class FlashCausalLM(Model):
                if prefill_logprobs
                else speculative_logits
            )
            if len(batch) > 1 and prefill_logprobs:
                # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs
                # When batch == 1, we will just use the batch.input_ids values directly
                prefill_tokens_indices = batch.input_ids.new_zeros(len(out))
        else:
            prefill_logprobs = None
            next_token_logits = out
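A side note on the `batch.input_ids.new_zeros(len(out))` pattern in the hunk above: `Tensor.new_zeros` allocates the index buffer with the same dtype and device as `batch.input_ids`, so it can later be filled with token ids and used directly as gather indices. A minimal standalone sketch (shapes and values are illustrative, not taken from the repo):

    import torch

    # Stand-ins for batch.input_ids and the flattened prefill logits `out`
    # (one row of logits per prefill token across the whole batch).
    input_ids = torch.tensor([3, 1, 4, 1, 5], dtype=torch.int64)
    out = torch.randn(5, 8)

    # new_zeros inherits dtype and device from input_ids, so the buffer is
    # ready to hold token ids and be used as gather indices without casts.
    prefill_tokens_indices = input_ids.new_zeros(len(out))
    print(prefill_tokens_indices.dtype, prefill_tokens_indices.shape)
    # torch.int64 torch.Size([5])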
@@ -1900,19 +1898,6 @@ class FlashCausalLM(Model):
            batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
                indices
            ]

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.prompt_lengths,
            batch.cache_lengths,
            batch.input_lengths,
            batch.all_input_ids,
            accepted_ids,
            current_prefilling_mask,
            batch.prefilling_mask,
        )

        # We do two for loops as the first one can run completely asynchronously from the GPU while for the second
        # one, we need to first do a HPU <-> CPU sync
        # It is faster if we delay this sync for the maximum amount of time
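The two-loop comment above is the key scheduling idea: the first loop only builds tensor operations, which the device can execute asynchronously, while host-side work that needs concrete values is postponed until a single HPU <-> CPU sync. A small illustrative sketch of that structure (plain CPU tensors here, so the sync is a no-op, but the shape of the code is the same):

    import torch

    def two_pass_update(logits: torch.Tensor, accepted_ids: torch.Tensor):
        # Pass 1: pure tensor ops, queued on the device without forcing a
        # device <-> host synchronization.
        next_ids = logits.argmax(dim=-1)
        cu_accepted = accepted_ids.new_zeros(accepted_ids.shape[0] + 1)
        torch.cumsum(accepted_ids, dim=0, out=cu_accepted[1:])

        # Pass 2: .tolist() is the single sync point; delaying it lets the
        # device keep working through pass 1.
        return next_ids.tolist(), cu_accepted.tolist()

    ids, offsets = two_pass_update(torch.randn(4, 16), torch.tensor([1, 2, 1, 3]))
    print(ids, offsets)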
@@ -1921,38 +1906,8 @@ class FlashCausalLM(Model):
        # Cumulative length
        cu_accepted_ids = accepted_ids.new_zeros(accepted_ids.shape[0] + 1)
        torch.cumsum(accepted_ids, dim=0, out=cu_accepted_ids[1:])
        cumulative_length = 0
        for i, (
            request,
            prompt_length,
            cache_length,
            input_length,
            all_input_ids,
            n_accepted_ids,
            request_was_prefilling,
            request_is_prefilling,
        ) in enumerate(iterator):
            # Used to gather prefill logprobs
            # Copy batch.all_input_ids_tensor to prefill_token_indices
            if request.prefill_logprobs and request_was_prefilling:
                # Indexing metadata
                out_start_index = batch.prefill_cu_outlens[i]
                out_end_index = batch.prefill_cu_outlens[i + 1]

                # Logprobs generated by the model are for the next token
                # So we need to translate the id tensor by 1
                ids = batch.all_input_ids_tensor[
                    i, cache_length + 1 : cache_length + input_length + 1
                ]
                if len(batch) > 1:
                    prefill_tokens_indices[out_start_index:out_end_index] = ids
                else:
                    # Set prefill_tokens_indices to the correct slice
                    prefill_tokens_indices = ids

            # If the device does not support triton, we copy one by one
            if not request_is_prefilling:
                # Only save tokens if we are done prefilling for this request
        if speculative_logits is not None:
            for i in range(len(batch)):
                batch.all_input_ids_tensor[
                    i,
                    batch.cache_lengths_tensor[i]
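For readers unfamiliar with the `cu_accepted_ids` bookkeeping above: the cumulative sum turns per-request acceptance counts (speculative decoding can accept several tokens per step) into [start, end) offsets into the flat `next_input_ids` tensor. A toy example with made-up values:

    import torch

    # Hypothetical flat tensor of newly accepted tokens for three requests,
    # and how many tokens each request accepted this step.
    next_input_ids = torch.tensor([11, 12, 13, 21, 31, 32])
    accepted_ids = torch.tensor([3, 1, 2])

    # Exclusive prefix sums give each request's [start, end) slice.
    cu_accepted_ids = accepted_ids.new_zeros(accepted_ids.shape[0] + 1)
    torch.cumsum(accepted_ids, dim=0, out=cu_accepted_ids[1:])

    for i in range(len(accepted_ids)):
        chunk = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
        print(i, chunk.tolist())
    # 0 [11, 12, 13]
    # 1 [21]
    # 2 [31, 32]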
@@ -1960,7 +1915,17 @@ class FlashCausalLM(Model):
                    + batch.input_lengths[i]
                    + accepted_ids[i],
                ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
            cumulative_length += input_length
        else:
            index = batch.cache_lengths_tensor + batch.input_lengths_tensor
            batch_idx = torch.arange(
                0,
                batch.all_input_ids_tensor.shape[0],
                dtype=torch.long,
                device=batch.input_lengths_tensor.device,
            )
            batch.all_input_ids_tensor.index_put_(
                (batch_idx, index.long()), next_input_ids
            )

        # Update values
        # These values can be updated without a HPU -> CPU sync
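The `else` branch above is the actual optimization: when there are no speculative logits, every request accepts exactly one token, so a single `index_put_` writes all of them at once instead of looping over requests in Python. A toy reproduction of that write (shapes and values are illustrative, not from the repo):

    import torch

    # One accepted token per request, stored at position
    # cache_length + input_length of each row of all_input_ids.
    all_input_ids = torch.zeros(3, 8, dtype=torch.int64)
    cache_lengths = torch.tensor([0, 2, 1])
    input_lengths = torch.tensor([3, 1, 4])
    next_input_ids = torch.tensor([101, 102, 103])

    index = cache_lengths + input_lengths
    batch_idx = torch.arange(0, all_input_ids.shape[0], dtype=torch.long)

    # One fused indexed write replaces the per-request Python loop and keeps
    # the whole update on the device.
    all_input_ids.index_put_((batch_idx, index.long()), next_input_ids)
    print(all_input_ids)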
@@ -1976,16 +1941,6 @@ class FlashCausalLM(Model):
        batch.input_lengths_tensor = torch.ones_like(batch.input_lengths_tensor)
        batch.slot_indices += accepted_ids

        if prefill and prefill_logprobs:
            # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
            torch.log_softmax(out, -1, out=out)
            prefill_logprobs_tensor = out
            prefill_logprobs = torch.gather(
                prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1)
            )
            # HPU <-> CPU sync
            prefill_logprobs = prefill_logprobs.view(-1).tolist()

        # Does a HPU <-> CPU sync internally
        if prefill and finished_prefilling:
            # adjust segment lengths to account for all request lengths being 1 during decoding
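The prefill-logprobs block above avoids materialising a second (tokens x vocab) tensor by running `log_softmax` in place on `out` and then gathering one value per row; the trailing `.tolist()` is the single HPU <-> CPU sync. A self-contained sketch with toy shapes (not the repo's real dimensions):

    import torch

    # `out`: one row of logits per prefill token; `prefill_tokens_indices`:
    # the token id whose logprob we want for each row.
    out = torch.randn(5, 16)
    prefill_tokens_indices = torch.randint(0, 16, (5,))

    # In-place log-softmax reuses the (potentially huge) logits buffer.
    torch.log_softmax(out, -1, out=out)

    # gather keeps one value per row; .tolist() is the device <-> host sync.
    prefill_logprobs = torch.gather(out, 1, prefill_tokens_indices.view(-1, 1))
    prefill_logprobs = prefill_logprobs.view(-1).tolist()
    print(prefill_logprobs)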