diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index fe98ff4c..421af39d 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -140,6 +140,15 @@ def remove_kv_cache_from_output(module):
     return module


+def pad_tensors(tensors, paddings, dim, value):
+    for i, (tensor, padding) in enumerate(zip(tensors, paddings)):
+        if padding > 0:
+            pad_shape = (0, 0, 0, padding) if dim == -2 else (0, padding)
+            tensors[i] = torch.nn.functional.pad(tensor, pad_shape, value=value)
+            htorch.core.mark_step()
+    return tensors
+
+
 @dataclass
 class CausalLMRequest:
     idx: int
@@ -196,7 +205,7 @@ class CausalLMBatch(Batch):
         )

     @classmethod
-    def recombine(cls, batches: List["CausalLMBatch"], is_optimized_for_gaudi: bool = False) -> "CausalLMBatch":
+    def recombine(cls, batches: List["CausalLMBatch"], pad_token_id: int) -> "CausalLMBatch":
         total_requests = sum(len(b) for b in batches)
         new_bs = round_up(total_requests, BATCH_BUCKET_SIZE)
         batch_id = batches[0].batch_id
@@ -224,7 +233,8 @@ class CausalLMBatch(Batch):
             return batches[0]

         inplace = batches[target_batch_idx].batch_size == new_bs
-        dbg_trace(scenario, f'bs:{[b.batch_size for b in batches]}->{new_bs} reqs:{[len(b) for b in batches]} offsets:{offsets} padding:{padding} moves_needed:{moves_needed} inplace:{inplace}')
+        dbg_trace(
+            scenario, f'bs:{[b.batch_size for b in batches]}->{new_bs} reqs:{[len(b) for b in batches]} offsets:{offsets} padding:{padding} moves_needed:{moves_needed} inplace:{inplace}')

         grouped_requests = [[req for req in batch.requests] for batch in batches]
         flat_requests = list(itertools.chain(*grouped_requests))
@@ -235,12 +245,9 @@ class CausalLMBatch(Batch):
         else:
             free_indices = itertools.count(0)

-        to_tensors = lambda ind: (torch.tensor(ind[0], device=device), torch.tensor(ind[1], device=device))
-        indices = [[to_tensors(req.update_idx(next(free_indices))) for req in batch_reqs] for batch_reqs in grouped_requests]
-
-        max_seq_len = batches[0].attention_mask.size(1)
-        input_length = max_input_length
-        right_padding = max_seq_len - input_length
+        def to_tensors(ind): return (torch.tensor(ind[0], device=device), torch.tensor(ind[1], device=device))
+        indices = [[to_tensors(req.update_idx(next(free_indices))) for req in batch_reqs]
+                   for batch_reqs in grouped_requests]

         chunk_size = batches[0].past_key_values[0][0].size(0) // batches[0].batch_size
         num_layers = len(batches[0].past_key_values)
@@ -257,9 +264,14 @@ class CausalLMBatch(Batch):
         for b in batches:
             b.past_key_values = list(b.past_key_values)

+        # For prefill there is a space allocated only for first token
+        # Need to add padding to the max total tokens before first decode
+        paddings = [(batch.input_length + batch.right_padding) - batch.seq_length for batch in batches]
+
         src = [b.input_ids for b in batches]
         for b in batches:
             del b.input_ids
+        src = pad_tensors(src, paddings, seq_dim, pad_token_id)
         src = shift_all(src, seq_dim, offsets)
         input_ids = prepare_memory(new_bs, src[target_batch_idx], inplace)
         input_ids = move_data(input_ids, 1, indices, src)
@@ -267,6 +279,7 @@ class CausalLMBatch(Batch):
         src = [b.attention_mask for b in batches]
         for b in batches:
             del b.attention_mask
+        src = pad_tensors(src, paddings, seq_dim, 0)
         src = shift_all(src, seq_dim, offsets)
         attention_mask = prepare_memory(new_bs, src[target_batch_idx], inplace)
         attention_mask = move_data(attention_mask, 1, indices, src)
@@ -281,11 +294,13 @@ class CausalLMBatch(Batch):
         past_key_values = []
         for layer_num in range(num_layers):
             src = [b.past_key_values[layer_num][0] for b in batches]
+            src = pad_tensors(src, paddings, key_dim, 0)
             src = shift_all(src, key_dim, offsets)
             updated_key = prepare_memory(new_bs * chunk_size, src[target_batch_idx], inplace)
             updated_key = move_data(updated_key, chunk_size, indices, src)

             src = [b.past_key_values[layer_num][1] for b in batches]
+            src = pad_tensors(src, paddings, value_dim, 0)
             src = shift_all(src, value_dim, offsets)
             updated_value = prepare_memory(new_bs * chunk_size, src[target_batch_idx], inplace)
             updated_value = move_data(updated_value, chunk_size, indices, src)
@@ -304,6 +319,10 @@ class CausalLMBatch(Batch):
             batches[0].next_token_chooser.device
         )

+        max_seq_len = attention_mask.size(1)
+        input_length = max_input_length
+        right_padding = max_seq_len - input_length
+
         htorch.core.mark_step()

         return cls(
@@ -320,7 +339,6 @@ class CausalLMBatch(Batch):
             right_padding=right_padding
         )

-
     @classmethod
     def from_pb(
         cls,
@@ -378,12 +396,16 @@ class CausalLMBatch(Batch):
         attention_mask = tokenized_inputs["attention_mask"]

         if is_optimized_for_gaudi:
+            # Allocate space for first token
             input_ids = torch.nn.functional.pad(
-                input_ids, (0, max_new_tokens + extra_padding), value=tokenizer.pad_token_id
+                input_ids, (0, 1), value=tokenizer.pad_token_id
             )
             attention_mask = torch.nn.functional.pad(
-                attention_mask, (0, max_new_tokens + extra_padding), value=0)
-            all_input_ids = input_ids.T.split(1, dim=1)
+                attention_mask, (0, 1), value=0
+            )
+            all_input_ids = torch.nn.functional.pad(
+                input_ids, (0, max_new_tokens + extra_padding - 1), value=tokenizer.pad_token_id
+            ).T.split(1, dim=1)
         else:
             all_input_ids = input_ids.clone().T.split(1, dim=1)

@@ -412,7 +434,7 @@ class CausalLMBatch(Batch):
         )

     @tracer.start_as_current_span("filter")
-    def filter(self, request_ids: List[int], is_optimized_for_gaudi: bool = False) -> Optional["CausalLMBatch"]:
+    def filter(self, request_ids: List[int]) -> Optional["CausalLMBatch"]:
         dbg_trace('FILTER', f'num_reqs:{len(self.requests)} -> {len(request_ids)}')
         request_ids = set(request_ids)
         self.requests = [req for req in self.requests if req.data.id in request_ids]
@@ -420,8 +442,8 @@ class CausalLMBatch(Batch):
     @classmethod
     @tracer.start_as_current_span("concatenate")
-    def concatenate(cls, batches: List["CausalLMBatch"], is_optimized_for_gaudi: bool = False) -> "CausalLMBatch":
-        return cls.recombine(batches, is_optimized_for_gaudi)
+    def concatenate(cls, batches: List["CausalLMBatch"], pad_token_id: int = 0) -> "CausalLMBatch":
+        return cls.recombine(batches, pad_token_id)

     def __len__(self):
         return len(self.requests)

@@ -517,7 +539,6 @@ class CausalLM(Model):
            ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size}
            ds_inference_kwargs["enable_cuda_graph"] = False

-
            if load_to_meta:
                # model loaded to meta is managed differently
                checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
@@ -537,7 +558,7 @@ class CausalLM(Model):
                torch_dtype=dtype,
            )
            model = model.eval().to(device)
-           #wrap in hpu_graph only if self.enable_hpu_graph is set
+           # wrap in hpu_graph only if self.enable_hpu_graph is set
            model = remove_kv_cache_from_output(model)
            if self.enable_hpu_graph:
                model = wrap_in_hpu_graph(model, disable_tensor_cache=True)
@@ -608,7 +629,6 @@ class CausalLM(Model):
         else:
             return super().decode_token(all_input_ids, prefix_offset, read_offset)

-
     def forward(
         self,
         input_ids,
@@ -646,10 +666,11 @@ class CausalLM(Model):
         prefill = batch.past_key_values is None
         # Check if we need to do any bookkeeping first
         if not prefill:
-            batch = batch.__class__.recombine([batch], self.is_optimized_for_gaudi)
+            batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id)

         scenario = 'PREFILL' if prefill else 'GENERATE'
-        dbg_trace(scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}')
+        dbg_trace(
+            scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}')
         assert batch.right_padding > 0, 'No more room for next token!'
         self.step = self.step + 1
         if self.hb_profer_started == True and self.step > self.profiling_warmup_steps + self.profiling_steps:
@@ -657,16 +678,20 @@ class CausalLM(Model):
             self.hb_profer_started = False

         if self.is_optimized_for_gaudi:
-            token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
+            if prefill:
+                # no right padding for prefill
+                token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device)
+            else:
+                token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device)
             attention_mask = batch.attention_mask
         else:
             token_idx = None
             # slice the attention mask to the correct shape
             # TODO fix me!
             attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
-        if batch.past_key_values:
-            if token_idx is not None:
-                input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)
+
+        if not prefill and token_idx is not None:
+            input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1)
         else:
             input_ids = batch.input_ids

@@ -677,7 +702,7 @@ class CausalLM(Model):
                 batch.position_ids,
                 token_idx,
                 batch.past_key_values,
-                bypass_hpu_graph = prefill and self.limit_hpu_graph if self.enable_hpu_graph else None
+                bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None
             )
         else:
             logits = self.forward(
@@ -686,7 +711,7 @@ class CausalLM(Model):
                 batch.position_ids,
                 token_idx,
                 batch.past_key_values,
-                bypass_hpu_graph = prefill and self.limit_hpu_graph if self.enable_hpu_graph else None
+                bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None
             )

         # Results
@@ -697,7 +722,7 @@ class CausalLM(Model):
         input_length = batch.input_length
         if self.is_optimized_for_gaudi and logits.shape[-2] > 1:
             next_token_ids, next_token_logprobs, logprobs = batch.next_token_chooser(
-                batch.input_ids[:, :token_idx], logits[:, input_length - 1 : input_length, :].squeeze(-2)
+                batch.input_ids[:, :token_idx], logits[:, input_length - 1: input_length, :].squeeze(-2)
             )
         else:
             next_token_ids, next_token_logprobs, logprobs = batch.next_token_chooser(
@@ -757,7 +782,7 @@ class CausalLM(Model):
             if stop:
                 # Decode generated tokens
                 output_text = self.decode(
-                    all_input_ids[new_input_length - stopping_criteria.current_tokens : new_input_length, 0]
+                    all_input_ids[new_input_length - stopping_criteria.current_tokens: new_input_length, 0]
                 )
                 generated_text = GeneratedText(
                     output_text,
@@ -772,7 +797,7 @@ class CausalLM(Model):
             if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                 # Remove generated token to only have prefill and add nan for first prompt token
                 prefill_logprobs = [float("nan")] + next_token_logprobs
-                prefill_token_ids = all_input_ids[0 : new_input_length - 1]
+                prefill_token_ids = all_input_ids[0: new_input_length - 1]
                 prefill_texts = self.tokenizer.batch_decode(
                     prefill_token_ids,
                     clean_up_tokenization_spaces=False,
@@ -846,7 +871,7 @@ class CausalLM(Model):
        # Update position_ids
        if prefill:
-           batch.position_ids = batch.position_ids[:, token_idx - 1 : token_idx] + 1
+           batch.position_ids = batch.position_ids[:, token_idx - 1: token_idx] + 1
        else:
            batch.position_ids += 1

        # Update past key values
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 1e17784e..e54f4610 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -59,7 +59,7 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
                                        {"util": len(batch.requests)}):
            if batch is None:
                raise ValueError(f"Batch ID {request.batch_id} not found in cache.")
-           filtered_batch = batch.filter(request.request_ids, self.model.is_optimized_for_gaudi)
+           filtered_batch = batch.filter(request.request_ids)
            self.cache.set(filtered_batch)

            return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
@@ -113,7 +113,7 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):

        if len(batches) > 1:
            with self.profiler.record_event("internal", "concatenate"):
-               batch = self.model.batch_type.concatenate(batches, self.model.is_optimized_for_gaudi)
+               batch = self.model.batch_type.concatenate(batches, self.model.tokenizer.pad_token_id)
        else:
            batch = batches[0]
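
Illustration (not part of the patch): with this change, prefill allocates only one extra slot for the first generated token, and recombine later pads every batch tensor out to input_length + right_padding with pad_tensors before the first decode. The sketch below shows that padding arithmetic on a dummy input_ids tensor; all shapes and values are made-up assumptions, and this standalone pad_tensors copy drops the Habana-specific htorch.core.mark_step() call and the KV-cache (dim=-2) handling of the real helper.

import torch
import torch.nn.functional as F


def pad_tensors(tensors, paddings, dim, value):
    # Right-pad each tensor along `dim` by its per-batch padding amount,
    # mirroring the helper added in causal_lm.py (minus htorch.core.mark_step()).
    for i, (tensor, padding) in enumerate(zip(tensors, paddings)):
        if padding > 0:
            pad_shape = (0, 0, 0, padding) if dim == -2 else (0, padding)
            tensors[i] = F.pad(tensor, pad_shape, value=value)
    return tensors


if __name__ == "__main__":
    pad_token_id = 0
    input_length = 4      # real prompt tokens (illustrative)
    seq_length = 5        # prompt + the single slot allocated at prefill for the first token
    right_padding = 7     # room reserved for the remaining max_new_tokens (illustrative)
    input_ids = torch.arange(1, seq_length + 1).unsqueeze(0)   # shape [1, 5]

    # Padding needed to reach the full allocation before the first decode step.
    padding = (input_length + right_padding) - seq_length      # 11 - 5 = 6
    (input_ids,) = pad_tensors([input_ids], [padding], dim=-1, value=pad_token_id)
    print(input_ids.shape)  # torch.Size([1, 11])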