diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json index 26224118..8548e376 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.0234375, "text": " ge" }, { "id": 21017, - "logprob": -8.8515625, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21875, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2773438, + "logprob": -2.1972656, "text": "mean" }, { "id": 26, - "logprob": -0.25195312, + "logprob": -0.2998047, "text": "(" }, { "id": 62, - "logprob": -4.8203125, + "logprob": -5.6445312, "text": "L" }, { "id": 44, - "logprob": -3.7734375, + "logprob": -3.0839844, "text": ":" }, { "id": 1682, - "logprob": -0.8310547, + "logprob": -0.6748047, "text": " List" }, { "id": 77, - "logprob": -0.22766113, + "logprob": -0.3864746, "text": "[" }, { "id": 1808, - "logprob": -0.46240234, + "logprob": -0.9355469, "text": "float" }, { "id": 10794, - "logprob": -3.0234375, + "logprob": -2.5371094, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": -0.04626465, + "logprob": -1.1679688, "special": false, "text": "\n " }, diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json index 015912f8..a6b80534 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json +++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.015625, "text": " ge" }, { "id": 21017, - "logprob": -8.859375, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21984863, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2861328, + "logprob": -2.2304688, "text": "mean" }, { "id": 26, - "logprob": -0.25219727, + "logprob": -0.29760742, "text": "(" }, { "id": 62, - "logprob": -4.8007812, + "logprob": -5.6796875, "text": "L" }, { "id": 44, - "logprob": -3.7949219, + "logprob": -3.0742188, "text": ":" }, { "id": 1682, - "logprob": -0.8046875, + "logprob": -0.67626953, "text": " List" }, { "id": 77, - "logprob": -0.22424316, + "logprob": -0.38842773, "text": "[" }, { "id": 1808, - "logprob": -0.46191406, + "logprob": -0.9165039, "text": "float" }, { "id": 10794, - "logprob": -3.0253906, + "logprob": -2.5527344, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": 0.0, + "logprob": -0.048583984, "special": false, "text": "\n " }, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index bce459e3..b1270b44 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1601,8 +1601,6 @@ class FlashCausalLM(Model): max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices - print(slots) - if cu_seqlen_prefill is None and self.max_past() is not None: # In decode, not prefill, we're actually overwriting the KV-cache # in a circular buffer mode.