From 7ab99bc6b3ae44362658de6f4eaa41c8861f4c8b Mon Sep 17 00:00:00 2001 From: drbh Date: Wed, 22 Jan 2025 20:51:20 +0000 Subject: [PATCH] feat: refactor position ids in warmup and bump tests --- ...essed_tensors_w8a8_int_dynamic_weight.json | 422 +++++++++++++++++- ...rs_w8a8_int_dynamic_weight_all_params.json | 78 ++-- ..._tensors_w8a8_int_dynamic_weight_load.json | 80 ++-- ...pressed_tensors_w8a8_int_dynamic_weight.py | 9 +- .../models/flash_causal_lm.py | 20 +- 5 files changed, 499 insertions(+), 110 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 2525f72c..7dbfc627 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,73 +1,469 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, + "finish_reason": "eos_token", + "generated_tokens": 76, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.8769531, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16027832, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2294922, "special": false, "text": " uses" + }, + { + "id": 29728, + "logprob": -0.66503906, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": -0.02960205, + "special": false, + "text": " networks" + }, + { + "id": 311, + "logprob": -0.7236328, + "special": false, + "text": " to" + }, + { + "id": 3960, + "logprob": -1.1914062, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.7089844, + "special": false, + "text": " from" + }, + { + "id": 821, + "logprob": -0.7729492, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.7836914, + "special": false, + "text": "." 
+ }, + { + "id": 1084, + "logprob": -0.9941406, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.52441406, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.9511719, + "special": false, + "text": " a" + }, + { + "id": 943, + "logprob": -0.8642578, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00030231476, + "special": false, + "text": " of" + }, + { + "id": 20443, + "logprob": -0.14416504, + "special": false, + "text": " artificial" + }, + { + "id": 11229, + "logprob": -0.013824463, + "special": false, + "text": " intelligence" + }, + { + "id": 429, + "logprob": -0.18762207, + "special": false, + "text": " that" + }, + { + "id": 646, + "logprob": -1.0087891, + "special": false, + "text": " can" + }, + { + "id": 3960, + "logprob": -0.90234375, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.54345703, + "special": false, + "text": " from" + }, + { + "id": 323, + "logprob": -1.0400391, + "special": false, + "text": " and" + }, + { + "id": 1281, + "logprob": -0.072509766, + "special": false, + "text": " make" + }, + { + "id": 19898, + "logprob": -0.16516113, + "special": false, + "text": " predictions" + }, + { + "id": 389, + "logprob": -0.4416504, + "special": false, + "text": " on" + }, + { + "id": 3460, + "logprob": -0.5385742, + "special": false, + "text": " large" + }, + { + "id": 14713, + "logprob": -0.4387207, + "special": false, + "text": " amounts" + }, + { + "id": 315, + "logprob": -0.00015091896, + "special": false, + "text": " of" + }, + { + "id": 821, + "logprob": -0.061431885, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.71875, + "special": false, + "text": "." + }, + { + "id": 18183, + "logprob": -0.23632812, + "special": false, + "text": " Deep" + }, + { + "id": 6832, + "logprob": -0.0017204285, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -1.1738281, + "special": false, + "text": " is" + }, + { + "id": 1483, + "logprob": -0.61083984, + "special": false, + "text": " used" + }, + { + "id": 304, + "logprob": -0.035003662, + "special": false, + "text": " in" + }, + { + "id": 264, + "logprob": -0.118652344, + "special": false, + "text": " a" + }, + { + "id": 8045, + "logprob": -0.42016602, + "special": false, + "text": " variety" + }, + { + "id": 315, + "logprob": -1.6212463e-05, + "special": false, + "text": " of" + }, + { + "id": 8357, + "logprob": -0.1315918, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.12915039, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.12463379, + "special": false, + "text": " including" + }, + { + "id": 2168, + "logprob": -0.37402344, + "special": false, + "text": " image" + }, + { + "id": 323, + "logprob": -0.1451416, + "special": false, + "text": " and" + }, + { + "id": 8806, + "logprob": -0.028869629, + "special": false, + "text": " speech" + }, + { + "id": 17843, + "logprob": -0.00024068356, + "special": false, + "text": " recognition" + }, + { + "id": 11, + "logprob": -0.00031018257, + "special": false, + "text": "," + }, + { + "id": 5810, + "logprob": -0.019821167, + "special": false, + "text": " natural" + }, + { + "id": 4128, + "logprob": -0.00012528896, + "special": false, + "text": " language" + }, + { + "id": 8692, + "logprob": -0.00089263916, + "special": false, + "text": " processing" + }, + { + "id": 11, + "logprob": -0.00073862076, + "special": false, + "text": "," + }, + { + "id": 323, + "logprob": 
-0.040161133, + "special": false, + "text": " and" + }, + { + "id": 38193, + "logprob": -0.4519043, + "special": false, + "text": " autonomous" + }, + { + "id": 11474, + "logprob": -0.39941406, + "special": false, + "text": " vehicles" + }, + { + "id": 13, + "logprob": -0.21166992, + "special": false, + "text": "." + }, + { + "id": 1084, + "logprob": -0.9082031, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.44213867, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -1.2177734, + "special": false, + "text": " a" + }, + { + "id": 18512, + "logprob": -0.5205078, + "special": false, + "text": " rapidly" + }, + { + "id": 7826, + "logprob": -0.15332031, + "special": false, + "text": " growing" + }, + { + "id": 2070, + "logprob": -0.0039978027, + "special": false, + "text": " field" + }, + { + "id": 448, + "logprob": -0.9091797, + "special": false, + "text": " with" + }, + { + "id": 1657, + "logprob": -0.17114258, + "special": false, + "text": " many" + }, + { + "id": 4650, + "logprob": -0.70703125, + "special": false, + "text": " potential" + }, + { + "id": 8357, + "logprob": -0.025131226, + "special": false, + "text": " applications" + }, + { + "id": 304, + "logprob": -0.6699219, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.35205078, + "special": false, + "text": " the" + }, + { + "id": 3853, + "logprob": -0.049194336, + "special": false, + "text": " future" + }, + { + "id": 13, + "logprob": -0.21972656, + "special": false, + "text": "." + }, + { + "id": 151643, + "logprob": -2.0019531, + "special": true, + "text": "<|endoftext|>" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses" + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 6b3f5092..2c840e67 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -7,67 +7,67 @@ "seed": 0, "tokens": [ { - "id": 1939, - "logprob": -2.2460938, + "id": 5267, + "logprob": -1.1464844, "special": false, - "text": "?\n\n" + "text": "?\n" }, { "id": 33464, - "logprob": 0.0, + "logprob": -0.83203125, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.48608398, + "logprob": -0.5625, "special": false, "text": " Learning" }, - { - "id": 4102, - "logprob": -2.265625, - "special": false, - "text": " " - }, - { - "id": 285, - "logprob": 0.0, - "special": false, - "text": "is" - }, - { - "id": 458, - "logprob": -0.6328125, - "special": false, - "text": " an" - }, - { - "id": 20443, - "logprob": -0.1796875, - "special": false, - "text": " artificial" - }, - { - "id": 11229, - "logprob": 0.0, - "special": false, - "text": " intelligence" - }, { "id": 320, - "logprob": -0.37695312, + "logprob": -2.1464844, "special": false, "text": " (" }, { - "id": 15469, + "id": 16524, "logprob": 0.0, "special": false, - "text": "AI" + "text": "DL" + }, + { + "id": 701, + "logprob": -2.2089844, + "special": false, + "text": ")," + }, + { + "id": 476, + "logprob": -0.27368164, + "special": false, + "text": " or" + }, + { + "id": 20443, + "logprob": -0.09442139, + "special": false, + "text": " artificial" + }, + { + "id": 29728, + "logprob": 0.0, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": 0.0, + "special": false, + "text": " networks" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index 1fa4e33a..aee5698b 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, 
"special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, 
"text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index a0b0416b..17e12c22 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( ): response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", - max_new_tokens=10, + # prefer a longer response than the default, allow the llm to end generation + max_new_tokens=1000, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses" + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." ) - assert response.details.generated_tokens == 10 + assert response.details.generated_tokens == 76 assert response == response_snapshot @@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" ) assert response == response_snapshot diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 6bc3c2ca..a7d7f711 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1400,7 +1400,11 @@ class FlashCausalLM(Model): cache_lengths = [0] * bs if max_bs is None: input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + if hasattr(self.model, "get_position_ids"): + # use model specific position ids for initialization + position_ids = self.model.get_position_ids(input_ids) + else: + position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s @@ -1427,7 +1431,7 @@ class FlashCausalLM(Model): "Cuda graphs should be generated in decreasing order size to reduce VRAM usage" ) input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs] - position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs] + position_ids = self.cuda_graphs[max_bs]["position_ids"][..., :bs] if ATTENTION == "flashinfer": block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt] else: @@ -1456,14 +1460,6 @@ class FlashCausalLM(Model): else: state = None - if ( - hasattr(self.model, "config") - and hasattr(self.model.config, "model_type") - and self.model.config.model_type == "qwen2_vl" - ): - if position_ids.dim() == 1: - position_ids = self.model.get_position_ids(input_ids) - graph = torch.cuda.CUDAGraph() self.cuda_graphs[bs] = { "input_ids": input_ids, 
@@ -1486,10 +1482,6 @@ class FlashCausalLM(Model): state=state, cache_lengths_tensor=cache_lengths_tensor, ): - # in the case of N dimensional position ids we need to slice the - # position ids to match the input_ids size for cuda graphs warmup - position_ids = position_ids[..., : input_ids.shape[0]] - seqlen = Seqlen( input_lengths=input_lengths_tensor, cache_lengths=cache_lengths_tensor,
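
Note on the warmup change above: the qwen2_vl-specific branch is replaced by a generic hook. During CUDA graph warmup, if the model exposes get_position_ids, that method supplies the (possibly multi-dimensional) position ids up front, and tensors reused from larger graphs are sliced with [..., :bs] so any leading rope dimensions are preserved. Below is a minimal, illustrative sketch of such a hook and the slicing behaviour; the DummyMRopeModel class, the (3, seq_len) shape, and the method body are assumptions for illustration, not taken from this patch.

    import torch


    class DummyMRopeModel(torch.nn.Module):
        """Illustrative model exposing the get_position_ids hook assumed above."""

        def get_position_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
            # Assumed mrope-style layout: one row per rope section
            # (e.g. temporal/height/width), i.e. shape (3, seq_len)
            # instead of the usual (seq_len,).
            seq_len = input_ids.shape[0]
            pos = torch.arange(seq_len, dtype=torch.int32, device=input_ids.device)
            return pos.unsqueeze(0).expand(3, seq_len).contiguous()


    # Warmup-time usage mirroring the patched flash_causal_lm.py logic:
    model = DummyMRopeModel()
    bs = 4
    input_ids = torch.zeros(bs, dtype=torch.int64)
    if hasattr(model, "get_position_ids"):
        position_ids = model.get_position_ids(input_ids)  # shape (3, bs) here
    else:
        position_ids = torch.zeros(bs, dtype=torch.int32)

    # Slicing with [..., :bs] keeps any leading rope dimensions intact,
    # so the same expression works for both 1-D and N-D position ids.
    smaller = position_ids[..., :2]
    assert smaller.shape[-1] == 2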