Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-19 22:02:06 +00:00)
Improve qwen vl impl (#2943)
* feat: refactor model, improve startup and re enable tests
* fix: improve multimodal rotary embed caching
* fix: limit vision flop calc to qwen2 vl models and update config typing
* fix: include clippy lint
* feat: refactor position ids in warmup and bump tests
* fix: prefer default dtype
* fix: enable all cuda graphs and bump snapshots
* fix: adjust rotaty init path
* fix: simplify get position ids and remove usused vision config
* fix: update position ids so first dim is batch, simplify rotary and bump vlm default token limit
* fix: improve position id init during cuda warmup for mrope and simplfy rotary forward
* fix: check existance before accessing rope type in cuda warmup
* fix: check key before access
* fix: improve mrope check in cuda graph warmup
* fix: remove check for default rope type
* fix: add more test and improve model generation
* fix: improve and simplify get_cos_sin, refactors and cleanup get_position_ids
* fix: adjust signatures with types
This commit is contained in:
parent dd2bd5fdb3
commit c1cf36c0dc
@@ -1,73 +1,469 @@
 {
   "details": {
     "best_of_sequences": null,
-    "finish_reason": "length",
+    "finish_reason": "eos_token",
-    "generated_tokens": 10,
+    "generated_tokens": 76,
     "prefill": [],
     "seed": null,
     "tokens": [
       {
         "id": 18183,
-        "logprob": -1.6669922,
+        "logprob": -1.5195312,
         "special": false,
         "text": " Deep"
       },
       ... (the remaining per-token entries follow the same pattern: the logprobs of the other nine original tokens, " learning is a subset of machine learning that uses", shift slightly, and 66 new entries are appended for the longer generation, ending with the special "<|endoftext|>" token, id 151643, logprob -2.0019531) ...
     ],
     "top_tokens": null
   },
-  "generated_text": " Deep learning is a subset of machine learning that uses"
+  "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
 }
@@ -7,67 +7,67 @@
     "seed": 0,
     "tokens": [
      (each entry keeps the fields "id", "logprob", "special": false and "text"; they are condensed here to text, id, logprob)
-      "?\n\n" 1939 -2.2460938, "Deep" 33464 0.0, " Learning" 20909 -0.48608398, " " 4102 -2.265625, "is" 285 0.0, " an" 458 -0.6328125, " artificial" 20443 -0.1796875, " intelligence" 11229 0.0, " (" 320 -0.37695312, "AI" 15469 0.0
+      "?\n" 5267 -1.1464844, "Deep" 33464 -0.83203125, " Learning" 20909 -0.5625, " (" 320 -2.1464844, "DL" 16524 0.0, ")," 701 -2.2089844, " or" 476 -0.27368164, " artificial" 20443 -0.09442139, " neural" 29728 0.0, " networks" 14155 0.0
     ],
     "top_tokens": null
   },
-  "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI"
+  "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
 }
@@ -9,61 +9,61 @@
     "tokens": [
      (only the logprob of each entry changes, shown here as old -> new)
       { "id": 18183, "logprob": -1.4912109 -> -1.5195312, "special": false, "text": " Deep" },
       { "id": 6832, "logprob": -0.075683594 -> -0.06817627, "special": false, "text": " learning" },
       { "id": 374, "logprob": -0.12408447 -> -0.13122559, "special": false, "text": " is" },
       { "id": 264, "logprob": -0.12768555 -> -0.13415527, "special": false, "text": " a" },
       { "id": 25993, "logprob": -0.82128906 -> -0.87353516, "special": false, "text": " subset" },
       { "id": 315, "logprob": -0.0012636185 -> -0.0011396408, "special": false, "text": " of" },
       { "id": 5662, "logprob": -0.12878418 -> -0.16442871, "special": false, "text": " machine" },
       { "id": 6832, "logprob": -0.0015888214 -> -0.0026416779, "special": false, "text": " learning" },
       { "id": 429, "logprob": -0.49194336 -> -0.48754883, "special": false, "text": " that" },
       { "id": 5711, "logprob": -1.2626953 -> -1.2294922, "special": false, "text": " uses" }
     ]
(the same changes are repeated in the hunks @@ -82,61 +82,61 @@, @@ -155,61 +155,61 @@ and @@ -228,61 +228,61 @@ for the other sequences in this snapshot)
@@ -0,0 +1,26 @@ (new file)
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image showcases a stunning cityscape, featuring the iconic Statue of Liberty in the foreground. The image displays Lady Liberty's imposing presence, with her towering base standing beside her. Behind the statue, the city's skyline extends across the horizon, adorned with numerous tall buildings, including the Empire State Building and other notable skyscrapers. The water reflecting the sun's rays creates a serene and picturesque scene, emphasizing the beauty and resilience of this global landmark. The sky is a clear, pale blue, adding to the overall tranquility of the scene.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348090,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 110,
    "prompt_tokens": 8736,
    "total_tokens": 8846
  }
}
@@ -0,0 +1,26 @@ (new file)
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image shows a stylized scene set in what appears to be a diner or restaurant. In the foreground, there is a table with various food items, including a burger with lettuce and tomato, a bowl of fries, and a drink in a cup with a straw. On the right side of the table, there is an owl sitting alertly, looking directly at the camera. Behind the owl and the table, there is a large, green, dinosaur-like creature resembling Godzilla, with its mouth open and tongue visible. In the background, the diner's decor includes various signs and posters, with a green sign reading \"Basta\" and another sign that says \"Tabasco.\" The setting has a retro or vintage feel, with fluorescent lighting overhead and clean, polished surfaces.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348100,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 156,
    "prompt_tokens": 5375,
    "total_tokens": 5531
  }
}
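Both snapshots are ordinary OpenAI-style chat.completion payloads returned by TGI's OpenAI-compatible endpoint. A minimal sketch of reproducing the first one, assuming a TGI instance serving Qwen/Qwen2-VL-7B-Instruct is already running locally (the base URL, port and api_key placeholder are assumptions, not part of this commit):

# Sketch only: assumes a local TGI server on port 8080 exposing the /v1 chat API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    seed=42,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                    },
                },
                {"type": "text", "text": "Describe the image"},
            ],
        },
    ],
)
print(response.choices[0].message.content)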
@@ -5,7 +5,7 @@
       "index": 0,
       "logprobs": null,
       "message": {
-        "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.",
+        "content": "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character.",
         "name": null,
         "role": "assistant",
         "tool_calls": null
@@ -13,14 +13,14 @@
       "usage": null
     }
   ],
-  "created": 1730164250,
+  "created": 1738347908,
   "id": "",
   "model": "Qwen/Qwen2-VL-7B-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.1.1-dev0-native",
   "usage": {
-    "completion_tokens": 58,
+    "completion_tokens": 89,
-    "prompt_tokens": 349,
+    "prompt_tokens": 1364,
-    "total_tokens": 407
+    "total_tokens": 1453
   }
 }
@@ -11,10 +11,10 @@
       "logprobs": null
     }
   ],
-  "created": 1730416361,
+  "created": 1737646031,
   "id": "",
   "model": "Qwen/Qwen2-VL-7B-Instruct",
   "object": "chat.completion.chunk",
-  "system_fingerprint": "2.4.2-dev0-native",
+  "system_fingerprint": "3.0.2-dev0-native",
   "usage": null
 }
@@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight(
 ):
     response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
         "What is deep learning?",
-        max_new_tokens=10,
+        # prefer a longer response than the default, allow the llm to end generation
+        max_new_tokens=1000,
         decoder_input_details=True,
     )

     assert (
         response.generated_text
-        == " Deep learning is a subset of machine learning that uses"
+        == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
     )
-    assert response.details.generated_tokens == 10
+    assert response.details.generated_tokens == 76
     assert response == response_snapshot


@@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
     assert response.details.generated_tokens == 10
     assert (
         response.generated_text
-        == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI"
+        == "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
     )
     assert response == response_snapshot
@@ -1,81 +1,122 @@
(The old version of this file was entirely commented out behind "# Disabled because it's broken."; that disabled code, which passed max_tokens=100 and expected 58 streaming chunks with the earlier "futuristic spacesuit ... lunar landscape" description, is removed. The re-enabled tests follow, plus two new cases, test_flash_qwen2_vl_bay and test_flash_qwen2_vl_inpaint:)

import pytest


@pytest.fixture(scope="module")
def flash_qwen2_vl_handle(launcher):
    with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2(flash_qwen2_vl_handle):
    await flash_qwen2_vl_handle.health(300)
    return flash_qwen2_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
    )

    assert (
        response.choices[0].message.content
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )

    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
    responses = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
        stream=True,
    )

    count = 0
    generated = ""
    last_response = None
    async for response in responses:
        count += 1
        generated += response.choices[0].delta.content
        last_response = response

    assert (
        generated
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )
    assert count == 89
    assert last_response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot
@@ -2049,7 +2049,16 @@ fn main() -> Result<(), LauncherError> {
         None => {
             let compute_type = compute_type(num_shard);
             let compute_optimal = compute_optimal(config.as_ref(), compute_type.as_ref());
-            let default = compute_optimal.unwrap_or(4096);
+            // TODO: remove this when we correctly esimate the flops for VLMs
+            // this is a short term temporary fix to enable vlms to avoid rejecting images
+            let default_optimal = match config {
+                Some(ref config) => match config.model_type.as_deref() {
+                    Some("qwen2_vl") => 10_000,
+                    _ => 4096,
+                },
+                None => 4096,
+            };
+            let default = compute_optimal.unwrap_or(default_optimal);
             let vram_maximum = vram_maximum(
                 config.as_ref(),
                 compute_type.as_ref(),
@@ -86,11 +86,18 @@ class PositionRotaryEmbedding(nn.Module):
         # `rope_type` is now standard in transformers, but some existing models
         # have `type` instead.
         rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
+        mrope_section = rope_scaling.get("mrope_section", None)

         if rope_type == "linear":
             pass
         elif rope_type == "default":
             pass
+        elif rope_type == "mrope":
+            mrope_section = rope_scaling["mrope_section"]
+            if mrope_section is not None:
+                return RotaryPositionEmbeddingMultimodalSections(
+                    inv_freq, scaling_factor, mrope_section
+                )
         elif rope_type == "dynamic":
             scaling_factor = rope_scaling["factor"]
             return DynamicPositionRotaryEmbedding(

@@ -548,3 +555,66 @@ def apply_llama3_scaling(
         new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)

     return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)

(new class appended at the end of the file:)

class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sections: list):
        super().__init__(inv_freq, scaling_factor)
        self.sections = sections
        self._cos_cached = None
        self._sin_cached = None
        self.section_indices = (
            torch.arange(len(self.sections))
            .repeat_interleave(torch.tensor(self.sections))
            .view(1, 1, -1)
            .to(inv_freq.device)
        )

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # rotate half the sequence length
        rot = cos.shape[-1] // 2
        q2 = torch.cat([-query[..., rot:], query[..., :rot]], dim=-1)
        k2 = torch.cat([-key[..., rot:], key[..., :rot]], dim=-1)

        # apply the rotation
        rotary_emb.apply_rotary(query, q2, cos, sin, query, q2, True)
        rotary_emb.apply_rotary(key, k2, cos, sin, key, k2, True)

    def _update_cos_sin_cache(
        self, dtype: torch.dtype, device: torch.device, seqlen: int
    ):
        # always cache the cos/sin for the full sequence length to avoid
        # recomputing if the sequence length is smaller than the cached one
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
            self._sections = self.section_indices.expand(seqlen, -1, -1)

    def get_cos_sin(
        self,
        position_ids: torch.Tensor,
        max_s: int,
        dtype: torch.dtype,
    ):
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        slen = position_ids.shape[0]

        cos = self._cos_cached[position_ids].gather(1, self._sections[:slen])
        sin = self._sin_cached[position_ids].gather(1, self._sections[:slen])

        cos = torch.cat([cos, cos], dim=-1)
        sin = torch.cat([sin, sin], dim=-1)
        return cos, sin
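The new class gathers different blocks of rotary frequency dimensions from different rows of the 3-component (temporal, height, width) position ids. A minimal standalone sketch of that gather, with invented section sizes and a toy token layout (none of these numbers come from the commit, and the real code additionally concatenates the result to cover the full rotary dimension):

import torch

# Hypothetical sizes for illustration only: 3 sections of 2 frequency dims each.
sections = [2, 2, 2]                  # stand-in for mrope_section from the config
dim = sum(sections)                   # half of the rotary head dim in the real model
inv_freq = 1.0 / (10000 ** (torch.arange(dim) / dim))

seqlen = 8
t = torch.arange(seqlen, dtype=inv_freq.dtype)
freqs = torch.outer(t, inv_freq)      # (seqlen, dim)
cos_table, sin_table = freqs.cos(), freqs.sin()

# One 3-component position id per token: (temporal, height, width).
# Here: 2 text tokens, a 1x2x2 image patch grid, then 2 more text tokens.
position_ids = torch.tensor(
    [[0, 0, 0], [1, 1, 1],                          # text advances all components
     [2, 2, 2], [2, 2, 3], [2, 3, 2], [2, 3, 3],    # image: t fixed, h/w vary
     [4, 4, 4], [5, 5, 5]]                          # trailing text
)

# Map each frequency dim to the position component (section) it reads from.
section_indices = torch.arange(len(sections)).repeat_interleave(torch.tensor(sections))
section_indices = section_indices.view(1, 1, -1).expand(seqlen, 1, -1)  # (seqlen, 1, dim)

# Index the tables with all three components, then keep, per frequency dim,
# only the component that belongs to its section.
cos = cos_table[position_ids].gather(1, section_indices).squeeze(1)  # (seqlen, dim)
sin = sin_table[position_ids].gather(1, section_indices).squeeze(1)
print(cos.shape, sin.shape)  # torch.Size([8, 6]) torch.Size([8, 6])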
@@ -1363,6 +1363,7 @@ def get_model(
             quantize=quantize,
             speculator=speculator,
             dtype=dtype,
+            default_dtype=torch.bfloat16,
             kv_cache_dtype=kv_cache_dtype,
             trust_remote_code=trust_remote_code,
             lora_adapter_ids=lora_adapter_ids,
@@ -61,11 +61,6 @@ class Qwen2Attention(torch.nn.Module):
             config.sliding_window if config.sliding_window is not None else -1
         )
         self.num_heads = config.num_attention_heads
-        self.mrope_section = (
-            config.rope_scaling.get("mrope_section", None)
-            if config.rope_scaling is not None
-            else None
-        )
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads

@@ -127,17 +122,6 @@ class Qwen2Attention(torch.nn.Module):
         query = query.view(-1, self.num_heads, self.head_size)
         kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

-        if self.mrope_section is not None:
-            # if mrope_section is set, we need to split the cos and sin into 3 parts and concatenate them in a specific order
-            cos = torch.cat(
-                [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
-                dim=-1,
-            )
-            sin = torch.cat(
-                [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
-                dim=-1,
-            )
-
         self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

         if prefill_cache_indices is not None:

@@ -251,7 +235,7 @@ class Qwen2Layer(nn.Module):
         max_s,
         prefill_cache_indices,
     ):
-        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        normed_hidden_states, residual = self.input_layernorm(hidden_states)

         # Self Attention
         attn_output = self.self_attn(

@@ -266,15 +250,13 @@ class Qwen2Layer(nn.Module):
             max_s,
             prefill_cache_indices,
         )
+        hidden_states = attn_output + residual

         # faster post attention rms norm
-        normed_attn_res_output, attn_res = self.post_attention_layernorm(
-            attn_output, res
-        )
-        mlp_output = self.mlp(normed_attn_res_output)
-
-        return mlp_output, attn_res
+        hidden_states, residual = self.post_attention_layernorm(hidden_states)
+        mlp_output = self.mlp(hidden_states)
+        hidden_states = mlp_output + residual
+        return hidden_states


 class Qwen2Model(torch.nn.Module):
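For readers unfamiliar with the pattern being refactored here: a minimal sketch (with an invented class name and plain LayerNorm instead of TGI's fused RMSNorm, which also returns the residual) of the simplified pre-norm flow the new Qwen2Layer follows, where each sub-block normalizes its input, applies its op, and adds the result back onto the running hidden state:

import torch
import torch.nn as nn

class PreNormBlock(nn.Module):
    """Toy transformer block mirroring the refactored flow: norm -> op -> add residual."""

    def __init__(self, hidden_size: int = 64):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        # stand-ins for attention and the MLP
        self.self_attn = nn.Linear(hidden_size, hidden_size)
        self.mlp = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        attn_out = self.self_attn(self.input_layernorm(hidden_states))
        hidden_states = attn_out + residual          # first residual add

        residual = hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return mlp_out + residual                    # second residual add

x = torch.randn(2, 8, 64)
print(PreNormBlock()(x).shape)  # torch.Size([2, 8, 64])

The practical effect of the refactor is that the layer now returns a single tensor instead of a (hidden_states, residual) pair, which simplifies both Qwen2Model below and the vision block further down.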
@@ -322,18 +304,15 @@ class Qwen2Model(torch.nn.Module):
     ) -> torch.Tensor:
         hidden_states = inputs_embeds

-        # flatten position ids from 2D to 1D
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids.flatten(), true_max_s, hidden_states.dtype
+            position_ids,
+            true_max_s,
+            hidden_states.dtype,
         )
-        # reshape back to 2D if the position_ids were 2D
-        if position_ids.size(0) != cos.size(0):
-            cos = cos.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2)
-            sin = sin.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2)

         residual = None
         for i, layer in enumerate(self.layers):
-            hidden_states, residual = layer(
+            hidden_states = layer(
                 hidden_states,
                 residual,
                 cos,

@@ -347,7 +326,7 @@ class Qwen2Model(torch.nn.Module):
             prefill_cache_indices,
         )

-        hidden_states, _ = self.norm(hidden_states, residual)
+        hidden_states, _ = self.norm(hidden_states)

         return hidden_states
@@ -222,12 +222,11 @@ class Qwen2VLVisionBlock(nn.Module):
     def forward(
         self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen
     ) -> torch.Tensor:
-        hidden_states_post_norm1, res = self.norm1(hidden_states)
-        hidden_states = hidden_states + self.attn(
-            hidden_states_post_norm1, cu_seqlens, rotary_pos_emb, max_seqlen
-        )
-        hidden_states_post_norm2, res = self.norm2(hidden_states)
-        hidden_states = hidden_states + self.mlp(hidden_states_post_norm2)
+        norm1_out, residual = self.norm1(hidden_states)
+        attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen)
+        hidden_states = attn_out + residual
+        norm2_out, residual = self.norm2(hidden_states)
+        hidden_states = hidden_states + self.mlp(norm2_out)
         return hidden_states
@@ -378,8 +377,12 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         self.config = config
         config.vision_config.quantize = None
         config.vision_config.speculator = config.speculator
+        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
+        # returns rope_scaling.type == "default" for Qwen2-VL model at the moment
+        config.rope_scaling.update({"rope_type": "mrope"})
         self.hidden_size = config.hidden_size
         self.vision_start_token_id = config.vision_start_token_id
+        self.vision_end_token_id = config.vision_end_token_id
         self.image_token_id = config.image_token_id
         self.video_token_id = config.video_token_id
         self.spatial_merge_size = config.vision_config.spatial_merge_size
@@ -407,98 +410,88 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         )
         self.device = weights.device

(The previous get_position_ids implementation, which took a 2D batch_input_ids tensor, returned a (3, batch, seq_len) tensor, and looped in Python over each sequence and each image, locating the next image token with GPU<->CPU .item() copies before building per-segment text and image position ids, is removed and replaced by the segment-based version below:)

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids
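A hedged, standalone illustration of the (seq_len, 3) layout this kind of scheme produces, using toy token ids and a made-up 1x2x2 merged patch grid rather than the real Qwen2-VL vocabulary, and with simplified offset bookkeeping (the actual method above also folds the image width into the offset of the following text segment):

import torch

# Toy sequence: text, text, <vision_start>, 4 image patches, <vision_end>, text, text
VISION_START, VISION_END, IMAGE = 100, 101, 102
input_ids = torch.tensor([1, 2, VISION_START, IMAGE, IMAGE, IMAGE, IMAGE, VISION_END, 3, 4])
grid_t, grid_h, grid_w = 1, 2, 2   # assumed merged patch grid for the single image

pos = []
next_pos = 0
i = 0
while i < len(input_ids):
    if input_ids[i] == IMAGE:
        # image block: temporal/height/width indices, offset by the running position
        t_idx = torch.arange(grid_t).repeat_interleave(grid_h * grid_w)
        h_idx = torch.arange(grid_h).repeat_interleave(grid_w).repeat(grid_t)
        w_idx = torch.arange(grid_w).repeat(grid_t * grid_h)
        block = torch.stack([t_idx, h_idx, w_idx], dim=1) + next_pos
        pos.append(block)
        next_pos = int(block.max()) + 1
        i += grid_t * grid_h * grid_w
    else:
        # text tokens (and the vision start/end markers): all three components share one index
        pos.append(torch.full((1, 3), next_pos))
        next_pos += 1
        i += 1

position_ids = torch.cat(pos, dim=0)   # shape (seq_len, 3), one (t, h, w) triple per token
print(position_ids)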
@@ -527,6 +520,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):

         # apply the visual model to the pixel values if they are provided
         if pixel_values is not None and len(pixel_values) > 0:
+            pixel_values = pixel_values.to(inputs_embeds.dtype)
             if pixel_values is not None:
                 image_embeds = self.visual(
                     pixel_values, grid_thw=image_grid_thw

@@ -545,7 +539,6 @@ class Qwen2VLForConditionalGeneration(nn.Module):
             true_max_s=max_s,
             prefill_cache_indices=prefill_cache_indices,
         )
-        hidden_states, _ = self.norm(hidden_states)
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
         logits, speculative_logits = self.lm_head(hidden_states)
@@ -1401,6 +1401,13 @@ class FlashCausalLM(Model):
         if max_bs is None:
             input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device)
             position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
+            config = getattr(self.model, "config", None)
+            rope_scaling = getattr(config, "rope_scaling", None) if config else None
+            if (  # mrope have position_ids per section, if so repeat n times
+                isinstance(rope_scaling, dict) and rope_scaling["rope_type"] == "mrope"
+            ):
+                n_sections = len(self.model.config.rope_scaling["mrope_section"])
+                position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
             slots = torch.arange(bs, dtype=torch.int64, device=self.device)
             input_lengths_tensor = (
                 torch.ones(bs, dtype=torch.int32, device=self.device) * max_s
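A small sketch of the shape change made in the warmup above (batch size and section count are arbitrary values here, not taken from the commit): for mrope models the dummy position ids go from (bs,) to (bs, n_sections) so the CUDA graph is captured with the same shape the real forward pass will use.

import torch

bs, n_sections = 4, 3                  # illustrative values only
position_ids = torch.zeros(bs, dtype=torch.int32)
position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
print(position_ids.shape)              # torch.Size([4, 3])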
@@ -1456,14 +1463,6 @@ class FlashCausalLM(Model):
         else:
             state = None

-        if (
-            hasattr(self.model, "config")
-            and hasattr(self.model.config, "model_type")
-            and self.model.config.model_type == "qwen2_vl"
-        ):
-            if position_ids.dim() == 1:
-                position_ids = self.model.get_position_ids(input_ids)
-
         graph = torch.cuda.CUDAGraph()
         self.cuda_graphs[bs] = {
             "input_ids": input_ids,
self.cuda_graphs[bs] = {
|
self.cuda_graphs[bs] = {
|
||||||
"input_ids": input_ids,
|
"input_ids": input_ids,
|
||||||
@ -2050,7 +2049,7 @@ class FlashCausalLM(Model):
|
|||||||
# instantly become of shape [BATCH_SIZE]
|
# instantly become of shape [BATCH_SIZE]
|
||||||
if prefill and finished_prefilling:
|
if prefill and finished_prefilling:
|
||||||
indices = batch.cu_seqlen_prefill[1:] - 1
|
indices = batch.cu_seqlen_prefill[1:] - 1
|
||||||
batch.position_ids = batch.position_ids[(..., indices)]
|
batch.position_ids = batch.position_ids[indices]
|
||||||
batch.slot_indices = batch.slot_indices[indices]
|
batch.slot_indices = batch.slot_indices[indices]
|
||||||
batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
|
batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
|
||||||
indices
|
indices
|
||||||