Add the usual model tests

This commit is contained in:
Daniël de Kok 2024-09-25 10:27:29 +00:00
parent 245d6d8f7c
commit e3e483c901
5 changed files with 704 additions and 48 deletions

View File

@ -0,0 +1,109 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1724,
"logprob": null,
"text": "What"
},
{
"id": 338,
"logprob": -0.7133789,
"text": "is"
},
{
"id": 16030,
"logprob": -13.9296875,
"text": "gradient"
},
{
"id": 26815,
"logprob": -0.048919678,
"text": "descent"
},
{
"id": 29973,
"logprob": -3.0078125,
"text": "?"
},
{
"id": 13,
"logprob": -2.8105469,
"text": "\n"
},
{
"id": 13,
"logprob": -0.84521484,
"text": "\n"
}
],
"seed": null,
"tokens": [
{
"id": 25584,
"logprob": -0.017028809,
"special": false,
"text": "Grad"
},
{
"id": 993,
"logprob": -0.0027313232,
"special": false,
"text": "ient"
},
{
"id": 26815,
"logprob": -0.023254395,
"special": false,
"text": " descent"
},
{
"id": 338,
"logprob": -2.0623207e-05,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.5361328,
"special": false,
"text": " a"
},
{
"id": 937,
"logprob": -0.17578125,
"special": false,
"text": " first"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 2098,
"logprob": -0.00011539459,
"special": false,
"text": "order"
},
{
"id": 13883,
"logprob": -0.47436523,
"special": false,
"text": " optimization"
},
{
"id": 5687,
"logprob": -0.00027680397,
"special": false,
"text": " algorithm"
}
],
"top_tokens": null
},
"generated_text": "Gradient descent is a first-order optimization algorithm"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 16030,
"logprob": null,
"text": "gradient"
},
{
"id": 26815,
"logprob": -6.4960938,
"text": "descent"
},
{
"id": 29973,
"logprob": -5.1484375,
"text": "?"
},
{
"id": 13,
"logprob": -4.0351562,
"text": "\n"
},
{
"id": 13,
"logprob": -5.2265625,
"text": "\n"
}
],
"seed": 0,
"tokens": [
{
"id": 10994,
"logprob": -1.1542969,
"special": false,
"text": "Hello"
},
{
"id": 29991,
"logprob": 0.0,
"special": false,
"text": "!"
},
{
"id": 739,
"logprob": 0.0,
"special": false,
"text": " It"
},
{
"id": 2444,
"logprob": -0.42260742,
"special": false,
"text": " seems"
},
{
"id": 366,
"logprob": 0.0,
"special": false,
"text": " you"
},
{
"id": 29915,
"logprob": 0.0,
"special": false,
"text": "'"
},
{
"id": 276,
"logprob": -0.9838867,
"special": false,
"text": "re"
},
{
"id": 3211,
"logprob": 0.0,
"special": false,
"text": " address"
},
{
"id": 292,
"logprob": 0.0,
"special": false,
"text": "ing"
},
{
"id": 263,
"logprob": -0.15124512,
"special": false,
"text": " a"
}
],
"top_tokens": null
},
"generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a"
}

View File

@ -0,0 +1,438 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1724,
"logprob": null,
"text": "What"
},
{
"id": 338,
"logprob": -0.7133789,
"text": "is"
},
{
"id": 16030,
"logprob": -13.9296875,
"text": "gradient"
},
{
"id": 26815,
"logprob": -0.048919678,
"text": "descent"
},
{
"id": 29973,
"logprob": -3.0078125,
"text": "?"
},
{
"id": 13,
"logprob": -2.8105469,
"text": "\n"
},
{
"id": 13,
"logprob": -0.84521484,
"text": "\n"
}
],
"seed": null,
"tokens": [
{
"id": 25584,
"logprob": -0.017028809,
"special": false,
"text": "Grad"
},
{
"id": 993,
"logprob": -0.0028476715,
"special": false,
"text": "ient"
},
{
"id": 26815,
"logprob": -0.023971558,
"special": false,
"text": " descent"
},
{
"id": 338,
"logprob": -2.0384789e-05,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.5229492,
"special": false,
"text": " a"
},
{
"id": 937,
"logprob": -0.17602539,
"special": false,
"text": " first"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 2098,
"logprob": -0.000116467476,
"special": false,
"text": "order"
},
{
"id": 13883,
"logprob": -0.47436523,
"special": false,
"text": " optimization"
},
{
"id": 5687,
"logprob": -0.00027871132,
"special": false,
"text": " algorithm"
}
],
"top_tokens": null
},
"generated_text": "Gradient descent is a first-order optimization algorithm"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1724,
"logprob": null,
"text": "What"
},
{
"id": 338,
"logprob": -0.7128906,
"text": "is"
},
{
"id": 16030,
"logprob": -13.9375,
"text": "gradient"
},
{
"id": 26815,
"logprob": -0.05053711,
"text": "descent"
},
{
"id": 29973,
"logprob": -3.0058594,
"text": "?"
},
{
"id": 13,
"logprob": -2.8242188,
"text": "\n"
},
{
"id": 13,
"logprob": -0.84521484,
"text": "\n"
}
],
"seed": null,
"tokens": [
{
"id": 25584,
"logprob": -0.018859863,
"special": false,
"text": "Grad"
},
{
"id": 993,
"logprob": -0.002822876,
"special": false,
"text": "ient"
},
{
"id": 26815,
"logprob": -0.023254395,
"special": false,
"text": " descent"
},
{
"id": 338,
"logprob": -2.0384789e-05,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.5229492,
"special": false,
"text": " a"
},
{
"id": 937,
"logprob": -0.17126465,
"special": false,
"text": " first"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 2098,
"logprob": -0.0001155138,
"special": false,
"text": "order"
},
{
"id": 13883,
"logprob": -0.47436523,
"special": false,
"text": " optimization"
},
{
"id": 5687,
"logprob": -0.00027036667,
"special": false,
"text": " algorithm"
}
],
"top_tokens": null
},
"generated_text": "Gradient descent is a first-order optimization algorithm"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1724,
"logprob": null,
"text": "What"
},
{
"id": 338,
"logprob": -0.71484375,
"text": "is"
},
{
"id": 16030,
"logprob": -13.9375,
"text": "gradient"
},
{
"id": 26815,
"logprob": -0.049346924,
"text": "descent"
},
{
"id": 29973,
"logprob": -3.0078125,
"text": "?"
},
{
"id": 13,
"logprob": -2.8242188,
"text": "\n"
},
{
"id": 13,
"logprob": -0.86328125,
"text": "\n"
}
],
"seed": null,
"tokens": [
{
"id": 25584,
"logprob": -0.017196655,
"special": false,
"text": "Grad"
},
{
"id": 993,
"logprob": -0.0028438568,
"special": false,
"text": "ient"
},
{
"id": 26815,
"logprob": -0.023254395,
"special": false,
"text": " descent"
},
{
"id": 338,
"logprob": -2.026558e-05,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.5229492,
"special": false,
"text": " a"
},
{
"id": 937,
"logprob": -0.17602539,
"special": false,
"text": " first"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 2098,
"logprob": -0.00011622906,
"special": false,
"text": "order"
},
{
"id": 13883,
"logprob": -0.48608398,
"special": false,
"text": " optimization"
},
{
"id": 5687,
"logprob": -0.00027894974,
"special": false,
"text": " algorithm"
}
],
"top_tokens": null
},
"generated_text": "Gradient descent is a first-order optimization algorithm"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1724,
"logprob": null,
"text": "What"
},
{
"id": 338,
"logprob": -0.7192383,
"text": "is"
},
{
"id": 16030,
"logprob": -13.9375,
"text": "gradient"
},
{
"id": 26815,
"logprob": -0.050445557,
"text": "descent"
},
{
"id": 29973,
"logprob": -3.0078125,
"text": "?"
},
{
"id": 13,
"logprob": -2.8242188,
"text": "\n"
},
{
"id": 13,
"logprob": -0.8276367,
"text": "\n"
}
],
"seed": null,
"tokens": [
{
"id": 25584,
"logprob": -0.01727295,
"special": false,
"text": "Grad"
},
{
"id": 993,
"logprob": -0.0027542114,
"special": false,
"text": "ient"
},
{
"id": 26815,
"logprob": -0.023254395,
"special": false,
"text": " descent"
},
{
"id": 338,
"logprob": -2.0384789e-05,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.5229492,
"special": false,
"text": " a"
},
{
"id": 937,
"logprob": -0.17126465,
"special": false,
"text": " first"
},
{
"id": 29899,
"logprob": 0.0,
"special": false,
"text": "-"
},
{
"id": 2098,
"logprob": -0.00011301041,
"special": false,
"text": "order"
},
{
"id": 13883,
"logprob": -0.48608398,
"special": false,
"text": " optimization"
},
{
"id": 5687,
"logprob": -0.00027894974,
"special": false,
"text": " algorithm"
}
],
"top_tokens": null
},
"generated_text": "Gradient descent is a first-order optimization algorithm"
}
]

View File

@ -1,26 +0,0 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "I'm an AI unable to provide real-time data, but I can guide you on how to find current weather conditions in Brooklyn, New York. You can check websites like weather.com or accuweather.com, or use apps like The Weather Channel or AccuWeather on your smartphone. Alternatively, you can ask your voice assistant like Google Assistant or Siri for real-time updates.\n\nFor your information, I hope you'll have a",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1725383029,
"id": "",
"model": "microsoft/Phi-3.5-MoE-instruct",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 100,
"prompt_tokens": 31,
"total_tokens": 131
}
}

View File

@ -2,38 +2,74 @@ import pytest
@pytest.fixture(scope="module")
def flash_phi35_moe_handle(launcher):
    # Launch a text-generation-inference server for Phi-3.5-MoE, sharded
    # across 4 GPUs; the handle is shared by every test in this module.
    with launcher(
        "microsoft/Phi-3.5-MoE-instruct",
        num_shard=4,
    ) as handle:
        yield handle
@pytest.fixture(scope="module")
async def flash_phi35_moe(flash_phi35_moe_handle):
    """Wait (up to 300s) for the launched server to become healthy, then
    return its client for use by the tests."""
    await flash_phi35_moe_handle.health(300)
    return flash_phi35_moe_handle.client
@pytest.mark.asyncio
async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot):
    """Greedy (unseeded) generation must yield the expected deterministic
    completion and match the stored snapshot."""
    response = await flash_phi35_moe.generate(
        "What is gradient descent?\n\n", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "Gradient descent is a first-order optimization algorithm"
    )
    assert response == response_snapshot
@pytest.mark.asyncio
async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot):
    """Exercise the full set of generation/sampling parameters with a fixed
    seed so the sampled output is reproducible and snapshot-comparable."""
    result = await flash_phi35_moe.generate(
        "What is gradient descent?\n\n",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    expected = "What is gradient descent?\n\nHello! It seems you're addressing a"
    assert result.details.generated_tokens == 10
    assert result.generated_text == expected
    assert result == response_snapshot
@pytest.mark.asyncio
async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_snapshot):
    """Fire four identical concurrent requests; all must produce the same
    greedy completion and the batch must match the stored snapshot."""
    responses = await generate_load(
        flash_phi35_moe, "What is gradient descent?\n\n", max_new_tokens=10, n=4
    )

    generated = [r.generated_text for r in responses]
    assert len(responses) == 4
    assert responses[0].details.generated_tokens == 10
    assert generated[0] == "Gradient descent is a first-order optimization algorithm"
    assert all(text == generated[0] for text in generated), f"{generated}"
    assert responses == response_snapshot