From e3e483c9011c418c4a024cb56a7cf94ed5b641a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Wed, 25 Sep 2024 10:27:29 +0000
Subject: [PATCH] Add the usual model tests

---
 .../test_flash_phi35_moe.json                 | 109 +++++
 .../test_flash_phi35_moe_all_params.json      |  99 ++++
 .../test_flash_phi35_moe_load.json            | 438 ++++++++++++++++++
 .../test_flash_phi35_moe_simple.json          |  26 --
 .../models/test_flash_phi35_moe.py            |  80 +++-
 5 files changed, 704 insertions(+), 48 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
 delete mode 100644 integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_simple.json

diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
new file mode 100644
index 00000000..0d6dca31
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
@@ -0,0 +1,109 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1724,
+        "logprob": null,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -0.7133789,
+        "text": "is"
+      },
+      {
+        "id": 16030,
+        "logprob": -13.9296875,
+        "text": "gradient"
+      },
+      {
+        "id": 26815,
+        "logprob": -0.048919678,
+        "text": "descent"
+      },
+      {
+        "id": 29973,
+        "logprob": -3.0078125,
+        "text": "?"
+      },
+      {
+        "id": 13,
+        "logprob": -2.8105469,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.84521484,
+        "text": "\n"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 25584,
+        "logprob": -0.017028809,
+        "special": false,
+        "text": "Grad"
+      },
+      {
+        "id": 993,
+        "logprob": -0.0027313232,
+        "special": false,
+        "text": "ient"
+      },
+      {
+        "id": 26815,
+        "logprob": -0.023254395,
+        "special": false,
+        "text": " descent"
+      },
+      {
+        "id": 338,
+        "logprob": -2.0623207e-05,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 263,
+        "logprob": -0.5361328,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 937,
+        "logprob": -0.17578125,
+        "special": false,
+        "text": " first"
+      },
+      {
+        "id": 29899,
+        "logprob": 0.0,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 2098,
+        "logprob": -0.00011539459,
+        "special": false,
+        "text": "order"
+      },
+      {
+        "id": 13883,
+        "logprob": -0.47436523,
+        "special": false,
+        "text": " optimization"
+      },
+      {
+        "id": 5687,
+        "logprob": -0.00027680397,
+        "special": false,
+        "text": " algorithm"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Gradient descent is a first-order optimization algorithm"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
new file mode 100644
index 00000000..38b80335
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
@@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 16030,
+        "logprob": null,
+        "text": "gradient"
+      },
+      {
+        "id": 26815,
+        "logprob": -6.4960938,
+        "text": "descent"
+      },
+      {
+        "id": 29973,
+        "logprob": -5.1484375,
+        "text": "?"
+      },
+      {
+        "id": 13,
+        "logprob": -4.0351562,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -5.2265625,
+        "text": "\n"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 10994,
+        "logprob": -1.1542969,
+        "special": false,
+        "text": "Hello"
+      },
+      {
+        "id": 29991,
+        "logprob": 0.0,
+        "special": false,
+        "text": "!"
+      },
+      {
+        "id": 739,
+        "logprob": 0.0,
+        "special": false,
+        "text": " It"
+      },
+      {
+        "id": 2444,
+        "logprob": -0.42260742,
+        "special": false,
+        "text": " seems"
+      },
+      {
+        "id": 366,
+        "logprob": 0.0,
+        "special": false,
+        "text": " you"
+      },
+      {
+        "id": 29915,
+        "logprob": 0.0,
+        "special": false,
+        "text": "'"
+      },
+      {
+        "id": 276,
+        "logprob": -0.9838867,
+        "special": false,
+        "text": "re"
+      },
+      {
+        "id": 3211,
+        "logprob": 0.0,
+        "special": false,
+        "text": " address"
+      },
+      {
+        "id": 292,
+        "logprob": 0.0,
+        "special": false,
+        "text": "ing"
+      },
+      {
+        "id": 263,
+        "logprob": -0.15124512,
+        "special": false,
+        "text": " a"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
new file mode 100644
index 00000000..f1f81152
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
@@ -0,0 +1,438 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1724,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.7133789,
+          "text": "is"
+        },
+        {
+          "id": 16030,
+          "logprob": -13.9296875,
+          "text": "gradient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.048919678,
+          "text": "descent"
+        },
+        {
+          "id": 29973,
+          "logprob": -3.0078125,
+          "text": "?"
+        },
+        {
+          "id": 13,
+          "logprob": -2.8105469,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.84521484,
+          "text": "\n"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25584,
+          "logprob": -0.017028809,
+          "special": false,
+          "text": "Grad"
+        },
+        {
+          "id": 993,
+          "logprob": -0.0028476715,
+          "special": false,
+          "text": "ient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.023971558,
+          "special": false,
+          "text": " descent"
+        },
+        {
+          "id": 338,
+          "logprob": -2.0384789e-05,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.5229492,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 937,
+          "logprob": -0.17602539,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 29899,
+          "logprob": 0.0,
+          "special": false,
+          "text": "-"
+        },
+        {
+          "id": 2098,
+          "logprob": -0.000116467476,
+          "special": false,
+          "text": "order"
+        },
+        {
+          "id": 13883,
+          "logprob": -0.47436523,
+          "special": false,
+          "text": " optimization"
+        },
+        {
+          "id": 5687,
+          "logprob": -0.00027871132,
+          "special": false,
+          "text": " algorithm"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "Gradient descent is a first-order optimization algorithm"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1724,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.7128906,
+          "text": "is"
+        },
+        {
+          "id": 16030,
+          "logprob": -13.9375,
+          "text": "gradient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.05053711,
+          "text": "descent"
+        },
+        {
+          "id": 29973,
+          "logprob": -3.0058594,
+          "text": "?"
+        },
+        {
+          "id": 13,
+          "logprob": -2.8242188,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.84521484,
+          "text": "\n"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25584,
+          "logprob": -0.018859863,
+          "special": false,
+          "text": "Grad"
+        },
+        {
+          "id": 993,
+          "logprob": -0.002822876,
+          "special": false,
+          "text": "ient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.023254395,
+          "special": false,
+          "text": " descent"
+        },
+        {
+          "id": 338,
+          "logprob": -2.0384789e-05,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.5229492,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 937,
+          "logprob": -0.17126465,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 29899,
+          "logprob": 0.0,
+          "special": false,
+          "text": "-"
+        },
+        {
+          "id": 2098,
+          "logprob": -0.0001155138,
+          "special": false,
+          "text": "order"
+        },
+        {
+          "id": 13883,
+          "logprob": -0.47436523,
+          "special": false,
+          "text": " optimization"
+        },
+        {
+          "id": 5687,
+          "logprob": -0.00027036667,
+          "special": false,
+          "text": " algorithm"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "Gradient descent is a first-order optimization algorithm"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1724,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.71484375,
+          "text": "is"
+        },
+        {
+          "id": 16030,
+          "logprob": -13.9375,
+          "text": "gradient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.049346924,
+          "text": "descent"
+        },
+        {
+          "id": 29973,
+          "logprob": -3.0078125,
+          "text": "?"
+        },
+        {
+          "id": 13,
+          "logprob": -2.8242188,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.86328125,
+          "text": "\n"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25584,
+          "logprob": -0.017196655,
+          "special": false,
+          "text": "Grad"
+        },
+        {
+          "id": 993,
+          "logprob": -0.0028438568,
+          "special": false,
+          "text": "ient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.023254395,
+          "special": false,
+          "text": " descent"
+        },
+        {
+          "id": 338,
+          "logprob": -2.026558e-05,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.5229492,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 937,
+          "logprob": -0.17602539,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 29899,
+          "logprob": 0.0,
+          "special": false,
+          "text": "-"
+        },
+        {
+          "id": 2098,
+          "logprob": -0.00011622906,
+          "special": false,
+          "text": "order"
+        },
+        {
+          "id": 13883,
+          "logprob": -0.48608398,
+          "special": false,
+          "text": " optimization"
+        },
+        {
+          "id": 5687,
+          "logprob": -0.00027894974,
+          "special": false,
+          "text": " algorithm"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "Gradient descent is a first-order optimization algorithm"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1724,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.7192383,
+          "text": "is"
+        },
+        {
+          "id": 16030,
+          "logprob": -13.9375,
+          "text": "gradient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.050445557,
+          "text": "descent"
+        },
+        {
+          "id": 29973,
+          "logprob": -3.0078125,
+          "text": "?"
+        },
+        {
+          "id": 13,
+          "logprob": -2.8242188,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.8276367,
+          "text": "\n"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25584,
+          "logprob": -0.01727295,
+          "special": false,
+          "text": "Grad"
+        },
+        {
+          "id": 993,
+          "logprob": -0.0027542114,
+          "special": false,
+          "text": "ient"
+        },
+        {
+          "id": 26815,
+          "logprob": -0.023254395,
+          "special": false,
+          "text": " descent"
+        },
+        {
+          "id": 338,
+          "logprob": -2.0384789e-05,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.5229492,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 937,
+          "logprob": -0.17126465,
+          "special": false,
+          "text": " first"
+        },
+        {
+          "id": 29899,
+          "logprob": 0.0,
+          "special": false,
+          "text": "-"
+        },
+        {
+          "id": 2098,
+          "logprob": -0.00011301041,
+          "special": false,
+          "text": "order"
+        },
+        {
+          "id": 13883,
+          "logprob": -0.48608398,
+          "special": false,
+          "text": " optimization"
+        },
+        {
+          "id": 5687,
+          "logprob": -0.00027894974,
+          "special": false,
+          "text": " algorithm"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "Gradient descent is a first-order optimization algorithm"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_simple.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_simple.json
deleted file mode 100644
index 7ffdbff9..00000000
--- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_simple.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "choices": [
-    {
-      "finish_reason": "length",
-      "index": 0,
-      "logprobs": null,
-      "message": {
-        "content": "I'm an AI unable to provide real-time data, but I can guide you on how to find current weather conditions in Brooklyn, New York. You can check websites like weather.com or accuweather.com, or use apps like The Weather Channel or AccuWeather on your smartphone. Alternatively, you can ask your voice assistant like Google Assistant or Siri for real-time updates.\n\nFor your information, I hope you'll have a",
-        "name": null,
-        "role": "assistant",
-        "tool_calls": null
-      },
-      "usage": null
-    }
-  ],
-  "created": 1725383029,
-  "id": "",
-  "model": "microsoft/Phi-3.5-MoE-instruct",
-  "object": "chat.completion",
-  "system_fingerprint": "2.2.1-dev0-native",
-  "usage": {
-    "completion_tokens": 100,
-    "prompt_tokens": 31,
-    "total_tokens": 131
-  }
-}
diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py
index d55b1ec5..2173740a 100644
--- a/integration-tests/models/test_flash_phi35_moe.py
+++ b/integration-tests/models/test_flash_phi35_moe.py
@@ -2,38 +2,74 @@ import pytest
 
 
 @pytest.fixture(scope="module")
-def flash_phi35_moe_chat_handle(launcher):
+def flash_phi35_moe_handle(launcher):
     with launcher(
-        "microsoft/Phi-3.5-MoE-instruct", num_shard=4, cuda_graphs=[1, 2]
+        "microsoft/Phi-3.5-MoE-instruct",
+        num_shard=4,
     ) as handle:
         yield handle
 
 
 @pytest.fixture(scope="module")
-async def flash_phi35_moe_chat(flash_phi35_moe_chat_handle):
-    await flash_phi35_moe_chat_handle.health(300)
-    return flash_phi35_moe_chat_handle.client
+async def flash_phi35_moe(flash_phi35_moe_handle):
+    await flash_phi35_moe_handle.health(300)
+    return flash_phi35_moe_handle.client
 
 
-@pytest.mark.private
-async def test_flash_phi35_moe_simple(flash_phi35_moe_chat, response_snapshot):
-    response = await flash_phi35_moe_chat.chat(
-        max_tokens=100,
-        seed=1337,
-        messages=[
-            {
-                "role": "system",
-                "content": "Youre a helpful assistant! Answer the users question best you can.",
-            },
-            {
-                "role": "user",
-                "content": "What is the weather like in Brooklyn, New York?",
-            },
-        ],
+@pytest.mark.asyncio
+async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot):
+    response = await flash_phi35_moe.generate(
+        "What is gradient descent?\n\n", max_new_tokens=10, decoder_input_details=True
     )
 
+    assert response.details.generated_tokens == 10
     assert (
-        response.choices[0].message.content
-        == "I'm an AI unable to provide real-time data, but I can guide you on how to find current weather conditions in Brooklyn, New York. You can check websites like weather.com or accuweather.com, or use apps like The Weather Channel or AccuWeather on your smartphone. Alternatively, you can ask your voice assistant like Google Assistant or Siri for real-time updates.\n\nFor your information, I hope you'll have a"
+        response.generated_text
+        == "Gradient descent is a first-order optimization algorithm"
     )
     assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot):
+    response = await flash_phi35_moe.generate(
+        "What is gradient descent?\n\n",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "What is gradient descent?\n\nHello! It seems you're addressing a"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_phi35_moe, "What is gradient descent?\n\n", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert responses[0].details.generated_tokens == 10
+    assert (
+        responses[0].generated_text
+        == "Gradient descent is a first-order optimization algorithm"
+    )
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text  for r in responses]}"
+
+    assert responses == response_snapshot