diff --git a/docs/openapi.json b/docs/openapi.json index 0b5b3ae3..80fa5b3d 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2183,4 +2183,4 @@ "description": "Hugging Face Text Generation Inference API" } ] -} +} \ No newline at end of file diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 832f88ef..3fa78ee9 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -20,6 +20,7 @@ Text Generation Inference enables serving optimized models on specific hardware - [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) - [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder) - [Phi](https://huggingface.co/microsoft/phi-1_5) +- [PhiMoe](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct) - [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) - [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct) - [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py index e3a9eff3..177c7f24 100644 --- a/integration-tests/models/test_flash_phi35_moe.py +++ b/integration-tests/models/test_flash_phi35_moe.py @@ -4,9 +4,7 @@ import pytest @pytest.fixture(scope="module") def flash_llama_chat_handle(launcher): with launcher( - "microsoft/Phi-3.5-MoE-instruct", - num_shard=4, - cuda_graphs=[1, 2] + "microsoft/Phi-3.5-MoE-instruct", num_shard=4, cuda_graphs=[1, 2] ) as handle: yield handle