diff --git a/integration-tests/models/test_flash_grammar_llama.py b/integration-tests/models/test_flash_grammar_llama.py
index 585d0656..ce1cf787 100644
--- a/integration-tests/models/test_flash_grammar_llama.py
+++ b/integration-tests/models/test_flash_grammar_llama.py
@@ -28,6 +28,7 @@ async def test_flash_llama_grammar(flash_llama_grammar, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_llama_grammar_regex(flash_llama_grammar, response_snapshot):
     response = await flash_llama_grammar.generate(
@@ -46,6 +47,7 @@ async def test_flash_llama_grammar_regex(flash_llama_grammar, response_snapshot)
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_llama_grammar_json(flash_llama_grammar, response_snapshot):
     response = await flash_llama_grammar.generate(
@@ -94,6 +96,7 @@ async def test_flash_llama_grammar_json(flash_llama_grammar, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_llama_grammar_load(
     flash_llama_grammar, generate_load, response_snapshot
@@ -125,6 +128,7 @@ async def test_flash_llama_grammar_load(
 
 # this is the same as the above test, but only fires off a single request
 # this is only to ensure that the parallel and single inference produce the same result
+@pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_llama_grammar_single_load_instance(
     flash_llama_grammar, generate_load, response_snapshot
diff --git a/integration-tests/models/test_grammar_llama.py b/integration-tests/models/test_grammar_llama.py
index da46352f..ce5da8a9 100644
--- a/integration-tests/models/test_grammar_llama.py
+++ b/integration-tests/models/test_grammar_llama.py
@@ -21,6 +21,7 @@ async def non_flash_llama_grammar(non_flash_llama_grammar_handle):
     return non_flash_llama_grammar_handle.client
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, response_snapshot):
     response = await non_flash_llama_grammar.generate(
diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py
index 38570c38..21bcbb52 100644
--- a/integration-tests/models/test_tools_llama.py
+++ b/integration-tests/models/test_tools_llama.py
@@ -98,6 +98,7 @@ async def test_flash_llama_grammar_no_tools(
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_snapshot):
@@ -134,6 +135,7 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_auto(
@@ -173,6 +175,7 @@ async def test_flash_llama_grammar_tools_auto(
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_choice(
@@ -208,6 +211,7 @@ async def test_flash_llama_grammar_tools_choice(
     assert response == response_snapshot
 
 
+@pytest.mark.skip
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_grammar_tools_stream(