diff --git a/Dockerfile_intel b/Dockerfile_intel index 786176417..be2488661 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -215,16 +215,9 @@ COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ + make gen-server && \ pip install -U pip uv && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \ - . ./.venv/bin/activate && \ - make gen-server-raw - -RUN cd server && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \ - . ./.venv/bin/activate && \ - pwd && \ - text-generation-server --help + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark @@ -238,8 +231,5 @@ ENV ATTENTION=flashdecoding-ipex ENV PREFIX_CACHING=1 ENV PREFILL_CHUNKING=1 ENV CUDA_GRAPHS=0 -COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh -RUN chmod +x /tgi-entrypoint.sh - -ENTRYPOINT ["/tgi-entrypoint.sh"] +ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index d719a7f33..65154e89b 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -6,7 +6,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a bustling city, a chicken named Cluck", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -14,11 +14,11 @@ "usage": null } ], - "created": 1727773835, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": 
"chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, @@ -32,7 +32,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a world where even chickens could dream big,", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -40,63 +40,11 @@ "usage": null } ], - "created": 1727773835, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", - "usage": { - "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 - } - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "message": { - "content": "In a world where even chickens could dream big,", - "name": null, - "role": "assistant", - "tool_calls": null - }, - "usage": null - } - ], - "created": 1727773835, - "id": "", - "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", - "usage": { - "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 - } - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "message": { - "content": "In a world where even chickens could dream big,", - "name": null, - "role": "assistant", - "tool_calls": null - }, - "usage": null - } - ], - "created": 1727773835, - "id": "", - "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index a520e797f..14ca3f4eb 100644 --- 
a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a bustling city, a chicken named Cluck", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1727556016, + "created": 1738753833, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index f23356908..90b2bff19 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -47,8 +47,7 @@ async def test_mllama_simpl(mllama, response_snapshot): "total_tokens": 60, } assert ( - response.choices[0].message.content - == "In a bustling city, a chicken named Cluck" + response.choices[0].message.content == "In a small town, a chicken named Cluck" ) assert response == response_snapshot @@ -84,12 +83,12 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): ] responses = await asyncio.gather(*futures) - _ = [response.choices[0].message.content for response in responses] + generated_texts = [response.choices[0].message.content for response in responses] # XXX: TODO: Fix this test. 
- # assert generated_texts[0] == "In a bustling city, a chicken named Cluck" - # assert len(generated_texts) == 4 - # assert generated_texts, all( - # [text == generated_texts[0] for text in generated_texts] - # ) - # assert responses == response_snapshot + assert generated_texts[0] == "In a small town, a chicken named Cluck" + assert len(generated_texts) == 2 + assert all( + text == generated_texts[0] for text in generated_texts + ) + assert responses == response_snapshot