From d9c668f8b2b1442c5c25ae8dcc6b434c7522015a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 1 May 2024 18:21:17 +0200 Subject: [PATCH 01/46] Fix: "Fixing" double BOS for mistral too. (#1843) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/text_generation_server/models/flash_mistral.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 6959e2ec..85e93543 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -121,6 +121,11 @@ class FlashMistralBatch(FlashCausalLMBatch): requests_idx_mapping[r.id] = i tokenized_input = tokenized_input[-r.truncate :] + if ( + tokenized_input[0] == tokenizer.bos_token_id + and tokenized_input[1] == tokenizer.bos_token_id + ): + tokenized_input = tokenized_input[1:] input_length = len(tokenized_input) input_lengths.append(input_length) From 224be709ce6ac3fdd95b2370835506fde6202574 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 1 May 2024 21:48:06 +0200 Subject: [PATCH 02/46] Adding scripts to prepare load data. (#1841) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
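For context, this adds a `load_tests/Makefile` plus two small scripts: `filter.py` is meant to keep the first 2k ShareGPT conversations that open with a human turn (truncated to that opening turn), and `orca.py` does the same for Open-Orca/OpenOrca; both write a `small.json` prompt file, and the old `starcoder_load.js` k6 script is dropped. Below is a rough sketch of how the prepared file could be inspected — the path and exact entry shape are assumptions read off the scripts in this diff, not something the PR pins down:

```python
# Hypothetical sanity check of the prepared data (not part of this PR).
# Assumes `make prepare_share` or `make prepare_orca` was run inside load_tests/.
import json

with open("load_tests/small.json") as f:
    conversations = json.load(f)

# Each entry is expected to keep only the opening human turn, roughly:
# {"id": "...", "conversations": [{"from": "human", "value": "<prompt>"}]}
first = conversations[0]["conversations"][0]
print(len(conversations), first["from"], first["value"][:80])
```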
--- .gitignore | 1 + load_tests/Makefile | 9 ++++++ load_tests/filter.py | 26 +++++++++++++++ load_tests/orca.py | 27 ++++++++++++++++ load_tests/starcoder_load.js | 63 ------------------------------------ 5 files changed, 63 insertions(+), 63 deletions(-) create mode 100644 load_tests/Makefile create mode 100644 load_tests/filter.py create mode 100644 load_tests/orca.py delete mode 100644 load_tests/starcoder_load.js diff --git a/.gitignore b/.gitignore index 2ac2f6b4..e9ad1808 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp data/ +load_tests/*.json diff --git a/load_tests/Makefile b/load_tests/Makefile new file mode 100644 index 00000000..9199aa3b --- /dev/null +++ b/load_tests/Makefile @@ -0,0 +1,9 @@ + +ShareGPT_V3_unfiltered_cleaned_split.json: + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json + python filter.py + +prepare_orca: + python orca.py diff --git a/load_tests/filter.py b/load_tests/filter.py new file mode 100644 index 00000000..a00226ed --- /dev/null +++ b/load_tests/filter.py @@ -0,0 +1,26 @@ +import json + + +def main(): + with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f: + data = json.load(f) + + # Select only the first 2k conversations that start with a human. + max = 2000 + conversations = [] + for conversation in data: + conv = conversation.get("conversations") + if conv and conv[0]["from"] == "human": + # Trim the rest of the output + conversation["conversations"] = conversation["conversations"][:1] + conversations.append(conversation) + + if len(conversation) >= max: + break + + with open("./small.json", "w") as f: + data = json.dump(conversations, f, indent=4) + + +if __name__ == "__main__": + main() diff --git a/load_tests/orca.py b/load_tests/orca.py new file mode 100644 index 00000000..e607d27c --- /dev/null +++ b/load_tests/orca.py @@ -0,0 +1,27 @@ +import json +import datasets +import tqdm + + +def main(): + dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train") + # Select only the first 2k conversations that start with a human. + max = min(2000, len(dataset)) + conversations = [] + for item in tqdm.tqdm(dataset, total=max): + conversation = { + "conversations": [ + {"from": "human", "value": item["question"]}, + ], + "id": item["id"], + } + conversations.append(conversation) + if len(conversations) >= max: + break + + with open("./small.json", "w") as f: + data = json.dump(conversations, f, indent=4) + + +if __name__ == "__main__": + main() diff --git a/load_tests/starcoder_load.js b/load_tests/starcoder_load.js deleted file mode 100644 index 2f6cb3d6..00000000 --- a/load_tests/starcoder_load.js +++ /dev/null @@ -1,63 +0,0 @@ -import {check} from 'k6'; -import http from 'k6/http'; -import {Trend} from 'k6/metrics'; - -const host = __ENV.HOST || '127.0.0.1:3000'; - -const totalTime = new Trend('total_time', true); -const validationTime = new Trend('validation_time', true); -const queueTime = new Trend('queue_time', true); -const inferenceTime = new Trend('inference_time', true); -const timePerToken = new Trend('time_per_token', true); - -const example = { - payload: JSON.stringify({ - inputs: '# This is a fibonacci function written in the Python programming language.' 
+ - 'def fibonacci', - parameters: { - details: true, - max_new_tokens: 60, - temperature: 0.2, - top_p: 0.95, - seed: 0, - }, - }), - generated_tokens: 60 -}; - -export const options = { - thresholds: { - http_req_failed: ['rate==0'], - time_per_token: ['p(95)<90'], - queue_time: ['p(95)<1500'], - }, - scenarios: { - load_test: { - executor: 'constant-arrival-rate', - duration: '60s', - preAllocatedVUs: 100, - rate: 10, - timeUnit: '1s', - }, - }, -}; - -export default function () { - const headers = {'Content-Type': 'application/json'}; - const res = http.post(`http://${host}/generate`, example.payload, { - headers, - }); - - check(res, { - 'Post status is 200': (r) => res.status === 200, - 'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens, - }); - - if (res.status === 200) { - totalTime.add(res.headers["X-Total-Time"]); - validationTime.add(res.headers["X-Validation-Time"]); - queueTime.add(res.headers["X-Queue-Time"]); - inferenceTime.add(res.headers["X-Inference-Time"]); - timePerToken.add(res.headers["X-Time-Per-Token"]); - } -} From 6310e2454cf64564f5ff07da92882f4c7f134a5a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 2 May 2024 15:09:46 +0200 Subject: [PATCH 03/46] Remove misleading warning (not that important nowadays anyway). (#1848) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- .../models/custom_modeling/neox_modeling.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index 1b060060..f060ec0e 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -60,9 +60,6 @@ if ( except ImportError: pass -if not CUSTOM_KERNELS_ENABLED: - logger.warning("We're not using custom kernels.") - def make_causal_mask( input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int From ee002cad1e1821933373fa15a56abd35f91e6d78 Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 2 May 2024 10:56:24 -0400 Subject: [PATCH 04/46] feat: prefer huggingface_hub in docs and show image api (#1844) This PR prefers the `huggingface_hub` library, refactors the grammar docs and adds the new image_url api to the vlm docs. 
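In short, the guidance and VLM docs switch from the `text_generation` client to `huggingface_hub`'s `InferenceClient` (plus the OpenAI-compatible route), and the VLM page gains `image_url` chat examples. A minimal sketch of the switch — the endpoint URL and prompt are placeholders, and the full grammar/tool/image examples are in the updated docs below:

```python
# Previously documented client
# from text_generation import AsyncClient
# client = AsyncClient(base_url="http://localhost:3000")

# Now preferred in the docs
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")
print(client.text_generation("What is Deep Learning?", max_new_tokens=20))
```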
--- docs/source/basic_tutorials/using_guidance.md | 332 +++++++----------- .../basic_tutorials/visual_language_models.md | 62 +++- 2 files changed, 197 insertions(+), 197 deletions(-) diff --git a/docs/source/basic_tutorials/using_guidance.md b/docs/source/basic_tutorials/using_guidance.md index 606f2453..d0008fdb 100644 --- a/docs/source/basic_tutorials/using_guidance.md +++ b/docs/source/basic_tutorials/using_guidance.md @@ -2,7 +2,7 @@ Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs. -These feature are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them! +These feature are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them! _note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._ @@ -74,6 +74,45 @@ curl localhost:3000/generate \ ``` +### Hugging Face Hub Python Library + +The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter. + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://localhost:3000") + +schema = { + "properties": { + "location": {"title": "Location", "type": "string"}, + "activity": {"title": "Activity", "type": "string"}, + "animals_seen": { + "maximum": 5, + "minimum": 1, + "title": "Animals Seen", + "type": "integer", + }, + "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"}, + }, + "required": ["location", "activity", "animals_seen", "animals"], + "title": "Animals", + "type": "object", +} + +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": schema}, +) + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + +``` + A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar. > Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is a computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster. @@ -83,134 +122,55 @@ A grammar can be defined using Pydantic models, JSON schemas, or regular express Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way. 
```python -import requests +from huggingface_hub import InferenceClient from pydantic import BaseModel, conint from typing import List + class Animals(BaseModel): location: str activity: str animals_seen: conint(ge=1, le=5) # Constrained integer type animals: List[str] -prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park" -data = { - "inputs": prompt, - "parameters": { - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": Animals.schema() - } - } -} +client = InferenceClient("http://localhost:3000") -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data +user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park" +resp = client.text_generation( + f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}", + max_new_tokens=100, + seed=42, + grammar={"type": "json", "value": Animals.schema()}, ) -print(response.json()) -# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'} + +print(resp) +# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" } + ``` -### JSON Schema Integration - -If Pydantic's not your style, go raw with direct JSON Schema integration. This is similar to the first example but with programmatic control. +defining a grammar as regular expressions ```python -import requests +from huggingface_hub import InferenceClient -json_schema = { - "properties": { - "location": { - "type": "string" - }, - "activity": { - "type": "string" - }, - "animals_seen": { - "type": "integer", - "minimum": 1, - "maximum": 5 - }, - "animals": { - "type": "array", - "items": { - "type": "string" - } - } +client = InferenceClient("http://localhost:3000") + +regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)" + +resp = client.text_generation( + f"Whats Googles DNS? Please use the following regex: {regexp}", + seed=42, + grammar={ + "type": "regex", + "value": regexp, }, - "required": ["location", "activity", "animals_seen", "animals"] -} - -data = { - "inputs": "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park", - "parameters": { - "max_new_tokens": 200, - "repetition_penalty": 1.3, - "grammar": { - "type": "json", - "value": json_schema - } - } -} - -headers = { - "Content-Type": "application/json", -} - -response = requests.post( - 'http://127.0.0.1:3000/generate', - headers=headers, - json=data ) -print(response.json()) -# {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n , "animals_seen": 3,\n "location":"park"}'} -``` -### Using the client - -TGI provides a client library to that make it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter. 
- -```python -from text_generation import AsyncClient -from text_generation.types import GrammarType - -# NOTE: tools defined above and removed for brevity - -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.generate( - "Whats Googles DNS", - max_new_tokens=10, - decoder_input_details=True, - seed=1, - grammar={ - "type": GrammarType.Regex, - "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)", - }, - ) - - # Once the response is received, you can process it - print(response.generated_text) - -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# 118.8.0.84 +print(resp) +# 7.1.1.1 ``` @@ -265,107 +225,87 @@ curl localhost:3000/v1/chat/completions \ // {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}} ``` -### Text Generation Inference Client +### Chat Completion with Tools -TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions. +Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter. ```python -from text_generation import AsyncClient +from huggingface_hub import InferenceClient -# NOTE: tools defined above and removed for brevity +client = InferenceClient("http://localhost:3000") -# Define an async function to encapsulate the async operation -async def main(): - client = AsyncClient(base_url="http://localhost:3000") - - # Use 'await' to wait for the async method 'chat' to complete - response = await client.chat( - max_tokens=100, - seed=1, - tools=tools, - presence_penalty=-1.1, - messages=[ - { - "role": "system", - "content": "You're a helpful assistant! Answer the users question best you can.", +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. Infer this from the users location.", + }, + }, + "required": ["location", "format"], }, - { - "role": "user", - "content": "What is the weather like in Brooklyn, New York?", + }, + }, + { + "type": "function", + "function": { + "name": "get_n_day_weather_forecast", + "description": "Get an N-day weather forecast", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "format": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The temperature unit to use. 
Infer this from the users location.", + }, + "num_days": { + "type": "integer", + "description": "The number of days to forecast", + }, + }, + "required": ["location", "format", "num_days"], }, - ], - ) + }, + }, +] - # Once the response is received, you can process it - print(response.choices[0].message.tool_calls) +chat = client.chat_completion( + messages=[ + { + "role": "system", + "content": "You're a helpful assistant! Answer the users question best you can.", + }, + { + "role": "user", + "content": "What is the weather like in Brooklyn, New York?", + }, + ], + tools=tools, + seed=42, + max_tokens=100, +) -# Ensure the main async function is run in the event loop -if __name__ == "__main__": - import asyncio - asyncio.run(main()) - -# {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}} +print(chat.choices[0].message.tool_calls) +# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')] ``` -
- Tools used in example above - -```python - tools = [ - { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - }, - "required": ["location", "format"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_n_day_weather_forecast", - "description": "Get an N-day weather forecast", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", - }, - "format": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "The temperature unit to use. Infer this from the users location.", - }, - "num_days": { - "type": "integer", - "description": "The number of days to forecast", - }, - }, - "required": ["location", "format", "num_days"], - }, - }, - } - ] -``` - -
- ### OpenAI integration TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions. diff --git a/docs/source/basic_tutorials/visual_language_models.md b/docs/source/basic_tutorials/visual_language_models.md index e804ef09..3770db0b 100644 --- a/docs/source/basic_tutorials/visual_language_models.md +++ b/docs/source/basic_tutorials/visual_language_models.md @@ -53,7 +53,67 @@ for token in client.text_generation(prompt, max_new_tokens=10, stream=True): # This is a picture of an anthropomorphic rabbit in a space suit. ``` -If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text. +or via the `chat_completion` endpoint: + +```python +from huggingface_hub import InferenceClient + +client = InferenceClient("http://127.0.0.1:3000") + +chat = client.chat_completion( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + seed=42, + max_tokens=100, +) + +print(chat) +# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043)) + +``` + +or with OpenAi's library: + +```python +from openai import OpenAI + +# init the client but point it to TGI +client = OpenAI(base_url="http://localhost:3000/v1", api_key="-") + +chat_completion = client.chat.completions.create( + model="tgi", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ], + }, + ], + stream=False, +) + +print(chat_completion) +# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. 
', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027)) +``` ### Inference Through Sending `cURL` Requests From 6a164ce27022d9fe05febe1df912f0943b36d69c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 2 May 2024 19:07:10 +0200 Subject: [PATCH 05/46] Updating Phi3 (long context). (#1849) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- Cargo.lock | 8 +- router/src/config.rs | 1 + server/text_generation_server/utils/layers.py | 92 ++++++++++++++++++- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 671d615c..7962174b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3552,7 +3552,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "2.0.1" +version = "2.0.2" dependencies = [ "average", "clap", @@ -3573,7 +3573,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "2.0.1" +version = "2.0.2" dependencies = [ "futures", "grpc-metadata", @@ -3590,7 +3590,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "2.0.1" +version = "2.0.2" dependencies = [ "clap", "ctrlc", @@ -3608,7 +3608,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "2.0.1" +version = "2.0.2" dependencies = [ "async-stream", "axum", diff --git a/router/src/config.rs b/router/src/config.rs index 88cde69a..8640ede9 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -136,6 +136,7 @@ pub enum Config { Phi, #[serde(rename = "phi-msft")] PhiMsft, + Phi3, Llama, Baichuan, Gemma, diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 6e4a13cd..7d339fe5 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -1029,10 +1029,10 @@ try: scaling_factor = None rope_scaling = _get_rope_config(config) if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] if rope_scaling["type"] == "linear": pass elif rope_scaling["type"] == "dynamic": + scaling_factor = rope_scaling["factor"] return DynamicPositionRotaryEmbedding( dim=dim, max_position_embeddings=config.max_position_embeddings, @@ -1041,6 +1041,7 @@ try: scaling_factor=scaling_factor, ) elif rope_scaling["type"] == "yarn": + scaling_factor = rope_scaling["factor"] return YarnPositionRotaryEmbedding( dim=2 * inv_freq.shape[0], 
max_position_embeddings=rope_scaling[ @@ -1054,6 +1055,52 @@ try: beta_fast=32, beta_slow=1, ) + elif rope_scaling["type"] == "su": + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = ( + max_position_embeddings / original_max_position_embeddings + ) + scaling_factor = math.sqrt( + 1 + + math.log(scale) + / math.log(original_max_position_embeddings) + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) else: raise NotImplementedError( f"rope scaling type {rope_scaling['type']} is not implemented or invalid" @@ -1141,6 +1188,49 @@ try: # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. return cos.unsqueeze(1), sin.unsqueeze(1) + class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + if seqlen > self.original_max_position_embeddings: + inv_freq = self.long_inv_freq + else: + inv_freq = self.short_inv_freq + t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): inv_freq = _create_inv_freq(dim, base, device) From 227c7770c65d1677638a5811a2f915ca2682e51b Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 3 May 2024 16:39:04 +0200 Subject: [PATCH 06/46] Add router name to /info endpoint (#1854) Add `router` key in `/info` endpoint and set it to `env!("CARGO_PKG_NAME")` => so always set to `"text-generation-router"` in TGI. Happy to change the naming if you think of a better one (framework? package_name?) 
The goal is to use this information in `InferenceClient` to know the model is served with TGI. At the moment we can use https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2/info to infer it is TGI-served because it returns information but having a proper key would be better. For context, a transformers-served model is only outputting `{"ok": "ok"}` (see [here](https://api-inference.huggingface.co/models/microsoft/DialoGPT-large/info)). --- router/src/lib.rs | 2 ++ router/src/server.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/router/src/lib.rs b/router/src/lib.rs index fac4c14e..96a9fdf6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -159,6 +159,8 @@ pub struct Info { #[schema(example = "32")] pub max_client_batch_size: usize, /// Router Info + #[schema(example = "text-generation-router")] + pub router: &'static str, #[schema(example = "0.5.0")] pub version: &'static str, #[schema(nullable = true, example = "null")] diff --git a/router/src/server.rs b/router/src/server.rs index 7343c852..adaa409c 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1568,6 +1568,7 @@ pub async fn run( max_batch_size, validation_workers, max_client_batch_size, + router: env!("CARGO_PKG_NAME"), version: env!("CARGO_PKG_VERSION"), sha: option_env!("VERGEN_GIT_SHA"), docker_label: option_env!("DOCKER_LABEL"), From 263732ef7aacc3aa93ec94018eb3706cab9972d2 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 6 May 2024 13:48:11 +0200 Subject: [PATCH 07/46] Upgrading to rust 1.78. (#1851) Fixes # (issue) - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
--- Cargo.lock | 9 +++++---- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- benchmark/src/event.rs | 6 +++--- launcher/Cargo.toml | 1 + launcher/src/main.rs | 19 ++++++++++--------- router/grpc-metadata/src/lib.rs | 23 +---------------------- router/src/infer.rs | 4 ++-- router/src/server.rs | 13 +++++-------- rust-toolchain.toml | 7 +++---- 11 files changed, 33 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7962174b..31afeda9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5" dependencies = [ "windows-sys 0.52.0", ] @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.13.1" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "base64" @@ -3601,6 +3601,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "thiserror", "tracing", "tracing-subscriber", "vergen", diff --git a/Dockerfile b/Dockerfile index 94d10bc1..175287bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src FROM chef as planner diff --git a/Dockerfile_amd b/Dockerfile_amd index fb820116..57a7c637 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index d0791cac..8c7478c1 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,4 +1,4 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/benchmark/src/event.rs b/benchmark/src/event.rs index 91ce8400..07482aed 100644 --- a/benchmark/src/event.rs +++ b/benchmark/src/event.rs @@ -11,7 +11,7 @@ pub(crate) enum Event { /// Key press. Key(event::KeyEvent), /// Terminal resize. 
- Resize(u16, u16), + Resize, } pub(crate) async fn terminal_event_task( @@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender) { if event::poll(Duration::from_secs(0)).expect("no events available") { match event::read().expect("unable to read event") { event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()), - event::Event::Resize(w, h) => { - event_sender.send(Event::Resize(w, h)).await.unwrap_or(()) + event::Event::Resize(_w, _h) => { + event_sender.send(Event::Resize).await.unwrap_or(()) } _ => (), } diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index 6b6fd58e..eb219423 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] } once_cell = "1.19.0" serde = { version = "1.0.188", features = ["derive"] } serde_json = "1.0.107" +thiserror = "1.0.59" tracing = "0.1.37" tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] } diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 9a327b27..d6b45c1d 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -18,6 +18,7 @@ use std::thread; use std::thread::sleep; use std::time::{Duration, Instant}; use std::{fs, io}; +use thiserror::Error; use tracing_subscriber::EnvFilter; mod env_runtime; @@ -824,26 +825,26 @@ fn find_num_shards( Ok(num_shard) } -#[derive(Debug)] +#[derive(Debug, Error)] enum LauncherError { + #[error("Invalid argument: {0}")] ArgumentValidation(String), + #[error("not enough cuda devices: {0}")] NotEnoughCUDADevices(String), + #[error("Download error")] DownloadError, + #[error("Shard cannot start")] ShardCannotStart, + #[error("Shard disconnected")] ShardDisconnected, + #[error("Shard failed")] ShardFailed, + #[error("Webserver failed")] WebserverFailed, + #[error("Webserver cannot start")] WebserverCannotStart, } -impl core::fmt::Display for LauncherError { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "{self:?}") - } -} - -impl std::error::Error for LauncherError {} - fn download_convert_model(args: &Args, running: Arc) -> Result<(), LauncherError> { // Enter download tracing span let _span = tracing::span!(tracing::Level::INFO, "download").entered(); diff --git a/router/grpc-metadata/src/lib.rs b/router/grpc-metadata/src/lib.rs index 7ba353fa..3068a61c 100644 --- a/router/grpc-metadata/src/lib.rs +++ b/router/grpc-metadata/src/lib.rs @@ -2,30 +2,9 @@ //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples use opentelemetry::global; -use opentelemetry::propagation::{Extractor, Injector}; +use opentelemetry::propagation::Injector; use tracing_opentelemetry::OpenTelemetrySpanExt; -/// Extract context metadata from a gRPC request's metadata -struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap); - -impl<'a> Extractor for MetadataExtractor<'a> { - /// Get a value for a key from the MetadataMap. If the value can't be converted to &str, returns None - fn get(&self, key: &str) -> Option<&str> { - self.0.get(key).and_then(|metadata| metadata.to_str().ok()) - } - - /// Collect all the keys from the MetadataMap. - fn keys(&self) -> Vec<&str> { - self.0 - .keys() - .map(|key| match key { - tonic::metadata::KeyRef::Ascii(v) => v.as_str(), - tonic::metadata::KeyRef::Binary(v) => v.as_str(), - }) - .collect::>() - } -} - /// Inject context in the metadata of a gRPC request. 
struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap); diff --git a/router/src/infer.rs b/router/src/infer.rs index d48b47f6..85e8775e 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -1260,7 +1260,7 @@ mod tests { }, ]; - let example_chat_with_system = vec![Message { + let example_chat_with_system = [Message { role: "system".to_string(), content: Some( "You are a friendly chatbot who always responds in the style of a pirate" @@ -1384,7 +1384,7 @@ mod tests { { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); - let tmpl = env.template_from_str(&chat_template); + let tmpl = env.template_from_str(chat_template); let result = tmpl.unwrap().render(input).unwrap(); assert_eq!(result, target); } diff --git a/router/src/server.rs b/router/src/server.rs index adaa409c..6b51109b 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -698,7 +698,7 @@ async fn completions( model: model_id.clone(), system_fingerprint: system_fingerprint.clone(), }) - .map_or_else(|_e| Event::default(), |data| data) + .unwrap_or_else(|_e| Event::default()) }; let (header_tx, header_rx) = oneshot::channel(); @@ -1124,13 +1124,10 @@ async fn chat_completions( logprobs, stream_token.details.map(|d| d.finish_reason.to_string()), )) - .map_or_else( - |e| { - println!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - }, - |data| data, - ) + .unwrap_or_else(|e| { + println!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + }) }; let (headers, response_stream) = generate_stream_internal( diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 67982433..507ee859 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,6 +1,5 @@ [toolchain] -# Released on: 28 December, 2023 -# Branched from master on: 10 November, 2023 -# https://releases.rs/docs/1.75.0/ -channel = "1.75.0" +# Released on: 02 May, 2024 +# https://releases.rs/docs/1.78.0/ +channel = "1.78.0" components = ["rustfmt", "clippy"] From b726e4fa84743e05c6c30cba02b8ccfaf35bbd26 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 6 May 2024 22:05:43 +0800 Subject: [PATCH 08/46] update xpu docker image and use public ipex whel (#1860) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
Signed-off-by: Wang, Yi A --- Dockerfile_intel | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 8c7478c1..5bc39d64 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -36,7 +36,7 @@ RUN cargo build --release # Text Generation Inference base image for Intel -FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base +FROM intel/intel-extension-for-pytorch:2.1.30-xpu as base USER root # libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it @@ -47,7 +47,7 @@ RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1. RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list -RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build +RUN apt-get update && apt install -y intel-basekit xpu-smi # Text Generation Inference base env ENV HUGGINGFACE_HUB_CACHE=/data \ @@ -56,9 +56,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ WORKDIR /usr/src -# Build pytorch and ipex -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch +RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl +RUN pip install intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl # Install server COPY proto proto @@ -72,25 +71,11 @@ RUN cd server && \ ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric -ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh -ENV CCL_CONFIGURATION=cpu_gpu_dpcpp -ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man -ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest -ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib -ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so -ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64: -ENV MKLROOT=/opt/intel/oneapi/mkl/latest -ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N ENV 
PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include ENV CCL_ZE_IPC_EXCHANGE=sockets - -RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install -RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install - # Install benchmarker COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router From e5c4a219b39bb26947577e9af8c424f0eaa7f1dc Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 13 May 2024 12:44:30 +0200 Subject: [PATCH 09/46] Refactor layers. (#1866) Fixes # (issue) - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
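In practice the layer primitives move out of `server/text_generation_server/utils/layers.py` into a dedicated `text_generation_server/layers/` package (tensor-parallel linear/embedding, layernorm, rotary, GPTQ/AWQ, bnb, eetq, fp8, medusa, speculative head), and model code is updated accordingly. For anything importing these layers the visible change is mostly the import path — a sketch based on the test update below:

```python
# Before this PR (utils.layers module, now removed)
# from text_generation_server.utils.layers import TensorParallelEmbedding

# After this PR (dedicated layers package)
from text_generation_server.layers import TensorParallelEmbedding
```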
--- server/tests/utils/test_layers.py | 2 +- .../text_generation_server/layers/__init__.py | 14 + .../{utils => layers}/awq/conversion_utils.py | 0 .../{utils => layers}/awq/quantize/qmodule.py | 0 server/text_generation_server/layers/bnb.py | 106 ++ server/text_generation_server/layers/conv.py | 41 + server/text_generation_server/layers/eetq.py | 25 + server/text_generation_server/layers/fp8.py | 43 + .../layers/gptq/__init__.py | 39 + .../{utils => layers}/gptq/custom_autotune.py | 0 .../{utils => layers}/gptq/exllama.py | 0 .../{utils => layers}/gptq/exllamav2.py | 0 .../layers/gptq/exllamav2.py.rej | 10 + .../layers/gptq/quant_linear.py | 356 +++++ .../{utils => layers}/gptq/quantize.py | 0 .../layers/layernorm.py | 185 +++ .../text_generation_server/layers/linear.py | 153 ++ .../text_generation_server/layers/medusa.py | 186 +++ .../text_generation_server/layers/rotary.py | 419 +++++ .../layers/speculative.py | 35 + .../layers/tensor_parallel.py | 188 +++ .../models/cache_manager.py | 4 +- .../models/custom_modeling/bloom_modeling.py | 2 +- .../models/custom_modeling/clip.py | 2 +- .../custom_modeling/flash_cohere_modeling.py | 18 +- .../custom_modeling/flash_dbrx_modeling.py | 19 +- .../custom_modeling/flash_gemma_modeling.py | 6 +- .../custom_modeling/flash_llama_modeling.py | 6 +- .../custom_modeling/flash_mistral_modeling.py | 6 +- .../custom_modeling/flash_mixtral_modeling.py | 14 +- .../custom_modeling/flash_neox_modeling.py | 10 +- .../custom_modeling/flash_phi_modeling.py | 8 +- .../custom_modeling/flash_qwen2_modeling.py | 6 +- .../custom_modeling/flash_rw_modeling.py | 10 +- .../flash_santacoder_modeling.py | 10 +- .../flash_starcoder2_modeling.py | 10 +- .../models/custom_modeling/idefics2.py | 2 +- .../custom_modeling/idefics_modeling.py | 16 +- .../custom_modeling/idefics_perceiver.py | 2 +- .../models/custom_modeling/idefics_vision.py | 2 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/custom_modeling/mamba_modeling.py | 4 +- .../models/custom_modeling/mpt_modeling.py | 2 +- .../models/custom_modeling/neox_modeling.py | 2 +- .../models/custom_modeling/opt_modeling.py | 2 +- .../models/custom_modeling/phi_modeling.py | 2 +- .../models/custom_modeling/t5_modeling.py | 2 +- .../models/flash_causal_lm.py | 36 +- .../models/flash_llama.py | 4 +- .../models/flash_mistral.py | 4 +- .../models/flash_neox.py | 4 +- .../text_generation_server/models/flash_rw.py | 4 +- .../models/flash_santacoder.py | 4 +- .../utils/flash_attn.py | 197 ++- .../utils/import_utils.py | 41 +- server/text_generation_server/utils/layers.py | 1374 ----------------- .../utils/paged_attention.py | 24 +- .../text_generation_server/utils/weights.py | 10 +- 58 files changed, 2092 insertions(+), 1581 deletions(-) create mode 100644 server/text_generation_server/layers/__init__.py rename server/text_generation_server/{utils => layers}/awq/conversion_utils.py (100%) rename server/text_generation_server/{utils => layers}/awq/quantize/qmodule.py (100%) create mode 100644 server/text_generation_server/layers/bnb.py create mode 100644 server/text_generation_server/layers/conv.py create mode 100644 server/text_generation_server/layers/eetq.py create mode 100644 server/text_generation_server/layers/fp8.py create mode 100644 server/text_generation_server/layers/gptq/__init__.py rename server/text_generation_server/{utils => layers}/gptq/custom_autotune.py (100%) rename server/text_generation_server/{utils => layers}/gptq/exllama.py (100%) rename server/text_generation_server/{utils => 
layers}/gptq/exllamav2.py (100%) create mode 100644 server/text_generation_server/layers/gptq/exllamav2.py.rej create mode 100644 server/text_generation_server/layers/gptq/quant_linear.py rename server/text_generation_server/{utils => layers}/gptq/quantize.py (100%) create mode 100644 server/text_generation_server/layers/layernorm.py create mode 100644 server/text_generation_server/layers/linear.py create mode 100644 server/text_generation_server/layers/medusa.py create mode 100644 server/text_generation_server/layers/rotary.py create mode 100644 server/text_generation_server/layers/speculative.py create mode 100644 server/text_generation_server/layers/tensor_parallel.py delete mode 100644 server/text_generation_server/utils/layers.py diff --git a/server/tests/utils/test_layers.py b/server/tests/utils/test_layers.py index 93a0e982..9a8da0d6 100644 --- a/server/tests/utils/test_layers.py +++ b/server/tests/utils/test_layers.py @@ -1,5 +1,5 @@ import torch -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, ) diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py new file mode 100644 index 00000000..c3a6c921 --- /dev/null +++ b/server/text_generation_server/layers/__init__.py @@ -0,0 +1,14 @@ +from text_generation_server.layers.tensor_parallel import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, +) +from text_generation_server.layers.speculative import SpeculativeHead +from text_generation_server.layers.linear import ( + get_linear, + FastLinear, +) + +# Just to add the `load` methods. +from text_generation_server.layers.layernorm import load_layer_norm +from text_generation_server.layers.conv import load_conv2d diff --git a/server/text_generation_server/utils/awq/conversion_utils.py b/server/text_generation_server/layers/awq/conversion_utils.py similarity index 100% rename from server/text_generation_server/utils/awq/conversion_utils.py rename to server/text_generation_server/layers/awq/conversion_utils.py diff --git a/server/text_generation_server/utils/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/qmodule.py similarity index 100% rename from server/text_generation_server/utils/awq/quantize/qmodule.py rename to server/text_generation_server/layers/awq/quantize/qmodule.py diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py new file mode 100644 index 00000000..d27a33a1 --- /dev/null +++ b/server/text_generation_server/layers/bnb.py @@ -0,0 +1,106 @@ +import torch +from loguru import logger +from functools import lru_cache +import bitsandbytes as bnb +from bitsandbytes.nn import Int8Params, Params4bit + + +@lru_cache(1) +def warn_deprecate_bnb(): + logger.warning( + "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce" + ) + + +class Linear8bitLt(torch.nn.Module): + def __init__( + self, + weight, + bias, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__() + assert ( + not memory_efficient_backward + ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" + self.state = bnb.MatmulLtState() + self.index = index + + # Necessary for stacked layers + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + self.state.memory_efficient_backward = 
memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + weight.data, + has_fp16_weights=has_fp16_weights, + requires_grad=has_fp16_weights, + ) + self.weight.cuda(weight.device) + self.bias = bias + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x: torch.Tensor): + self.state.is_training = self.training + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) + + if not self.state.has_fp16_weights: + if self.state.CB is not None and self.state.CxB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + return out + + +class Linear4bit(nn.Module): + def __init__(self, weight, bias, quant_type): + super().__init__() + self.weight = Params4bit( + weight.data, + requires_grad=False, + compress_statistics=True, + quant_type=quant_type, + ) + self.compute_dtype = None + self.weight.cuda(weight.device) + self.bias = bias + + def forward(self, x: torch.Tensor): + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + if getattr(self.weight, "quant_state", None) is None: + print( + "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." 
+ ) + inp_dtype = x.dtype + if self.compute_dtype is not None: + x = x.to(self.compute_dtype) + + bias = None if self.bias is None else self.bias.to(self.compute_dtype) + out = bnb.matmul_4bit( + x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state + ) + + out = out.to(inp_dtype) + + return out diff --git a/server/text_generation_server/layers/conv.py b/server/text_generation_server/layers/conv.py new file mode 100644 index 00000000..7fb18ab3 --- /dev/null +++ b/server/text_generation_server/layers/conv.py @@ -0,0 +1,41 @@ +from accelerate import init_empty_weights +import torch + + +@classmethod +def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = torch.nn.Parameter(bias) + return conv2d + + +@classmethod +def load_conv2d_no_bias( + cls, prefix, weights, in_channels, out_channels, kernel_size, stride +): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = None + return conv2d + + +torch.nn.Conv2d.load = load_conv2d +torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py new file mode 100644 index 00000000..fd22b5c6 --- /dev/null +++ b/server/text_generation_server/layers/eetq.py @@ -0,0 +1,25 @@ +import torch +from EETQ import quant_weights, w8_a16_gemm + + +class EETQLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + device = weight.device + if weight.dtype != torch.float16: + weight = weight.to(dtype=torch.float16) + weight = torch.t(weight).contiguous().cpu() + weight, scale = quant_weights(weight, torch.int8, False) + + self.weight = weight.cuda(device) + self.scale = scale.cuda(device) + self.bias = bias.cuda(device) if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output = w8_a16_gemm(input, self.weight, self.scale) + output = output + self.bias if self.bias is not None else output + return output diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py new file mode 100644 index 00000000..dd61d081 --- /dev/null +++ b/server/text_generation_server/layers/fp8.py @@ -0,0 +1,43 @@ +import torch + + +def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): + device = weight.device + # weight, scale = quant_weights(weight, torch.int8, False) + finfo = torch.finfo(qdtype) + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + qweight = qweight.to(qdtype) + scale = scale.float().reciprocal() + return qweight, scale + + +class Fp8Linear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + 
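As a standalone illustration of the fp8_quantize helper introduced in fp8.py above (a minimal sketch, not part of the patch, assuming a PyTorch build that ships torch.float8_e4m3fn):

import torch

def fp8_roundtrip(weight: torch.Tensor) -> torch.Tensor:
    # Same idea as fp8_quantize above: symmetric per-tensor scaling by
    # fp8_max / absmax, clamp into the representable range, then keep the
    # inverse scale so the matmul (or a dequant) can undo it.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
    inv_scale = scale.float().reciprocal()
    return qweight.float() * inv_scale  # approximate reconstruction of `weight`

w = torch.randn(16, 16)
print((w - fp8_roundtrip(w)).abs().max())  # small quantization error

torch._scaled_mm then consumes the float8 tensors plus both inverse scales, which is why the layer stores self.scale alongside self.qweight.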
self.dtype = weight.dtype + self.qweight, self.scale = fp8_quantize(weight) + + self.bias = bias if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + qinput, scale = fp8_quantize(input) + output, _ = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + return output diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py new file mode 100644 index 00000000..1c46f493 --- /dev/null +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -0,0 +1,39 @@ +import os +import torch +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + +try: + major, _minor = torch.cuda.get_device_capability() +except Exception: + major = 1 + +HAS_EXLLAMA = False +CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm" +V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" +if os.getenv("DISABLE_EXLLAMA") == "True": + HAS_EXLLAMA = False +elif CAN_EXLLAMA: + try: + if V2: + from text_generation_server.layers.gptq.exllamav2 import ( + QuantLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "2" + else: + from text_generation_server.layers.gptq.exllama import ( + Ex4bitLinear as ExllamaQuantLinear, + create_exllama_buffers, + set_device, + ) + + HAS_EXLLAMA = "1" + + except ImportError: + pass + +from text_generation_server.layers.gptq.quant_linear import QuantLinear diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/layers/gptq/custom_autotune.py similarity index 100% rename from server/text_generation_server/utils/gptq/custom_autotune.py rename to server/text_generation_server/layers/gptq/custom_autotune.py diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/layers/gptq/exllama.py similarity index 100% rename from server/text_generation_server/utils/gptq/exllama.py rename to server/text_generation_server/layers/gptq/exllama.py diff --git a/server/text_generation_server/utils/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py similarity index 100% rename from server/text_generation_server/utils/gptq/exllamav2.py rename to server/text_generation_server/layers/gptq/exllamav2.py diff --git a/server/text_generation_server/layers/gptq/exllamav2.py.rej b/server/text_generation_server/layers/gptq/exllamav2.py.rej new file mode 100644 index 00000000..cde7b73a --- /dev/null +++ b/server/text_generation_server/layers/gptq/exllamav2.py.rej @@ -0,0 +1,10 @@ +diff a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py (rejected hunks) +@@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None): + none_tensor, + temp_dq, + ) ++ else: ++ RuntimeError("Cannot create handle") + + + DEVICE = None diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/quant_linear.py new file mode 100644 index 00000000..b52ceb0f --- /dev/null +++ b/server/text_generation_server/layers/gptq/quant_linear.py @@ -0,0 +1,356 @@ +import math +import numpy as np +import torch +import torch.nn as nn +from torch.cuda.amp import custom_fwd + +import triton +import triton.language as tl +from . 
import custom_autotune + + +# code based https://github.com/fpgaminer/GPTQ-triton +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=4, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. 
+ A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) & maxq # eventually avoid overflow + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) + grid = lambda META: ( + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, 
qweight, scales, qzeros, g_idx, bits, maxq): + output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = QuantLinearFunction.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/utils/gptq/quantize.py 
b/server/text_generation_server/layers/gptq/quantize.py similarity index 100% rename from server/text_generation_server/utils/gptq/quantize.py rename to server/text_generation_server/layers/gptq/quantize.py diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py new file mode 100644 index 00000000..15d24e80 --- /dev/null +++ b/server/text_generation_server/layers/layernorm.py @@ -0,0 +1,185 @@ +import torch +from torch import nn +from accelerate import init_empty_weights +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + + +# Monkey patching +@classmethod +def load_layer_norm(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = torch.nn.Parameter(bias) + return ln + + +@classmethod +def load_layer_norm_no_bias(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = None + return ln + + +torch.nn.LayerNorm.load = load_layer_norm +torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias + +if SYSTEM == "cuda": + import dropout_layer_norm + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super(FastLayerNorm, self).forward(hidden_states), residual + else: + ( + normed_hidden_states, + residual, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + self.bias, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + if residual is None: + residual = hidden_states + + return normed_hidden_states, residual + +elif SYSTEM == "rocm": + from vllm import layernorm_ops + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super().forward(hidden_states), residual + +elif SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + res_out = hidden_states + out = ipex.llm.functional.add_layer_norm( + residual, hidden_states, self.weight, self.bias, self.eps, True + ) + if residual is not None: + res_out = residual + return out, res_out + + +class FastRMSNorm(nn.Module): + def __init__(self, weight: torch.Tensor, eps: float): + super().__init__() + + self.weight = nn.Parameter(weight) + self.variance_epsilon = eps + + @classmethod + def load(cls, prefix, weights, eps=1e-6): + weight = weights.get_tensor(f"{prefix}.weight") + return cls(weight, eps) + + def forward(self, hidden_states, residual=None): + if SYSTEM == "xpu": + residual_out = hidden_states + out = ipex.llm.functional.add_rms_norm( + residual, + hidden_states, + self.weight, + None, + self.variance_epsilon, + True, + ) + if residual is not None: + residual_out = residual + return out, residual_out + elif hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt( + 
variance + self.variance_epsilon + ) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states, residual + elif SYSTEM == "cuda": + # faster post attention rms norm + ( + normed_hidden_states, + res, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + None, + None, + None, + None, + None, + 0.0, + self.variance_epsilon, + 1.0, + 0, + None, + False, + True, # Activate RMSNorm + ) + if res is None: + res = hidden_states + + return normed_hidden_states, res + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + hidden_states += residual + residual = hidden_states + + out = torch.empty_like(hidden_states) + layernorm_ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + return out, residual + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py new file mode 100644 index 00000000..d137a500 --- /dev/null +++ b/server/text_generation_server/layers/linear.py @@ -0,0 +1,153 @@ +import torch +from torch.nn import functional as F +from text_generation_server.utils.import_utils import SYSTEM + + +class FastLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight) + if bias is not None: + self.bias = torch.nn.Parameter(bias) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight, self.bias) + + +def get_linear(weight, bias, quantize): + if quantize is None: + linear = FastLinear(weight, bias) + elif quantize == "eetq": + try: + from text_generation_server.layers.eetq import EETQLinear + + linear = EETQLinear(weight, bias) + except ImportError: + raise ImportError( + "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" + ) + elif quantize == "fp8": + from text_generation_server.layers.fp8 import Fp8Linear + + linear = Fp8Linear(weight, bias) + elif quantize == "bitsandbytes": + try: + from text_generation_server.layers.bnb import ( + warn_deprecate_bnb, + Linear8bitLt, + ) + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." + ) + warn_deprecate_bnb() + linear = Linear8bitLt( + weight, + bias, + has_fp16_weights=False, + threshold=6.0, + ) + if bias is not None: + linear.bias = nn.Parameter(bias) + elif quantize == "bitsandbytes-fp4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." 
+ ) + linear = Linear4bit( + weight, + bias, + quant_type="fp4", + ) + elif quantize == "bitsandbytes-nf4": + try: + from text_generation_server.layers.bnb import Linear4bit + except ImportError: + raise NotImplementedError( + f"Bitsandbytes is missing install it with `pip install bitsandbytes`." + ) + linear = Linear4bit( + weight, + bias, + quant_type="nf4", + ) + elif quantize == "gptq": + try: + qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `gptq` compatible, loader needs to be updated." + ) + + if use_exllama: + try: + from text_generation_server.layers.gptq import ( + ExllamaQuantLinear, + ) + except ImportError: + raise NotImplementedError( + f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" + ) + + linear = ExllamaQuantLinear( + qweight, qzeros, scales, g_idx, bias, bits, groupsize + ) + else: + from text_generation_server.layers.gptq.quant_linear import QuantLinear + + linear = QuantLinear( + qweight, + qzeros, + scales, + g_idx, + bias, + bits, + groupsize, + ) + elif quantize == "awq": + try: + qweight, qzeros, scales, _, bits, groupsize, _ = weight + except Exception: + raise NotImplementedError( + f"The passed weight is not `awq` compatible, loader needs to be updated." + ) + if SYSTEM == "rocm": + raise NotImplementedError( + "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " + "to use Exllama/GPTQ kernels for AWQ inference." + ) + try: + from text_generation_server.layers.awq.quantize.qmodule import WQLinear + + linear = WQLinear( + w_bit=bits, + group_size=groupsize, + qweight=qweight, + qzeros=qzeros, + scales=scales, + bias=bias is not None, + ) + except ImportError: + raise NotImplementedError( + "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" + ) + else: + raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") + return linear diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py new file mode 100644 index 00000000..4ac86978 --- /dev/null +++ b/server/text_generation_server/layers/medusa.py @@ -0,0 +1,186 @@ +import torch +from torch import nn +from typing import Tuple, Optional +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.layers.linear import FastLinear +from text_generation_server.layers.tensor_parallel import ( + TensorParallelHead, + TensorParallelColumnLinear, +) + + +class ResBlock(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.linear = FastLinear.load( + config, prefix=f"{prefix}.linear", weights=weights, bias=True + ) + self.act = torch.nn.SiLU() + + def forward(self, x): + return x + self.act(self.linear(x)) + + +class MedusaModel(torch.nn.Module): + def __init__(self, config, medusa_config, weights): + super().__init__() + self.heads = torch.nn.ModuleList( + [ + MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) + for i in range(get_speculate()) + ] + ) + + def forward(self, x): + speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) + return speculative_logits + + +class MedusaHead(torch.nn.Module): + def __init__(self, config, medusa_config, prefix, weights): + super().__init__() 
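To see what the ResBlock and MedusaModel classes above compute, here is a self-contained toy version with made-up sizes (an illustration only; the real heads load their weights through FastLinear and TensorParallelHead, and use medusa_num_layers residual blocks per head):

import torch

class ToyResBlock(torch.nn.Module):
    # Residual SiLU block, mirroring ResBlock above.
    def __init__(self, hidden: int):
        super().__init__()
        self.linear = torch.nn.Linear(hidden, hidden)
        self.act = torch.nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))

class ToyMedusa(torch.nn.Module):
    # One extra head per speculated token; head i predicts logits for
    # position t + i + 1 from the hidden state at position t.
    def __init__(self, hidden: int, vocab: int, n_heads: int):
        super().__init__()
        self.heads = torch.nn.ModuleList(
            torch.nn.Sequential(ToyResBlock(hidden), torch.nn.Linear(hidden, vocab))
            for _ in range(n_heads)
        )

    def forward(self, x):
        # (batch, n_heads, vocab): same layout as MedusaModel's stacked output.
        return torch.stack([head(x) for head in self.heads], dim=1)

x = torch.randn(4, 64)
print(ToyMedusa(64, 32000, 3)(x).shape)  # torch.Size([4, 3, 32000])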
+ self.blocks = torch.nn.ModuleList( + [ + ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) + for i in range(medusa_config["medusa_num_layers"]) + ] + ) + n = len(self.blocks) + self.out = FastLinear.load( + config, prefix=f"{prefix}.{n}", weights=weights, bias=False + ) + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.out(x) + return x + + +class MedusaHeadV1(nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.lm_head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + import json + + use_medusa = config.use_medusa + + medusa_config = str(Path(use_medusa) / "config.json") + filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + medusa = MedusaModel(config, medusa_config, weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MedusaHeadV1(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + speculative_logits = self.medusa(input) + return logits, speculative_logits + + +class MedusaHeadV2(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + from pathlib import Path + from safetensors import safe_open + import json + + use_medusa = config.use_medusa + + medusa_config = str(Path(use_medusa) / "config.json") + filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + self.n_medusa_heads = get_speculate() + + assert medusa_config["medusa_num_layers"] == 1 + self.linear = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], + dim=0, + weights=weights, + bias=True, + ) + self.process_group = weights.process_group + self.world_size = self.process_group.size() + self.rank = self.process_group.rank() + + self.act = torch.nn.SiLU() + + self.lm_head = TensorParallelHead.load(config, prefix, weights) + + def forward(self, x): + # If we have too many tokens, we skip speculative logits + if x.shape[0] > 128: + logits = self.lm_head(x) + return logits, None + + size = x.shape[-1] + block_size = (size + self.world_size - 1) // self.world_size + start = self.rank * block_size + stop = (self.rank + 1) * block_size + + x_block = x[:, start:stop] + + # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 + medusa_res = self.act(self.linear(x)).reshape( + *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] + ) + + # Apply all residual medusa heads + output = x[:, start:stop].unsqueeze(-2) + medusa_res + + # Gather medusa heads + world_output = [ + torch.empty_like(output) for _ in 
range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + + # Stack x and medusa residual x + stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) + + # Compute lm head on x + medusa residual x + logits = self.lm_head(stacked_x) + + # Finally, split logits from speculative logits + logits, speculative_logits = torch.split( + logits, [1, self.n_medusa_heads], dim=-2 + ) + # Squeeze added dimension + logits = logits.squeeze(-2) + + return logits, speculative_logits diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py new file mode 100644 index 00000000..503dd554 --- /dev/null +++ b/server/text_generation_server/layers/rotary.py @@ -0,0 +1,419 @@ +import os +import torch +from torch import nn + +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "cuda": + from flash_attn.layers.rotary import RotaryEmbedding + import rotary_emb +elif SYSTEM == "rocm": + from vllm import pos_encoding_ops + + +def _create_inv_freq(dim, base, device): + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) + ) + return inv_freq + + +def _get_rope_config(config): + if os.getenv("ROPE_SCALING", None) is not None: + rope_scaling = { + "type": os.environ["ROPE_SCALING"], + "factor": float(os.environ["ROPE_FACTOR"]), + } + return rope_scaling + return getattr(config, "rope_scaling", None) + + +class PositionRotaryEmbedding(nn.Module): + def __init__(self, inv_freq, scaling_factor): + super().__init__() + self.inv_freq = inv_freq + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.scaling_factor = scaling_factor + self.dynamic_args = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # Such controlflows may add some overhead. + if SYSTEM == "cuda": + rotary_dim = cos.shape[-1] + q1 = query[..., :rotary_dim] + q2 = query[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + + k1 = key[..., :rotary_dim] + k2 = key[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + elif SYSTEM == "rocm": + # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. + # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 + + head_size = query.shape[-1] + + # Inplace operation, updating query and key. + pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) + elif SYSTEM == "xpu": + ipex.llm.functional.rotary_embedding( + query, key, sin, cos, query.size(-1), True + ) + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
+ ) + + @classmethod + def static(cls, config, dim, base, device): + inv_freq = _create_inv_freq(dim, base, device) + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + scaling_factor = rope_scaling["factor"] + return DynamicPositionRotaryEmbedding( + dim=dim, + max_position_embeddings=config.max_position_embeddings, + base=base, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + scaling_factor = rope_scaling["factor"] + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + elif rope_scaling["type"] == "su": + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = max_position_embeddings / original_max_position_embeddings + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(original_max_position_embeddings) + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + @classmethod + def load(cls, config, prefix, weights): + # XXX: Always load this in float32 ! 
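The "su" branch above only applies an attention scale once the configured context exceeds the original one; a quick numeric check of that formula, with illustrative values:

import math

original_max = 4096
extended_max = 131072
scale = extended_max / original_max  # 32.0
scaling_factor = math.sqrt(1 + math.log(scale) / math.log(original_max))
print(round(scaling_factor, 4))  # 1.1902, i.e. a mild boost for a 32x longer context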
+ dtype = weights.dtype + weights.dtype = torch.float32 + inv_freq = weights.get_tensor(f"{prefix}.inv_freq") + weights.dtype = dtype + + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + scaling_factor = rope_scaling["factor"] + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + return DynamicPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=config.max_position_embeddings, + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): + """ + Return cos and sin for the asked position ids + """ + if SYSTEM == "rocm": + # For RoCm, we always use float cos/sin to avoid a cast. + # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 + # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. + dtype = torch.float32 + + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + + cos = torch.index_select(self._cos_cached, 0, position_ids) + sin = torch.index_select(self._sin_cached, 0, position_ids) + + # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
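The cos/sin cache built in _update_cos_sin_cache above is just torch.outer(positions, inv_freq); a dependency-free sketch of the same split-half rotation, with illustrative sizes (the patch itself dispatches to flash-attn, vLLM or ipex kernels rather than this pure PyTorch form):

import torch

def rotate_half_pairs(x, cos, sin):
    # Split the head dimension into (x1, x2) halves and rotate each pair by the
    # per-position angle, which is what apply_rotary does in-place on CUDA.
    d = cos.shape[-1]
    x1, x2 = x[..., :d], x[..., d : 2 * d]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

dim, seqlen = 8, 4
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(seqlen).float()
freqs = torch.outer(t, inv_freq)             # (seqlen, dim // 2)
cos, sin = freqs.cos(), freqs.sin()
q = torch.randn(seqlen, dim)
print(rotate_half_pairs(q, cos, sin).shape)  # torch.Size([4, 8])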
+ return cos.unsqueeze(1), sin.unsqueeze(1) + + +class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + if seqlen > self.original_max_position_embeddings: + inv_freq = self.long_inv_freq + else: + inv_freq = self.short_inv_freq + t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + newbase = self.base * ( + (self.scaling_factor * seqlen / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + self.inv_freq = _create_inv_freq( + self.dim, newbase, self.inv_freq.device + ) + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +# Inverse dim formula to find dim based on number of rotations +import math + + +def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 +): + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = 
(torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def get_mscale(scale=1): + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + +class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings, + base, + device, + scaling_factor, + *, + extrapolation_factor, + attn_factor, + beta_fast, + beta_slow, + ): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + inv_freq_extrapolation = _create_inv_freq( + self.dim, self.base, self.inv_freq.device + ) + freqs = 1.0 / inv_freq_extrapolation + inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) + low, high = find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.base, + self.max_position_embeddings, + ) + inv_freq_mask = ( + 1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + + self.inv_freq = inv_freq + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py new file mode 100644 index 00000000..663f8c2e --- /dev/null +++ b/server/text_generation_server/layers/speculative.py @@ -0,0 +1,35 @@ +import torch +from typing import Tuple, Optional +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.tensor_parallel import TensorParallelHead + + +class SpeculativeHead(torch.nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + use_medusa = config.use_medusa + if use_medusa: + lm_head = None + try: + medusa = MedusaHeadV1.load(config, prefix, weights) + except: + medusa = MedusaHeadV2(config, prefix, weights) + else: + lm_head = TensorParallelHead.load(config, prefix, weights) + medusa = None + return SpeculativeHead(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.medusa is not 
None: + return self.medusa(input) + + assert self.head is not None + logits = self.head(input) + return logits, None diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py new file mode 100644 index 00000000..34b9c51e --- /dev/null +++ b/server/text_generation_server/layers/tensor_parallel.py @@ -0,0 +1,188 @@ +import torch +from torch.nn import functional as F +from typing import List +from text_generation_server.layers.linear import get_linear, FastLinear + + +class SuperLayer(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear.forward(x) + + +class TensorParallelHead(SuperLayer): + def __init__(self, linear, process_group, should_gather: bool): + super().__init__(linear) + self.process_group = process_group + self.should_gather = should_gather + + @staticmethod + def load(config, prefix: str, weights): + if weights.process_group.size() > 1: + try: + weight = weights.get_sharded(f"{prefix}.weight", dim=0) + should_gather = True + except AssertionError: + # If the vocab size is not divisible by number of shards + # just load the entire thing. + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + else: + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + + # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) + if config.quantize in ["gptq", "awq", "eetq"]: + quantize = None + else: + quantize = config.quantize + return TensorParallelHead( + get_linear(weight, bias=None, quantize=quantize), + process_group=weights.process_group, + should_gather=should_gather, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if not self.should_gather: + return super().forward(input) + + world_size = self.process_group.size() + if len(input.shape) == 2 and isinstance(self.linear, FastLinear): + out_dim = self.linear.weight.shape[0] + + if input.shape[0] == 1: + world_out = input.new_empty(1, out_dim * world_size) + local_out = input.new_empty(1, out_dim) + gather_input = local_out + else: + world_out = input.new_empty(out_dim * world_size, input.shape[0]) + gather_input = input.new_empty(out_dim, input.shape[0]) + local_out = gather_input.T + + torch.mm(input, self.linear.weight.T, out=local_out) + + torch.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) + + if input.shape[0] == 1: + return world_out + return world_out.T + + output = super().forward(input) + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + return world_output + + +class TensorParallelColumnLinear(SuperLayer): + @classmethod + def load_gate_up(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_gate_up( + prefix, quantize=config.quantize + ) + if bias: + raise NotImplementedError("packed_gate_up only implemented without bias") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load_qkv(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) + if bias: + raise NotImplementedError("packed_qkv only implemented for 
baichuan") + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + return cls.load_multi(config, [prefix], weights, bias, dim=0) + + @classmethod + def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): + weight = weights.get_multi_weights_col( + prefixes, quantize=config.quantize, dim=dim + ) + + if bias: + b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] + bias = torch.cat(b, dim=dim) + else: + bias = None + linear = get_linear(weight, bias, config.quantize) + return cls(linear) + + +class TensorParallelRowLinear(SuperLayer): + def __init__(self, linear, process_group): + super().__init__(linear) + self.process_group = process_group + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls( + get_linear(weight, bias, config.quantize), + process_group=weights.process_group, + ) + + def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: + out = super().forward(input) + if self.process_group.size() > 1 and reduce: + torch.distributed.all_reduce(out, group=self.process_group) + return out + + +class TensorParallelEmbedding(torch.nn.Module): + def __init__(self, prefix: str, weights, reduce=True): + super().__init__() + weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) + num_embeddings = weights.get_shape(f"{prefix}.weight")[0] + + process_group = weights.process_group + + world_size = process_group.size() + rank = process_group.rank() + + block_size = (num_embeddings + world_size - 1) // world_size + self.min_id = rank * block_size + self.max_id = min(num_embeddings, (rank + 1) * block_size) + self.null_idx = weight.shape[ + 0 + ] # Usually block_size, might be less in non even vocab_size. 
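A single-process sketch of the null-index masking used by TensorParallelEmbedding here, with illustrative shapes (the real layer shards the weight across ranks and all-reduces the partial embeddings):

import torch
import torch.nn.functional as F

vocab, hidden, world_size, rank = 10, 4, 2, 0
block = (vocab + world_size - 1) // world_size           # 5 rows per rank
min_id, max_id = rank * block, min(vocab, (rank + 1) * block)

shard = torch.randn(max_id - min_id, hidden)
null_idx = shard.shape[0]                                # index of the extra all-zero row
padded = F.pad(shard, (0, 0, 0, 1))                      # (block + 1, hidden)

ids = torch.tensor([0, 3, 7, 9])
local = torch.where((ids < min_id) | (ids >= max_id), null_idx, ids - min_id)
out = F.embedding(local, padded)                         # rows for ids 7 and 9 come out as zeros
print(out[2].abs().sum(), out[3].abs().sum())            # tensor(0.) tensor(0.)
# With torch.distributed, an all_reduce over every rank's `out` fills in the
# rows owned by the other shards, which is what the forward below does.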
+ self.process_group = weights.process_group + self.reduce = reduce + + """Additional 0 entry used for masking""" + self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1))) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # default all out of bounds values to `self.null_idx` that will then be mapped to 0 + # translate for [0, self.max_id - self.min_id[ + input = torch.where( + (self.min_id > input) | (input >= self.max_id), + self.null_idx, + input - self.min_id, + ) + out = torch.nn.functional.embedding(input, self.weight) + if self.reduce and self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + return out diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py index 4c65e2dd..c7705fe8 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -2,7 +2,7 @@ import math import torch from typing import Optional, List, Tuple -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM BLOCK_SIZE: int = 16 # Will be set in warmup @@ -25,7 +25,7 @@ class CacheManager: self.repeat_slots = repeat_slots element_size = torch.tensor([], dtype=dtype).element_size() - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": x = 1 else: x = self.block_size // element_size diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py index c8f02bca..0d8a1b59 100644 --- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py +++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py @@ -32,7 +32,7 @@ from transformers.modeling_outputs import ( ) from transformers import BloomConfig, PreTrainedModel -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/clip.py b/server/text_generation_server/models/custom_modeling/clip.py index c4917733..56618bf1 100644 --- a/server/text_generation_server/models/custom_modeling/clip.py +++ b/server/text_generation_server/models/custom_modeling/clip.py @@ -15,7 +15,7 @@ from transformers.modeling_outputs import ( ) from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 56d9a966..8c423eaf 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -26,18 +26,22 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM -from text_generation_server.utils.layers import ( +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - 
PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm else: dropout_layer_norm = None @@ -52,7 +56,7 @@ class CohereRotary(PositionRotaryEmbedding): sin: torch.Tensor, ): # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": import rotary_emb q1 = query[..., ::2] @@ -64,7 +68,7 @@ class CohereRotary(PositionRotaryEmbedding): k2 = key[..., 1::2] rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import pos_encoding_ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. @@ -90,7 +94,7 @@ class CohereLayerNorm(nn.Module): self.eps = eps def forward(self, hidden_states): - if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: + if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm": hidden_states = hidden_states.reshape( -1, self.weight.shape[0], self.weight.shape[1] ) diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index d0978bef..9d652b67 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -21,21 +21,26 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any from loguru import logger -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe + from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastLayerNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) from text_generation_server.utils.log import log_once @@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights): bits, groupsize, desc_act, quant_method = weights._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act @@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights): log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
         )
-        from text_generation_server.utils.awq.conversion_utils import (
+        from text_generation_server.layers.awq.conversion_utils import (
+            fast_awq_to_gptq,
         )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
index bd7596db..43b90bdd 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index 6fa85d4e..a7969494 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
index c2445cda..3e13c26d 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
     SpeculativeHead,
     get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 3f6c8e03..be2d6c45 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,9 +24,9 @@ import torch.distributed
 import numpy as np
 
 from torch import nn
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import
SYSTEM -if not IS_XPU_SYSTEM: +if SYSTEM != "xpu": from vllm.model_executor.layers.fused_moe import fused_moe from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig @@ -34,16 +34,20 @@ from typing import Optional, List, Tuple from loguru import logger from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, - FastRMSNorm, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class MixtralConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index ee062d3d..d45cab2e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -29,15 +29,19 @@ from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) def load_row(config, prefix: str, weights, bias: bool): diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index cfe447a7..f2efb538 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -7,15 +7,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) class PhiConfig(PretrainedConfig): diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 94023b33..3a6d2db5 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -6,13 +6,15 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, 
SpeculativeHead, get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( FastRMSNorm, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index a9127d1f..52ea3ae1 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -8,15 +8,19 @@ from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.utils.flash_attn import attention -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - FastLayerNorm, - PositionRotaryEmbedding, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) def load_row(config, prefix: str, weights, bias: bool): diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index bbb603a7..d2f6d9af 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -6,14 +6,16 @@ from transformers.activations import ACT2FN from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, SpeculativeHead, TensorParallelEmbedding, - FastLayerNorm, get_linear, ) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) def load_multi_mqa( @@ -80,13 +82,13 @@ def _load_multi_mqa_gptq( g_idx = g_idx.to(device=weights.device) elif quant_method == "awq": g_idx = None - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = HAS_EXLLAMA weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index ed77af78..3e2ce4f9 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -27,15 +27,19 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, - PositionRotaryEmbedding, SpeculativeHead, get_linear, - FastRMSNorm, +) +from text_generation_server.layers.layernorm import ( FastLayerNorm, + FastRMSNorm, +) +from 
text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index cb2ee7db..935f049b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -29,7 +29,7 @@ from text_generation_server.models.custom_modeling.vlm import ( ) from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index ee4cdb08..ec3f900b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -47,20 +47,22 @@ from text_generation_server.models.custom_modeling.idefics_vision import ( from text_generation_server.models.custom_modeling.idefics_perceiver import ( IdeficsPerceiverResampler, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, SpeculativeHead, - PositionRotaryEmbedding, FastLinear, ) -from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils.import_utils import SYSTEM -if IS_CUDA_SYSTEM: +if SYSTEM == "cuda": import dropout_layer_norm -elif IS_ROCM_SYSTEM: +elif SYSTEM == "rocm": from vllm import layernorm_ops +else: + raise RuntimeError(f"Unsupported system {SYSTEM}") @dataclass @@ -373,7 +375,7 @@ class IdeficsRMSNorm(nn.Module): hidden_states = hidden_states.to(self.weight.dtype) return self.weight * hidden_states - elif IS_CUDA_SYSTEM: + elif SYSTEM == "cuda": # faster post attention rms norm unwrap = False if len(hidden_states.shape) > 2: @@ -405,7 +407,7 @@ class IdeficsRMSNorm(nn.Module): normed_hidden_states = normed_hidden_states.view(*shape) return normed_hidden_states - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. 
if residual is not None: hidden_states += residual diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py index 477d4d70..af44490b 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py +++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py @@ -41,7 +41,7 @@ from typing import Optional, Tuple import torch import torch.nn as nn -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, ) diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py index c521dd0a..30c5997f 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_vision.py +++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -28,7 +28,7 @@ from transformers.utils import ( ModelOutput, logging, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 0d93791f..a049f756 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -27,7 +27,7 @@ from text_generation_server.models.custom_modeling.vlm import ( load_text_model, load_vision_model, ) -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelRowLinear, ) diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py index c58a617f..293051c2 100644 --- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -8,12 +8,12 @@ from typing import Optional, Tuple, Any from transformers.configuration_utils import PretrainedConfig import torch.nn.functional as F -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( SpeculativeHead, TensorParallelEmbedding, - FastRMSNorm, FastLinear, ) +from text_generation_server.layers.layernorm import FastRMSNorm from einops import rearrange from causal_conv1d import causal_conv1d_fn, causal_conv1d_update diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py index 9b0f8b92..f7981bf5 100644 --- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py @@ -17,7 +17,7 @@ from transformers.modeling_outputs import ( ) from einops import rearrange from packaging import version -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelColumnLinear, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py index f060ec0e..fcad32fa 100644 --- a/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ 
b/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -40,7 +40,7 @@ from transformers.modeling_outputs import ( from transformers.modeling_utils import PreTrainedModel from transformers import GPTNeoXConfig from loguru import logger -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py index 7a5cf917..83d62dea 100644 --- a/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -27,7 +27,7 @@ from transformers.modeling_outputs import ( ) from transformers.modeling_utils import PreTrainedModel from transformers import OPTConfig -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( FastLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/phi_modeling.py b/server/text_generation_server/models/custom_modeling/phi_modeling.py index 1571f9fd..04b470eb 100644 --- a/server/text_generation_server/models/custom_modeling/phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/phi_modeling.py @@ -9,7 +9,7 @@ from typing import Optional, List, Tuple, Any from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py index 2773fb15..0b899fba 100644 --- a/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -38,7 +38,7 @@ from transformers.utils import ( is_torch_fx_proxy, ) from transformers import T5Config -from text_generation_server.utils.layers import ( +from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, TensorParallelRowLinear, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index a6d0204f..f567bea9 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -12,7 +12,6 @@ from dataclasses import dataclass from opentelemetry import trace from transformers import PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type, Dict - from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.speculate import get_speculate @@ -32,13 +31,14 @@ from text_generation_server.models.globals import MEM_POOL, CUDA_GRAPHS from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION -tracer = trace.get_tracer(__name__) from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, + empty_cache, + synchronize, + get_free_memory, ) +tracer = trace.get_tracer(__name__) + @dataclass class 
FlashCausalLMBatch(Batch): @@ -757,10 +757,8 @@ class FlashCausalLM(Model): def warmup(self, batch: FlashCausalLMBatch): # The warmup batch is the biggest batch we could ever receive - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.empty_cache() - elif IS_XPU_SYSTEM: - torch.xpu.empty_cache() + empty_cache() + try: cache_manager = set_cache_manager( batch.blocks, @@ -780,10 +778,7 @@ class FlashCausalLM(Model): f"You need to decrease `--max-batch-prefill-tokens`" ) from e - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - torch.cuda.synchronize(self.device) - elif IS_XPU_SYSTEM: - torch.xpu.synchronize(self.device) + synchronize(self.device) # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) # Calculate the number of blocks that can be allocated with the free memory @@ -791,20 +786,7 @@ class FlashCausalLM(Model): cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - total_free_memory, _ = torch.cuda.mem_get_info(self.device) - total_gpu_memory = torch.cuda.get_device_properties( - self.device - ).total_memory - - free_memory = max( - 0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory - ) - elif IS_XPU_SYSTEM: - total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory - free_memory = int(total_gpu_memory * 0.5) - else: - raise NotImplementedError("FlashModel is only available on GPU") + free_memory = get_free_memory(self.device, MEMORY_FRACTION) num_blocks = ( # Leave 5% for some wiggle room diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 609a188d..8ea70713 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -18,7 +18,7 @@ from text_generation_server.utils import ( tracer = trace.get_tracer(__name__) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM class FlashLlama(FlashCausalLM): @@ -35,7 +35,7 @@ class FlashLlama(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 85e93543..48304ad8 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -33,7 +33,7 @@ tracer = trace.get_tracer(__name__) # Will be set in init SLIDING_WINDOW: Optional[int] = None SLIDING_WINDOW_BLOCKS: Optional[int] = None -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None @@ -322,7 +322,7 @@ class BaseFlashMistral(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index f82e27db..1119bdae 100644 --- 
a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -14,7 +14,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -33,7 +33,7 @@ class FlashNeoXSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index ccf38a0c..33298e1a 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -15,7 +15,7 @@ from text_generation_server.utils import ( weight_files, Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -34,7 +34,7 @@ class FlashRWSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index e66f1bf8..66698a3a 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -18,7 +18,7 @@ from text_generation_server.utils import ( Weights, ) -from text_generation_server.utils.import_utils import IS_XPU_SYSTEM +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -37,7 +37,7 @@ class FlashSantacoderSharded(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = torch.float16 if dtype is None else dtype - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": device = torch.device(f"xpu:{rank}") dtype = torch.float16 if dtype is None else dtype else: diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 583a8f91..0830656d 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -2,13 +2,8 @@ import os import torch from loguru import logger -import math -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false": raise ImportError("`USE_FLASH_ATTENTION` is false.") @@ -16,83 +11,22 @@ HAS_FLASH_ATTN = True HAS_FLASH_ATTN_V2_CUDA = False HAS_FLASH_ATTN_V2_ROCM = False -if IS_XPU_SYSTEM: +if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex -if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: - if not torch.cuda.is_available(): - raise ImportError("CUDA is not available") + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") - major, minor = torch.cuda.get_device_capability() - 
is_sm75 = major == 7 and minor == 5 - is_sm8x = major == 8 and minor >= 0 - is_sm90 = major == 9 and minor == 0 - - HAS_FLASH_ATTN = False - HAS_FLASH_ATTN_V2_CUDA = False - HAS_FLASH_ATTN_V2_ROCM = False - try: - try: - import flash_attn_2_cuda - except ImportError: - architecture_suffix = "" - if IS_CUDA_SYSTEM: - architecture_suffix = "-cuda" - elif IS_ROCM_SYSTEM: - architecture_suffix = "-rocm" - raise ImportError( - "Flash Attention V2 is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" - ) - if not (is_sm8x or is_sm90): - raise ImportError( - f"GPU with CUDA capability {major} {minor} is not supported for " - "Flash Attention V2" - ) - HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM - HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM - except ImportError as e: - try: - import flash_attn_cuda - except ImportError: - raise ImportError( - "Flash Attention is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - "or install flash attention with `cd server && make install install-flash-attention`" - ) from e - - if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90): - raise ImportError( - f"GPU with CUDA capability {major} {minor} is not supported" - ) from e - elif IS_ROCM_SYSTEM: - for idx in range(torch.cuda.device_count()): - if "MI210" not in torch.cuda.get_device_name( - idx - ) and "MI250" not in torch.cuda.get_device_name(idx): - raise ImportError( - f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" - ) - - logger.warning(f"Unable to use Flash Attention V2: {e}") - HAS_FLASH_ATTN = True - - -def attention( - q, - k, - v, - out, - cu_seqlens, - max_s, - softmax_scale, - window_size_left=-1, -): - if window_size_left <= 0 and window_size_left != -1: - raise ValueError("`window_size_left` must be > 0 or -1") - - if IS_XPU_SYSTEM: if window_size_left != -1: raise ValueError( f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." 
@@ -114,7 +48,77 @@ def attention( None, ) - if HAS_FLASH_ATTN_V2_CUDA: + +if SYSTEM in {"cuda", "rocm"}: + if not torch.cuda.is_available(): + raise ImportError("CUDA is not available") + + major, minor = torch.cuda.get_device_capability() + is_sm75 = major == 7 and minor == 5 + is_sm8x = major == 8 and minor >= 0 + is_sm90 = major == 9 and minor == 0 + + HAS_FLASH_ATTN = False + HAS_FLASH_ATTN_V2_CUDA = False + HAS_FLASH_ATTN_V2_ROCM = False + try: + try: + import flash_attn_2_cuda + except ImportError: + architecture_suffix = f"-{SYSTEM}" + raise ImportError( + "Flash Attention V2 is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" + ) + if not (is_sm8x or is_sm90): + raise ImportError( + f"GPU with CUDA capability {major} {minor} is not supported for " + "Flash Attention V2" + ) + HAS_FLASH_ATTN_V2_CUDA = SYSTEM == "cuda" + HAS_FLASH_ATTN_V2_ROCM = SYSTEM == "rocm" + except ImportError as e: + try: + import flash_attn_cuda + except ImportError: + raise ImportError( + "Flash Attention is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + "or install flash attention with `cd server && make install install-flash-attention`" + ) from e + + if SYSTEM == "cuda" and not (is_sm75 or is_sm8x or is_sm90): + raise ImportError( + f"GPU with CUDA capability {major} {minor} is not supported" + ) from e + elif SYSTEM == "rocm": + for idx in range(torch.cuda.device_count()): + if "MI210" not in torch.cuda.get_device_name( + idx + ) and "MI250" not in torch.cuda.get_device_name(idx): + raise ImportError( + f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" + ) + + logger.warning(f"Unable to use Flash Attention V2: {e}") + HAS_FLASH_ATTN = True + + +if HAS_FLASH_ATTN_V2_CUDA: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") return flash_attn_2_cuda.varlen_fwd( q, k, @@ -136,7 +140,21 @@ def attention( False, None, ) - elif HAS_FLASH_ATTN_V2_ROCM: + +elif HAS_FLASH_ATTN_V2_ROCM: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") if window_size_left != -1: raise ValueError( f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." 
@@ -159,7 +177,19 @@ def attention( False, None, ) - elif HAS_FLASH_ATTN: + +elif HAS_FLASH_ATTN: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): if window_size_left != -1: raise NotImplementedError( "window_size_left is only available with flash attn v2" @@ -209,4 +239,5 @@ def attention( None, ) +else: raise NotImplementedError("flash attention is not installed") diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py index db205f4d..f54987eb 100644 --- a/server/text_generation_server/utils/import_utils.py +++ b/server/text_generation_server/utils/import_utils.py @@ -10,6 +10,41 @@ def is_xpu_available(): return hasattr(torch, "xpu") and torch.xpu.is_available() -IS_ROCM_SYSTEM = torch.version.hip is not None -IS_CUDA_SYSTEM = torch.version.cuda is not None -IS_XPU_SYSTEM = is_xpu_available() +def get_cuda_free_memory(device, memory_fraction): + total_free_memory, _ = torch.cuda.mem_get_info(device) + total_gpu_memory = torch.cuda.get_device_properties(device).total_memory + free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory) + return free_memory + + +def get_xpu_free_memory(device): + total_gpu_memory = torch.xpu.get_device_properties(device).total_memory + free_memory = int(total_gpu_memory * 0.5) + return free_memory + + +SYSTEM = None +if torch.version.hip is not None: + SYSTEM = "rocm" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif torch.version.cuda is not None and torch.cuda.is_available(): + SYSTEM = "cuda" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif is_xpu_available(): + SYSTEM = "xpu" + empty_cache = torch.xpu.empty_cache + synchronize = torch.xpu.synchronize + get_free_memory = get_xpu_free_memory +else: + SYSTEM = "cpu" + + def noop(*args, **kwargs): + pass + + empty_cache = noop + synchronize = noop + get_free_memory = noop diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py deleted file mode 100644 index 7d339fe5..00000000 --- a/server/text_generation_server/utils/layers.py +++ /dev/null @@ -1,1374 +0,0 @@ -import os -import torch -import torch.distributed - -from torch import nn -from torch.nn import functional as F -from typing import List, Tuple, Optional -from loguru import logger -from functools import lru_cache - -from text_generation_server.utils.speculate import get_speculate - -HAS_BITS_AND_BYTES = True -try: - import bitsandbytes as bnb - from bitsandbytes.nn import Int8Params, Params4bit -except ImportError: - HAS_BITS_AND_BYTES = False - -from accelerate import init_empty_weights - -from text_generation_server.utils.gptq.quant_linear import QuantLinear -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) - -if IS_XPU_SYSTEM: - import intel_extension_for_pytorch as ipex - -HAS_AWQ = True -try: - from text_generation_server.utils.awq.quantize.qmodule import WQLinear -except ImportError: - HAS_AWQ = False - -try: - major, _minor = torch.cuda.get_device_capability() -except Exception: - major = 1 - -HAS_EXLLAMA = False -CAN_EXLLAMA = major >= 8 or IS_ROCM_SYSTEM -V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" - -if os.getenv("DISABLE_EXLLAMA") == "True": - HAS_EXLLAMA = False -elif CAN_EXLLAMA: - try: - if V2: - from 
text_generation_server.utils.gptq.exllamav2 import ( - QuantLinear as ExllamaQuantLinear, - create_exllama_buffers, - set_device, - ) - - HAS_EXLLAMA = "2" - else: - from text_generation_server.utils.gptq.exllama import ( - Ex4bitLinear as ExllamaQuantLinear, - create_exllama_buffers, - set_device, - ) - - HAS_EXLLAMA = "1" - - except ImportError: - pass - -HAS_EETQ = False -try: - from EETQ import quant_weights, w8_a16_gemm - - HAS_EETQ = True -except ImportError: - pass - - -# Monkey patching -@classmethod -def load_layer_norm(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = nn.Parameter(bias) - return ln - - -@classmethod -def load_layer_norm_no_bias(cls, prefix, weights, eps): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - ln = cls(weight.shape, eps=eps) - - ln.weight = nn.Parameter(weight) - ln.bias = None - return ln - - -@classmethod -def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): - weight = weights.get_tensor(f"{prefix}.weight") - bias = weights.get_tensor(f"{prefix}.bias") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = nn.Parameter(bias) - return conv2d - - -@classmethod -def load_conv2d_no_bias( - cls, prefix, weights, in_channels, out_channels, kernel_size, stride -): - weight = weights.get_tensor(f"{prefix}.weight") - with init_empty_weights(): - conv2d = cls( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - ) - - conv2d.weight = nn.Parameter(weight) - conv2d.bias = None - return conv2d - - -torch.nn.Conv2d.load = load_conv2d -torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias -torch.nn.LayerNorm.load = load_layer_norm -torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias - - -class FastLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.weight = nn.Parameter(weight) - if bias is not None: - self.bias = nn.Parameter(bias) - else: - self.bias = None - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_tensor(f"{prefix}.weight") - if bias: - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls(weight, bias) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return F.linear(input, self.weight, self.bias) - - -class EETQLinear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - device = weight.device - if weight.dtype != torch.float16: - weight = weight.to(dtype=torch.float16) - weight = torch.t(weight).contiguous().cpu() - weight, scale = quant_weights(weight, torch.int8, False) - - self.weight = weight.cuda(device) - self.scale = scale.cuda(device) - self.bias = bias.cuda(device) if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - output = w8_a16_gemm(input, self.weight, self.scale) - output = output + self.bias if self.bias is not None else output - return output - - -def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): - device = weight.device - # weight, scale = quant_weights(weight, torch.int8, False) - finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = 
finfo.max / weight.abs().max().clamp(min=1e-12) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() - return qweight, scale - - -class Fp8Linear(nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.dtype = weight.dtype - self.qweight, self.scale = fp8_quantize(weight) - - self.bias = bias if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - qinput, scale = fp8_quantize(input) - output, _ = torch._scaled_mm( - qinput, - self.qweight.t(), - out_dtype=self.dtype, - scale_a=scale, - scale_b=self.scale, - bias=self.bias, - ) - return output - - -class Linear8bitLt(nn.Module): - def __init__( - self, - weight, - bias, - has_fp16_weights=True, - memory_efficient_backward=False, - threshold=0.0, - index=None, - ): - super().__init__() - assert ( - not memory_efficient_backward - ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" - self.state = bnb.MatmulLtState() - self.index = index - - # Necessary for stacked layers - self.state.threshold = threshold - self.state.has_fp16_weights = has_fp16_weights - self.state.memory_efficient_backward = memory_efficient_backward - if threshold > 0.0 and not has_fp16_weights: - self.state.use_pool = True - - self.weight = Int8Params( - weight.data, - has_fp16_weights=has_fp16_weights, - requires_grad=has_fp16_weights, - ) - self.weight.cuda(weight.device) - self.bias = bias - - def init_8bit_state(self): - self.state.CB = self.weight.CB - self.state.SCB = self.weight.SCB - self.weight.CB = None - self.weight.SCB = None - - def forward(self, x: torch.Tensor): - self.state.is_training = self.training - if self.weight.CB is not None: - self.init_8bit_state() - - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - self.bias.data = self.bias.data.to(x.dtype) - - out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) - - if not self.state.has_fp16_weights: - if self.state.CB is not None and self.state.CxB is not None: - # we converted 8-bit row major to turing/ampere format in the first inference pass - # we no longer need the row-major weight - del self.state.CB - self.weight.data = self.state.CxB - return out - - -class Linear4bit(nn.Module): - def __init__(self, weight, bias, quant_type): - super().__init__() - self.weight = Params4bit( - weight.data, - requires_grad=False, - compress_statistics=True, - quant_type=quant_type, - ) - self.compute_dtype = None - self.weight.cuda(weight.device) - self.bias = bias - - def forward(self, x: torch.Tensor): - # weights are cast automatically as Int8Params, but the bias has to be cast manually - if self.bias is not None and self.bias.dtype != x.dtype: - self.bias.data = self.bias.data.to(x.dtype) - - if getattr(self.weight, "quant_state", None) is None: - print( - "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." 
- ) - inp_dtype = x.dtype - if self.compute_dtype is not None: - x = x.to(self.compute_dtype) - - bias = None if self.bias is None else self.bias.to(self.compute_dtype) - out = bnb.matmul_4bit( - x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state - ) - - out = out.to(inp_dtype) - - return out - - -@lru_cache(1) -def warn_deprecate_bnb(): - logger.warning( - "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce" - ) - - -def get_linear(weight, bias, quantize): - if quantize is None: - linear = FastLinear(weight, bias) - elif quantize == "eetq": - if HAS_EETQ: - linear = EETQLinear(weight, bias) - else: - raise ImportError( - "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" - ) - elif quantize == "fp8": - linear = Fp8Linear(weight, bias) - elif quantize == "bitsandbytes": - warn_deprecate_bnb() - linear = Linear8bitLt( - weight, - bias, - has_fp16_weights=False, - threshold=6.0, - ) - if bias is not None: - linear.bias = nn.Parameter(bias) - elif quantize == "bitsandbytes-fp4": - linear = Linear4bit( - weight, - bias, - quant_type="fp4", - ) - elif quantize == "bitsandbytes-nf4": - linear = Linear4bit( - weight, - bias, - quant_type="nf4", - ) - elif quantize == "gptq": - try: - qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `gptq` compatible, loader needs to be updated." - ) - - if use_exllama: - linear = ExllamaQuantLinear( - qweight, qzeros, scales, g_idx, bias, bits, groupsize - ) - else: - linear = QuantLinear( - qweight, - qzeros, - scales, - g_idx, - bias, - bits, - groupsize, - ) - elif quantize == "awq": - try: - qweight, qzeros, scales, _, bits, groupsize, _ = weight - except Exception: - raise NotImplementedError( - f"The passed weight is not `awq` compatible, loader needs to be updated." - ) - if IS_ROCM_SYSTEM: - raise NotImplementedError( - "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " - "to use Exllama/GPTQ kernels for AWQ inference." 
- ) - if not HAS_AWQ: - raise NotImplementedError( - "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" - ) - linear = WQLinear( - w_bit=bits, - group_size=groupsize, - qweight=qweight, - qzeros=qzeros, - scales=scales, - bias=bias is not None, - ) - else: - raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") - return linear - - -class SuperLayer(nn.Module): - def __init__(self, linear): - super().__init__() - self.linear = linear - - def forward(self, x): - return self.linear.forward(x) - - -class ResBlock(torch.nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.linear = FastLinear.load( - config, prefix=f"{prefix}.linear", weights=weights, bias=True - ) - self.act = torch.nn.SiLU() - - def forward(self, x): - return x + self.act(self.linear(x)) - - -class MedusaModel(torch.nn.Module): - def __init__(self, config, medusa_config, weights): - super().__init__() - self.heads = torch.nn.ModuleList( - [ - MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) - for i in range(get_speculate()) - ] - ) - - def forward(self, x): - speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) - return speculative_logits - - -class MedusaHead(torch.nn.Module): - def __init__(self, config, medusa_config, prefix, weights): - super().__init__() - self.blocks = torch.nn.ModuleList( - [ - ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) - for i in range(medusa_config["medusa_num_layers"]) - ] - ) - n = len(self.blocks) - self.out = FastLinear.load( - config, prefix=f"{prefix}.{n}", weights=weights, bias=False - ) - - def forward(self, x): - for block in self.blocks: - x = block(x) - x = self.out(x) - return x - - -class MedusaHeadV1(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.lm_head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - medusa = MedusaModel(config, medusa_config, weights) - lm_head = TensorParallelHead.load(config, prefix, weights) - return MedusaHeadV1(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - logits = self.lm_head(input) - # If we have too many tokens, we skip speculative logits - if input.shape[0] > 128: - return logits, None - - speculative_logits = self.medusa(input) - return logits, speculative_logits - - -class MedusaHeadV2(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - from pathlib import Path - from safetensors import safe_open - import json - - use_medusa = config.use_medusa - - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") - - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with 
safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - - self.n_medusa_heads = get_speculate() - - assert medusa_config["medusa_num_layers"] == 1 - self.linear = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], - dim=0, - weights=weights, - bias=True, - ) - self.process_group = weights.process_group - self.world_size = self.process_group.size() - self.rank = self.process_group.rank() - - self.act = torch.nn.SiLU() - - self.lm_head = TensorParallelHead.load(config, prefix, weights) - - def forward(self, x): - # If we have too many tokens, we skip speculative logits - if x.shape[0] > 128: - logits = self.lm_head(x) - return logits, None - - size = x.shape[-1] - block_size = (size + self.world_size - 1) // self.world_size - start = self.rank * block_size - stop = (self.rank + 1) * block_size - - x_block = x[:, start:stop] - - # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 - medusa_res = self.act(self.linear(x)).reshape( - *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] - ) - - # Apply all residual medusa heads - output = x[:, start:stop].unsqueeze(-2) + medusa_res - - # Gather medusa heads - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - - # Stack x and medusa residual x - stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) - - # Compute lm head on x + medusa residual x - logits = self.lm_head(stacked_x) - - # Finally, split logits from speculative logits - logits, speculative_logits = torch.split( - logits, [1, self.n_medusa_heads], dim=-2 - ) - # Squeeze added dimension - logits = logits.squeeze(-2) - - return logits, speculative_logits - - -class SpeculativeHead(nn.Module): - def __init__(self, lm_head, medusa): - super().__init__() - self.head = lm_head - self.medusa = medusa - - @staticmethod - def load(config, prefix: str, weights): - use_medusa = config.use_medusa - if use_medusa: - lm_head = None - try: - medusa = MedusaHeadV1.load(config, prefix, weights) - except: - medusa = MedusaHeadV2(config, prefix, weights) - else: - lm_head = TensorParallelHead.load(config, prefix, weights) - medusa = None - return SpeculativeHead(lm_head, medusa) - - def forward( - self, input: torch.Tensor - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.medusa is not None: - return self.medusa(input) - - assert self.head is not None - logits = self.head(input) - return logits, None - - -class TensorParallelHead(SuperLayer): - def __init__(self, linear, process_group, should_gather: bool): - super().__init__(linear) - self.process_group = process_group - self.should_gather = should_gather - - @staticmethod - def load(config, prefix: str, weights): - if weights.process_group.size() > 1: - try: - weight = weights.get_sharded(f"{prefix}.weight", dim=0) - should_gather = True - except AssertionError: - # If the vocab size is not divisible by number of shards - # just load the entire thing. 
- weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - else: - weight = weights.get_tensor(f"{prefix}.weight") - should_gather = False - - # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings) - if config.quantize in ["gptq", "awq", "eetq"]: - quantize = None - else: - quantize = config.quantize - return TensorParallelHead( - get_linear(weight, bias=None, quantize=quantize), - process_group=weights.process_group, - should_gather=should_gather, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if not self.should_gather: - return super().forward(input) - - world_size = self.process_group.size() - if len(input.shape) == 2 and isinstance(self.linear, FastLinear): - out_dim = self.linear.weight.shape[0] - - if input.shape[0] == 1: - world_out = input.new_empty(1, out_dim * world_size) - local_out = input.new_empty(1, out_dim) - gather_input = local_out - else: - world_out = input.new_empty(out_dim * world_size, input.shape[0]) - gather_input = input.new_empty(out_dim, input.shape[0]) - local_out = gather_input.T - - torch.mm(input, self.linear.weight.T, out=local_out) - - torch.distributed.all_gather_into_tensor( - world_out, gather_input, group=self.process_group - ) - - if input.shape[0] == 1: - return world_out - return world_out.T - - output = super().forward(input) - world_output = [ - torch.empty_like(output) for _ in range(self.process_group.size()) - ] - torch.distributed.all_gather(world_output, output, group=self.process_group) - world_output = torch.cat(world_output, dim=-1) - return world_output - - -class TensorParallelColumnLinear(SuperLayer): - @classmethod - def load_gate_up(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_gate_up( - prefix, quantize=config.quantize - ) - if bias: - raise NotImplementedError("packed_gate_up only implemented without bias") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load_qkv(cls, config, prefix: str, weights, bias: bool): - """Specific method when the QKV was joined after the fact""" - weight = weights.get_weights_col_packed_qkv(prefix, quantize=config.quantize) - if bias: - raise NotImplementedError("packed_qkv only implemented for baichuan") - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - return cls.load_multi(config, [prefix], weights, bias, dim=0) - - @classmethod - def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): - weight = weights.get_multi_weights_col( - prefixes, quantize=config.quantize, dim=dim - ) - - if bias: - b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] - bias = torch.cat(b, dim=dim) - else: - bias = None - linear = get_linear(weight, bias, config.quantize) - return cls(linear) - - -class TensorParallelRowLinear(SuperLayer): - def __init__(self, linear, process_group): - super().__init__(linear) - self.process_group = process_group - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) - - if bias and weights.process_group.rank() == 0: - # Rank is only on the first rank process - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls( - get_linear(weight, bias, config.quantize), - process_group=weights.process_group, - ) - - def 
forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: - out = super().forward(input) - if self.process_group.size() > 1 and reduce: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -class TensorParallelEmbedding(nn.Module): - def __init__(self, prefix: str, weights, reduce=True): - super().__init__() - weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) - num_embeddings = weights.get_shape(f"{prefix}.weight")[0] - - process_group = weights.process_group - - world_size = process_group.size() - rank = process_group.rank() - - block_size = (num_embeddings + world_size - 1) // world_size - self.min_id = rank * block_size - self.max_id = min(num_embeddings, (rank + 1) * block_size) - self.null_idx = weight.shape[ - 0 - ] # Usually block_size, might be less in non even vocab_size. - self.process_group = weights.process_group - self.reduce = reduce - - """Additional 0 entry used for masking""" - self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1))) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # default all out of bounds values to `self.null_idx` that will then be mapped to 0 - # translate for [0, self.max_id - self.min_id[ - input = torch.where( - (self.min_id > input) | (input >= self.max_id), - self.null_idx, - input - self.min_id, - ) - out = torch.nn.functional.embedding(input, self.weight) - if self.reduce and self.process_group.size() > 1: - torch.distributed.all_reduce(out, group=self.process_group) - return out - - -try: - if IS_CUDA_SYSTEM: - import dropout_layer_norm - elif IS_ROCM_SYSTEM: - from vllm import layernorm_ops - else: - dropout_layer_norm = None - - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - res_out = hidden_states - out = ipex.llm.functional.add_layer_norm( - residual, hidden_states, self.weight, self.bias, self.eps, True - ) - if residual is not None: - res_out = residual - return out, res_out - elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM: - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - - class FastRMSNorm(nn.Module): - def __init__(self, weight: torch.Tensor, eps: float): - super().__init__() - - self.weight = nn.Parameter(weight) - self.variance_epsilon = eps - - @classmethod - def load(cls, prefix, weights, eps=1e-6): - weight = weights.get_tensor(f"{prefix}.weight") - return cls(weight, eps) - - def forward(self, hidden_states, residual=None): - if IS_XPU_SYSTEM: - residual_out = hidden_states - out = ipex.llm.functional.add_rms_norm( - residual, - hidden_states, - self.weight, - None, - self.variance_epsilon, - True, - ) - if residual is not None: - residual_out = residual - return out, residual_out - elif hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt( - variance + self.variance_epsilon - ) - - # convert into half-precision if necessary - if 
self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states, residual - elif IS_CUDA_SYSTEM: - # faster post attention rms norm - ( - normed_hidden_states, - res, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - None, - None, - None, - None, - None, - 0.0, - self.variance_epsilon, - 1.0, - 0, - None, - False, - True, # Activate RMSNorm - ) - if res is None: - res = hidden_states - - return normed_hidden_states, res - elif IS_ROCM_SYSTEM: - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. - if residual is not None: - hidden_states += residual - residual = hidden_states - - out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - return out, residual - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." - ) - -except ImportError: - pass - -try: - if IS_CUDA_SYSTEM: - from flash_attn.layers.rotary import RotaryEmbedding - import rotary_emb - elif IS_ROCM_SYSTEM: - from vllm import pos_encoding_ops - - def _create_inv_freq(dim, base, device): - inv_freq = 1.0 / ( - base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) - ) - return inv_freq - - def _get_rope_config(config): - if os.getenv("ROPE_SCALING", None) is not None: - rope_scaling = { - "type": os.environ["ROPE_SCALING"], - "factor": float(os.environ["ROPE_FACTOR"]), - } - return rope_scaling - return getattr(config, "rope_scaling", None) - - class PositionRotaryEmbedding(nn.Module): - def __init__(self, inv_freq, scaling_factor): - super().__init__() - self.inv_freq = inv_freq - self._seq_len_cached = 0 - self._cos_cached = None - self._sin_cached = None - self._cos_k_cached = None - self._sin_k_cached = None - self.scaling_factor = scaling_factor - self.dynamic_args = None - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ): - # Such controlflows may add some overhead. - if IS_CUDA_SYSTEM: - rotary_dim = cos.shape[-1] - q1 = query[..., :rotary_dim] - q2 = query[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., :rotary_dim] - k2 = key[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif IS_ROCM_SYSTEM: - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. - pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) - elif IS_XPU_SYSTEM: - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), True - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) - - @classmethod - def static(cls, config, dim, base, device): - inv_freq = _create_inv_freq(dim, base, device) - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - scaling_factor = rope_scaling["factor"] - return DynamicPositionRotaryEmbedding( - dim=dim, - max_position_embeddings=config.max_position_embeddings, - base=base, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - scaling_factor = rope_scaling["factor"] - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - elif rope_scaling["type"] == "su": - short_factor = torch.tensor( - rope_scaling["short_factor"], dtype=torch.float32, device=device - ) - short_inv_freq = 1.0 / ( - short_factor - * base - ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.float32) - / dim - ) - ) - long_factor = torch.tensor( - rope_scaling["long_factor"], dtype=torch.float32, device=device - ) - long_inv_freq = 1.0 / ( - long_factor - * base - ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.float32) - / dim - ) - ) - - original_max_position_embeddings = ( - config.original_max_position_embeddings - ) - max_position_embeddings = config.max_position_embeddings - if max_position_embeddings <= original_max_position_embeddings: - scaling_factor = 1.0 - else: - scale = ( - max_position_embeddings / original_max_position_embeddings - ) - scaling_factor = math.sqrt( - 1 - + math.log(scale) - / math.log(original_max_position_embeddings) - ) - - return SuRotaryEmbedding( - short_inv_freq=short_inv_freq, - long_inv_freq=long_inv_freq, - scaling_factor=scaling_factor, - original_max_position_embeddings=original_max_position_embeddings, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - @classmethod - def load(cls, config, prefix, weights): - # XXX: Always load this in float32 ! 
- dtype = weights.dtype - weights.dtype = torch.float32 - inv_freq = weights.get_tensor(f"{prefix}.inv_freq") - weights.dtype = dtype - - scaling_factor = None - rope_scaling = _get_rope_config(config) - if rope_scaling is not None: - scaling_factor = rope_scaling["factor"] - if rope_scaling["type"] == "linear": - pass - elif rope_scaling["type"] == "dynamic": - return DynamicPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=config.max_position_embeddings, - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - ) - elif rope_scaling["type"] == "yarn": - return YarnPositionRotaryEmbedding( - dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling[ - "original_max_position_embeddings" - ], - base=10000.0, - device=inv_freq.device, - scaling_factor=scaling_factor, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ) - else: - raise NotImplementedError( - f"rope scaling type {rope_scaling['type']} is not implemented or invalid" - ) - return cls(inv_freq, scaling_factor) - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - def get_cos_sin( - self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype - ): - """ - Return cos and sin for the asked position ids - """ - if IS_ROCM_SYSTEM: - # For RoCm, we always use float cos/sin to avoid a cast. - # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 - # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. - dtype = torch.float32 - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) - - cos = torch.index_select(self._cos_cached, 0, position_ids) - sin = torch.index_select(self._sin_cached, 0, position_ids) - - # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
- return cos.unsqueeze(1), sin.unsqueeze(1) - - class SuRotaryEmbedding(PositionRotaryEmbedding): - def __init__( - self, - short_inv_freq, - long_inv_freq, - scaling_factor, - original_max_position_embeddings, - ): - super(PositionRotaryEmbedding, self).__init__() - self.short_inv_freq = short_inv_freq - self.long_inv_freq = long_inv_freq - self.scaling_factor = scaling_factor - self.original_max_position_embeddings = original_max_position_embeddings - self._seq_len_cached = 0 - self._cos_cached = None - self._sin_cached = None - self._cos_k_cached = None - self._sin_k_cached = None - self.dynamic_args = None - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - self._seq_len_cached = seqlen - if seqlen > self.original_max_position_embeddings: - inv_freq = self.long_inv_freq - else: - inv_freq = self.short_inv_freq - t = torch.arange(seqlen, device=device, dtype=inv_freq.dtype) - if self.scaling_factor is not None: - t /= self.scaling_factor - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - newbase = self.base * ( - (self.scaling_factor * seqlen / self.max_position_embeddings) - - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - self.inv_freq = _create_inv_freq( - self.dim, newbase, self.inv_freq.device - ) - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - - # Inverse dim formula to find dim based on number of rotations - import math - - def find_correction_dim( - num_rotations, dim, base=10000, max_position_embeddings=2048 - ): - return ( - dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi)) - ) / (2 * math.log(base)) - - # Find dim range bounds based on rotations - def find_correction_range( - low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 - ): - low = math.floor( - find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - def linear_ramp_mask(min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = 
(torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def get_mscale(scale=1): - if scale <= 1: - return 1.0 - return 0.1 * math.log(scale) + 1.0 - - class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings, - base, - device, - scaling_factor, - *, - extrapolation_factor, - attn_factor, - beta_fast, - beta_slow, - ): - inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - def _update_cos_sin_cache(self, dtype, device, seqlen): - # Reset the tables if the sequence length has changed, - # or if we're on a new device (possibly due to tracing for instance) - if ( - seqlen > self._seq_len_cached - or self._cos_cached.device != device - or self._cos_cached.dtype != dtype - ): - if seqlen > self.max_position_embeddings: - inv_freq_extrapolation = _create_inv_freq( - self.dim, self.base, self.inv_freq.device - ) - freqs = 1.0 / inv_freq_extrapolation - inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) - low, high = find_correction_range( - self.beta_fast, - self.beta_slow, - self.dim, - self.base, - self.max_position_embeddings, - ) - inv_freq_mask = ( - 1 - - linear_ramp_mask(low, high, self.dim // 2).float().to(device) - ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation - inv_freq = ( - inv_freq_interpolation * (1 - inv_freq_mask) - + inv_freq_extrapolation * inv_freq_mask - ) - - self.inv_freq = inv_freq - self.mscale = float( - get_mscale(self.scaling_factor) * self.attn_factor - ) # Get n-d magnitude scaling corrected for interpolation - - self._seq_len_cached = seqlen - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - # Don't do einsum, it converts fp32 to fp16 - # freqs = torch.einsum("i,j->ij", t, self.inv_freq) - - freqs = torch.outer(t, self.inv_freq.to(device=t.device)) - self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) - self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) - -except ImportError: - pass diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py index 62c0c893..1b31f7e7 100644 --- a/server/text_generation_server/utils/paged_attention.py +++ b/server/text_generation_server/utils/paged_attention.py @@ -1,13 +1,9 @@ import torch -from text_generation_server.utils.import_utils import ( - IS_CUDA_SYSTEM, - IS_ROCM_SYSTEM, - IS_XPU_SYSTEM, -) +from text_generation_server.utils.import_utils import SYSTEM _PARTITION_SIZE = 512 -if IS_XPU_SYSTEM: +if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex @@ -18,17 +14,17 @@ def reshape_and_cache( value_cache: torch.Tensor, slots: torch.Tensor, ): - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import cache_ops cache_ops.reshape_and_cache( key, value, key_cache, value_cache, slots, "auto", 1.0 ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import cache_ops cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots) - elif IS_XPU_SYSTEM: + elif SYSTEM == "xpu": 
ipex.llm.modules.PagedAttention.reshape_and_cache( key, value, key_cache, value_cache, slots ) @@ -68,7 +64,7 @@ def attention( block_size = value_cache.shape[3] num_seqs, num_heads, head_size = query.shape max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - if IS_XPU_SYSTEM: + if SYSTEM == "xpu": query = query.contiguous() return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, @@ -91,7 +87,7 @@ def attention( # to parallelize. use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import ops ops.paged_attention_v1( @@ -109,7 +105,7 @@ def attention( "auto", 1.0, ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import attention_ops attention_ops.paged_attention_v1( @@ -143,7 +139,7 @@ def attention( ) max_logits = torch.empty_like(exp_sums) - if IS_CUDA_SYSTEM: + if SYSTEM == "cuda": from vllm._C import ops ops.paged_attention_v2( @@ -164,7 +160,7 @@ def attention( "auto", 1.0, ) - elif IS_ROCM_SYSTEM: + elif SYSTEM == "rocm": from vllm import attention_ops attention_ops.paged_attention_v2( diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index da7aed1a..6af7d3fb 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -171,7 +171,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -227,7 +227,7 @@ class Weights: bits, groupsize, desc_act, quant_method = self._get_gptq_params() - from text_generation_server.utils.layers import HAS_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA use_exllama = ( bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act @@ -242,7 +242,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) @@ -321,7 +321,7 @@ class Weights: # it would require to reorder input activations that are split unto several GPUs use_exllama = False - from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA + from text_generation_server.layers.gptq import HAS_EXLLAMA, CAN_EXLLAMA if use_exllama: if not HAS_EXLLAMA: @@ -348,7 +348,7 @@ class Weights: log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." ) - from text_generation_server.utils.awq.conversion_utils import ( + from text_generation_server.layers.awq.conversion_utils import ( fast_awq_to_gptq, ) From c3954319996b4833ace90fa5fcbecfe5a3e8f994 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 13 May 2024 13:46:29 +0200 Subject: [PATCH 10/46] Granite support? (#1882) Fixes # (issue) - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? 
Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/poetry.lock | 3 +- server/requirements_cuda.txt | 8 +-- server/requirements_rocm.txt | 8 +-- .../custom_modeling/flash_llama_modeling.py | 57 +++++++------------ 4 files changed, 31 insertions(+), 45 deletions(-) diff --git a/server/poetry.lock b/server/poetry.lock index cdbbd581..70e51d64 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "accelerate" @@ -1585,6 +1585,7 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ + {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_aarch64.whl", hash = "sha256:004186d5ea6a57758fd6d57052a123c73a4815adf365eb8dd6a85c9eaa7535ff"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, ] diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index c2714764..7f0efded 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -11,7 +11,7 @@ googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" @@ -32,15 +32,15 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and 
python_version < "3.13" +transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index c2714764..7f0efded 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -11,7 +11,7 @@ googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" @@ -32,15 +32,15 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index a7969494..6a6b2e0a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -23,7 +23,6 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN -from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple from text_generation_server.utils import paged_attention, flash_attn @@ -32,7 +31,6 @@ from text_generation_server.layers import ( TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, - get_linear, ) from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( @@ -41,22 +39,29 @@ from 
text_generation_server.layers.layernorm import ( def load_attention(config, prefix, weights): + bias = config.attention_bias if config.num_attention_heads != config.num_key_value_heads: - return _load_gqa(config, prefix, weights) + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=bias, + ) else: if config.model_type == "baichuan": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.W_pack", weights=weights, - bias=False, + bias=bias, ) elif config.model_type == "phi3": return TensorParallelColumnLinear.load_qkv( config, prefix=f"{prefix}.qkv_proj", weights=weights, - bias=False, + bias=bias, ) else: return TensorParallelColumnLinear.load_multi( @@ -64,36 +69,10 @@ def load_attention(config, prefix, weights): prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], dim=0, weights=weights, - bias=False, + bias=bias, ) -def _load_gqa(config, prefix: str, weights): - assert config.hidden_size % config.num_attention_heads == 0 - assert config.num_attention_heads % weights.process_group.size() == 0 - - weight = weights.get_multi_weights_col( - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], - quantize=config.quantize, - dim=0, - ) - - if config.quantize not in ["gptq", "awq"]: - weight = weight.to(dtype=weights.dtype).to(device=weights.device) - - head_size = config.hidden_size // config.num_attention_heads - num_heads = config.num_attention_heads // weights.process_group.size() - num_key_value_heads = config.num_key_value_heads // weights.process_group.size() - assert list(weight.shape) == [ - (num_heads + 2 * num_key_value_heads) * head_size, - config.hidden_size, - ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) - - class FlashLlamaAttention(torch.nn.Module): def __init__( self, @@ -214,12 +193,13 @@ class LlamaMLP(nn.Module): ) ) # Fuse gate and up proj + bias = getattr(config, "mlp_bias", False) if config.model_type == "phi3": self.gate_up_proj = TensorParallelColumnLinear.load_gate_up( config, prefix=f"{prefix}.gate_up_proj", weights=weights, - bias=False, + bias=bias, ) else: self.gate_up_proj = TensorParallelColumnLinear.load_multi( @@ -227,13 +207,13 @@ class LlamaMLP(nn.Module): prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], weights=weights, dim=0, - bias=False, + bias=bias, ) self.down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, - bias=False, + bias=bias, ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() @@ -385,9 +365,14 @@ class FlashLlamaForCausalLM(torch.nn.Module): weights=weights, ) self.model = FlashLlamaModel(prefix, config, weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + self.lm_head = SpeculativeHead.load( config, - prefix="lm_head" if not prefix else f"{prefix}.lm_head", + prefix=suffix if not prefix else f"{prefix}.suffix", weights=weights, ) From 330aa87f3e07cf3e40af95f6da5787005ccf779c Mon Sep 17 00:00:00 2001 From: Nilabhra Roy Chowdhury Date: Tue, 14 May 2024 10:06:02 +0200 Subject: [PATCH 11/46] Add: Support for the Falcon2 11B architecture (#1886) # What does this PR do? Add's support for the Falcon2 11B model architecture. 
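The key change (see the `flash_rw_modeling.py` diff below) is a new `FlashRWLayerNorm` wrapper that dispatches between the Falcon-40B-style pair of layer norms (`ln_attn` / `ln_mlp`) and the single-norm path used by Falcon2 11B, plus a configurable `rope_theta`. A minimal, self-contained sketch of that dispatch, using plain `nn.LayerNorm` in place of `FastLayerNorm` and omitting the fused residual handling (the `ParallelAttnLayerNorm` name is illustrative only, not part of the diff):

```python
import torch
from torch import nn


class ParallelAttnLayerNorm(nn.Module):
    """Sketch of the 1-vs-2 layer-norm dispatch introduced in this PR."""

    def __init__(self, hidden_size: int, num_ln: int, eps: float = 1e-5):
        super().__init__()
        self.num_ln = num_ln
        if num_ln == 1:
            # Single-norm path (Falcon2 11B): one norm feeds both attention and MLP.
            self.input_ln = nn.LayerNorm(hidden_size, eps=eps)
        elif num_ln == 2:
            # Falcon-40B style: separate norms for the attention and MLP branches.
            self.ln_attn = nn.LayerNorm(hidden_size, eps=eps)
            self.ln_mlp = nn.LayerNorm(hidden_size, eps=eps)
        else:
            raise ValueError("Number of layer norms can either be 1 or 2.")

    def forward(self, hidden_states: torch.Tensor):
        if self.num_ln == 1:
            normed = self.input_ln(hidden_states)
            return normed, normed
        return self.ln_attn(hidden_states), self.ln_mlp(hidden_states)


if __name__ == "__main__":
    hidden = torch.randn(2, 8, 64)
    ln_attn_out, ln_mlp_out = ParallelAttnLayerNorm(64, num_ln=1)(hidden)
    assert torch.equal(ln_attn_out, ln_mlp_out)  # same tensor on the single-norm path
```
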
## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --------- Signed-off-by: Raphael Glon Signed-off-by: Wang, Yi A Co-authored-by: OlivierDehaene Co-authored-by: Nicolas Patry Co-authored-by: oOraph <13552058+oOraph@users.noreply.github.com> Co-authored-by: Raphael Glon Co-authored-by: Julien Chaumond Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: abhishek thakur <1183441+abhishekkrthakur@users.noreply.github.com> Co-authored-by: Dong Shin Co-authored-by: Christof Weickhardt Co-authored-by: Ikko Eltociear Ashimine Co-authored-by: drbh Co-authored-by: Lucain Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> Co-authored-by: Moritz Laurer <41862082+MoritzLaurer@users.noreply.github.com> Co-authored-by: dr3s Co-authored-by: Wang, Yi Co-authored-by: Morgan Funtowicz Co-authored-by: Maziyar Panahi Co-authored-by: Brandon Royal <2762697+brandonroyal@users.noreply.github.com> Co-authored-by: Mishig Co-authored-by: Martin Iglesias Goyanes Co-authored-by: martini --- .../custom_modeling/flash_llama_modeling.py | 3 +- .../custom_modeling/flash_rw_modeling.py | 101 ++++++++++++------ 2 files changed, 72 insertions(+), 32 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 6a6b2e0a..40ccb576 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -18,9 +18,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Optional, Tuple + import torch import torch.distributed - from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index 52ea3ae1..fa463a19 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -1,26 +1,21 @@ +from typing import List, Optional, Tuple + import torch import torch.distributed - from torch import nn -from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig -from typing import Optional, List, Tuple +from transformers.modeling_utils import PreTrainedModel -from text_generation_server.utils import paged_attention, flash_attn -from text_generation_server.utils.flash_attn import attention from text_generation_server.layers import ( - TensorParallelRowLinear, + SpeculativeHead, TensorParallelColumnLinear, TensorParallelEmbedding, - SpeculativeHead, + TensorParallelRowLinear, get_linear, ) -from text_generation_server.layers.layernorm import ( - FastLayerNorm, -) -from text_generation_server.layers.rotary import ( - PositionRotaryEmbedding, -) +from text_generation_server.layers.layernorm import FastLayerNorm +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils import flash_attn, paged_attention def load_row(config, prefix: str, weights, bias: bool): @@ -52,6 +47,7 @@ class RWConfig(PretrainedConfig): hidden_size=64, num_hidden_layers=None, num_attention_heads=None, + num_ln_in_prallel_attention=None, layer_norm_epsilon=1e-5, initializer_range=0.02, use_cache=True, @@ -65,6 +61,7 @@ class RWConfig(PretrainedConfig): new_decoder_architecture=None, bias=False, parallel_attn=False, + rope_theta=10_000.0, **kwargs, ): if alibi: @@ -75,6 +72,7 @@ class RWConfig(PretrainedConfig): self.model_type = model_type self.alibi = False self.rotary = True + self.rope_theta = rope_theta self.vocab_size = vocab_size # Backward compatibility with n_embed kwarg @@ -91,6 +89,7 @@ class RWConfig(PretrainedConfig): else kwargs.pop("n_head", 8) ) self.layer_norm_epsilon = layer_norm_epsilon + self.num_ln_in_parallel_attention = num_ln_in_prallel_attention self.initializer_range = initializer_range self.use_cache = use_cache self.hidden_dropout = hidden_dropout @@ -132,9 +131,13 @@ class FlashRWAttention(torch.nn.Module): self.num_heads_kv = config.n_head_kv self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -244,9 +247,13 @@ class FlashRWLargeAttention(torch.nn.Module): self.hidden_size = hidden_size self.head_size = hidden_size // num_heads self.num_groups = num_groups + self.rope_theta = config.rope_theta self.rotary_emb = PositionRotaryEmbedding.static( - config=config, dim=self.head_size, base=10000.0, device=weights.device + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, ) self.softmax_scale = self.head_size ** (-0.5) @@ -257,7 +264,7 @@ class 
FlashRWLargeAttention(torch.nn.Module): if process_group.size() > self.num_groups: raise NotImplementedError( - f"Tensor Parallelism is not implemented for world_size > n groups" + "Tensor Parallelism is not implemented for world_size > n groups" ) if self.num_groups % process_group.size() != 0: raise NotImplementedError( @@ -459,29 +466,61 @@ class FlashRWLayer(nn.Module): max_s, ) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual - ) + if self.post_attention_layernorm is not None: + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) mlp_output = self.mlp(hidden_states) return mlp_output, residual +class FlashRWLayerNorm(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.num_ln = config.num_ln_in_parallel_attn + + if self.num_ln == 1: + self.input_ln = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + elif self.num_ln == 2: + self.ln_attn = FastLayerNorm.load( + prefix=f"{prefix}.ln_attn", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.ln_mlp = FastLayerNorm.load( + prefix=f"{prefix}.ln_mlp", + weights=weights, + eps=config.layer_norm_epsilon, + ) + else: + raise ValueError("Number of layer norms can either be 1 or 2.") + + def forward( + self, + hidden_states, + residual, + ): + if self.num_ln == 1: + ln_hidden_states, residual = self.input_ln(hidden_states, residual) + return ln_hidden_states, ln_hidden_states, residual + elif self.num_ln == 2: + ln_attn, residual = self.ln_attn(hidden_states, residual) + ln_mlp, _ = self.ln_mlp(residual) + return ln_attn, ln_mlp, residual + + class FlashRWLargeLayer(nn.Module): def __init__(self, layer_id, config, weights): super().__init__() prefix = f"transformer.h.{layer_id}" - self.ln_attn = FastLayerNorm.load( - prefix=f"{prefix}.ln_attn", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.ln_mlp = FastLayerNorm.load( - prefix=f"{prefix}.ln_mlp", - weights=weights, - eps=config.layer_norm_epsilon, - ) + + self.ln_layer = FlashRWLayerNorm(config, prefix, weights) self.self_attention = FlashRWLargeAttention( config, @@ -507,8 +546,8 @@ class FlashRWLargeLayer(nn.Module): input_lengths, max_s, ): - ln_attn, residual = self.ln_attn(hidden_states, residual) - ln_mlp, _ = self.ln_mlp(residual) + # Layer norm. + ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual) # Self attention. attn_output = self.self_attention( From 95d15b4bbe562c1e67ec8ef2ddc3baccd9eb031a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 14 May 2024 12:33:18 +0200 Subject: [PATCH 12/46] MLPSpeculator. (#1865) Fixes # (issue) - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? Anyone in the community is free to review the PR once the tests have passed. 
Feel free to tag members/contributors who may be interested in your PR. --------- Co-authored-by: Joshua Rosenkranz --- .../text_generation_server/layers/__init__.py | 2 +- .../text_generation_server/layers/medusa.py | 35 ++-- server/text_generation_server/layers/mlp.py | 176 ++++++++++++++++++ .../layers/speculative.py | 43 +++-- .../text_generation_server/models/__init__.py | 75 +++++++- server/text_generation_server/models/bloom.py | 4 +- .../models/causal_lm.py | 6 +- .../models/custom_modeling/idefics2.py | 4 +- .../models/custom_modeling/llava_next.py | 2 +- .../models/flash_causal_lm.py | 3 + .../models/flash_cohere.py | 4 +- .../models/flash_dbrx.py | 4 +- .../models/flash_gemma.py | 4 +- .../models/flash_llama.py | 4 +- .../models/flash_mistral.py | 8 +- .../models/flash_mixtral.py | 4 +- .../models/flash_neox.py | 4 +- .../models/flash_phi.py | 16 +- .../models/flash_qwen2.py | 4 +- .../text_generation_server/models/flash_rw.py | 4 +- .../models/flash_santacoder.py | 4 +- .../models/flash_starcoder2.py | 4 +- .../models/galactica.py | 4 +- .../text_generation_server/models/gpt_neox.py | 4 +- .../text_generation_server/models/idefics.py | 4 +- .../text_generation_server/models/idefics2.py | 4 +- .../models/llava_next.py | 4 +- server/text_generation_server/models/mamba.py | 4 +- server/text_generation_server/models/mpt.py | 4 +- server/text_generation_server/models/opt.py | 4 +- server/text_generation_server/models/phi.py | 4 +- server/text_generation_server/models/rw.py | 4 +- .../models/santacoder.py | 4 +- .../models/seq2seq_lm.py | 6 +- server/text_generation_server/models/t5.py | 4 +- 35 files changed, 365 insertions(+), 103 deletions(-) create mode 100644 server/text_generation_server/layers/mlp.py diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py index c3a6c921..c29dd092 100644 --- a/server/text_generation_server/layers/__init__.py +++ b/server/text_generation_server/layers/__init__.py @@ -3,11 +3,11 @@ from text_generation_server.layers.tensor_parallel import ( TensorParallelRowLinear, TensorParallelEmbedding, ) -from text_generation_server.layers.speculative import SpeculativeHead from text_generation_server.layers.linear import ( get_linear, FastLinear, ) +from text_generation_server.layers.speculative import SpeculativeHead # Just to add the `load` methods. 
from text_generation_server.layers.layernorm import load_layer_norm diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py index 4ac86978..2e9a010f 100644 --- a/server/text_generation_server/layers/medusa.py +++ b/server/text_generation_server/layers/medusa.py @@ -69,21 +69,24 @@ class MedusaHeadV1(nn.Module): from safetensors import safe_open import json - use_medusa = config.use_medusa + speculator = config.speculator - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + path = speculator["path"] + medusa_config = str(Path(path) / "config.json") - with open(medusa_config, "r") as f: - medusa_config = json.load(f) - routing = weights.routing - with safe_open(filename, framework="pytorch") as f: - for k in f.keys(): - if k in routing and routing[k] != filename: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename + for fname in speculator["model_paths"]: + filename = str(Path(path) / fname) + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename medusa = MedusaModel(config, medusa_config, weights) lm_head = TensorParallelHead.load(config, prefix, weights) @@ -108,10 +111,10 @@ class MedusaHeadV2(nn.Module): from safetensors import safe_open import json - use_medusa = config.use_medusa + speculator = config.speculator - medusa_config = str(Path(use_medusa) / "config.json") - filename = str(Path(use_medusa) / "medusa_lm_head.safetensors") + medusa_config = str(Path(speculator) / "config.json") + filename = str(Path(speculator) / "medusa_lm_head.safetensors") with open(medusa_config, "r") as f: medusa_config = json.load(f) diff --git a/server/text_generation_server/layers/mlp.py b/server/text_generation_server/layers/mlp.py new file mode 100644 index 00000000..f08cb673 --- /dev/null +++ b/server/text_generation_server/layers/mlp.py @@ -0,0 +1,176 @@ +import torch +import math +from torch import nn +from torch.nn import functional as F +from typing import Optional, Tuple +from text_generation_server.layers import TensorParallelEmbedding, FastLinear +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.utils.speculate import get_speculate + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + elementwise_scale_weight : torch.Tensor + learned scaling term after normalization? + elementwise_shift_bias : torch.Tensor + learned bias term after normalization? + eps : float + Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8). 
+ """ + + def __init__( + self, + prefix, + config, + weights, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = weights.get_tensor(f"{prefix}.weight") + self.bias = weights.get_tensor(f"{prefix}.bias") + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculatorModel(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.config = config + self.n_predict = get_speculate() + self.hidden_size = config.hidden_size + self.emb = nn.ModuleList( + [ + TensorParallelEmbedding(f"{prefix}.emb.{i}", weights) + for i in range(self.n_predict) + ] + ) + self.proj = [ + FastLinear.load( + config, + prefix=f"{prefix}.proj.{i}", + weights=weights, + bias=False, + ) + for i in range(self.n_predict) + ] + self.head = nn.ModuleList( + [ + FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False) + for i in range(self.n_predict) + ] + ) + self.ln = nn.ModuleList( + [ + MLPSpeculatorLayerNorm( + prefix=f"{prefix}.ln.{i}", + config=config, + weights=weights, + ) + for i in range(self.n_predict) + ] + ) + + # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation + self.state_weight = 0.5 ** (0.5 / self.n_predict) + self.emb_weight = math.sqrt(1 - self.state_weight**2) + self.activation = nn.GELU() + # TODO + self.vsize = config.vocab_size + self.inner_dim = config.speculator_config["inner_dim"] + self.top_k_tokens_per_head = [1] * self.n_predict + + def forward( + self, + hidden_states: torch.Tensor, + input_ids: torch.Tensor, + ): + top_k_tokens_per_head = self.top_k_tokens_per_head + + # k indicates # of candidates + # h indicates # of generated tokens + state = hidden_states + b = state.size(0) + ind = input_ids.unsqueeze(0) + all_probs = torch.empty( + b, self.n_predict, self.vsize, device=state.device + ) # b k h v + assert ( + len(top_k_tokens_per_head) == self.n_predict + ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)" + for i in range(self.n_predict): + # Project and predict + z = self.emb[i](ind) + z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2)) # b k d + state = self.proj[i](state) * self.state_weight + z + state = self.activation(self.ln[i](state)) # b k d + probs = F.log_softmax(self.head[i](state), dim=-1) # b k v + _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1) # b k k' + + # Update candidate set with new predictions + + # Update distribution set with new logits + all_probs[:, i] = probs.exp() + + # Update state, log_probs and ind for new predictions + state = state.unsqueeze(2).expand( + -1, -1, top_k_tokens_per_head[i], -1 + ) # b k k' d + state = state.reshape(-1, b, state.size(3)) # b kk' d + ind = preds.view(-1, b) # b kk' + + speculative_logits = all_probs + return speculative_logits + + +class MLPSpeculatorHead(nn.Module): + def __init__(self, lm_head, mlp_speculator): + super().__init__() + self.lm_head = lm_head + self.mlp_speculator = mlp_speculator + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + input_ids = logits.argmax(dim=-1) + speculative_logits = self.mlp_speculator(input, input_ids) + return logits, speculative_logits + + 
@staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + + speculator_path = config.speculator["path"] + + for fname in config.speculator["model_paths"]: + filename = str(Path(speculator_path) / fname) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + mlp_speculator = MLPSpeculatorModel(config, "speculator", weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MLPSpeculatorHead(lm_head, mlp_speculator) diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py index 663f8c2e..4b977a56 100644 --- a/server/text_generation_server/layers/speculative.py +++ b/server/text_generation_server/layers/speculative.py @@ -1,34 +1,51 @@ import torch +import json from typing import Tuple, Optional -from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.mlp import MLPSpeculatorHead class SpeculativeHead(torch.nn.Module): - def __init__(self, lm_head, medusa): + def __init__(self, lm_head, speculator): super().__init__() self.head = lm_head - self.medusa = medusa + self.speculator = speculator @staticmethod def load(config, prefix: str, weights): - use_medusa = config.use_medusa - if use_medusa: - lm_head = None + speculator = config.speculator + if speculator: + speculator_path = config.speculator["path"] + speculator_config = str(speculator_path / "config.json") + + with open(speculator_config, "r") as f: + speculator_config = json.load(f) + + config.speculator_config = speculator_config try: - medusa = MedusaHeadV1.load(config, prefix, weights) - except: - medusa = MedusaHeadV2(config, prefix, weights) + architecture = speculator_config["architectures"][0] + + if architecture == "MLPSpeculatorPreTrainedModel": + speculator = MLPSpeculatorHead.load(config, prefix, weights) + else: + speculator = None + except KeyError: + try: + speculator = MedusaHeadV1.load(config, prefix, weights) + except: + speculator = MedusaHeadV2(config, prefix, weights) + lm_head = None else: lm_head = TensorParallelHead.load(config, prefix, weights) - medusa = None - return SpeculativeHead(lm_head, medusa) + speculator = None + return SpeculativeHead(lm_head, speculator) def forward( self, input: torch.Tensor ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - if self.medusa is not None: - return self.medusa(input) + if self.speculator is not None: + return self.speculator(input) assert self.head is not None logits = self.head(input) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 1cd1563f..3d3d3e1e 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1,9 +1,10 @@ import torch +import os from loguru import logger from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import modeling_auto -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download, HfApi from typing import Optional from pathlib import Path @@ -40,8 +41,9 @@ def get_model( config_dict, _ = 
PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + model_type = config_dict.get("model_type", None) - use_medusa = None + speculator = None if "medusa_num_heads" in config_dict: medusa_model_id = model_id medusa_revision = revision @@ -61,6 +63,8 @@ def get_model( config_dict, _ = PretrainedConfig.get_config_dict( model_id, revision=revision, trust_remote_code=trust_remote_code ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) is_local = Path(medusa_model_id).exists() if not is_local: medusa_config = hf_hub_download( @@ -71,11 +75,70 @@ def get_model( revision=medusa_revision, filename="medusa_lm_head.safetensors", ) - use_medusa = Path(medusa_config).parent + speculator = { + "path": Path(medusa_config).parent, + "model_paths": ["medusa_lm_head.safetensors"], + } else: - use_medusa = Path(medusa_model_id) + speculator = { + "path": Path(medusa_model_id), + "model_paths": ["medusa_lm_head.safetensors"], + } method = "medusa" + elif model_type == "mlp_speculator": + mlp_model_id = model_id + mlp_revision = revision + model_id = config_dict["base_model_name_or_path"] + revision = "main" + speculate_mlp = config_dict["n_predict"] + if speculate is not None: + if speculate > speculate_mlp: + raise RuntimeError( + f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match" + ) + else: + set_speculate(speculate) + else: + set_speculate(speculate_mlp) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) + is_local = Path(mlp_model_id).exists() + extension = ".safetensors" + if not is_local: + mlp_speculator_config = hf_hub_download( + mlp_model_id, revision=mlp_revision, filename="config.json" + ) + api = HfApi() + info = api.model_info(mlp_model_id, revision=mlp_revision) + filenames = [ + s.rfilename + for s in info.siblings + if s.rfilename.endswith(extension) + and len(s.rfilename.split("/")) == 1 + and "arguments" not in s.rfilename + and "args" not in s.rfilename + and "training" not in s.rfilename + ] + for filename in filenames: + hf_hub_download( + mlp_model_id, + revision=mlp_revision, + filename=filename, + ) + speculator = { + "path": Path(mlp_speculator_config).parent, + "model_paths": filenames, + } + else: + speculator = Path(mlp_model_id) + filenames = [p for p in os.listdir(speculator) if p.endswith(extension)] + speculator = {"path": speculator, "model_paths": filenames} + method = "mlp_speculator" else: method = "n-gram" @@ -92,7 +155,7 @@ def get_model( return BLOOM( model_id, revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) @@ -101,7 +164,7 @@ def get_model( return CausalLM( model_id, revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 86cafda2..6fe64374 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -35,14 +35,14 @@ class BLOOM(CausalLM): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): super(BLOOM, self).__init__( 
model_id=model_id, revision=revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 37d7479b..796f8cd3 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -598,12 +598,12 @@ class CausalLM(Model): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") # Create tokenizer tokenizer = AutoTokenizer.from_pretrained( diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index 935f049b..51fd7c02 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -683,9 +683,9 @@ class Idefics2ForConditionalGeneration(nn.Module): def __init__(self, prefix, config, weights): super().__init__() config.vision_config.quantize = config.quantize - config.vision_config.use_medusa = config.use_medusa + config.vision_config.speculator = config.speculator config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa + config.text_config.speculator = config.speculator vision_config = config.vision_config self.text_model = load_text_model( diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index a049f756..de9673aa 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -135,7 +135,7 @@ class LlavaNextForConditionalGeneration(nn.Module): self.vocab_size = config.text_config.vocab_size self.config = config config.text_config.quantize = config.quantize - config.text_config.use_medusa = config.use_medusa + config.text_config.speculator = config.speculator self.language_model = load_text_model( prefix="language_model" if not prefix else f"{prefix}.language_model", config=config.text_config, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index f567bea9..01b4862f 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -207,6 +207,7 @@ class FlashCausalLMBatch(Batch): # Paged attention # Remove one as the first token des not have a past speculative_length = get_speculate() + speculative_length = 0 if speculative_length is None else speculative_length total_tokens = input_length + max_new_tokens - 1 + speculative_length needed_blocks = math.ceil(total_tokens / BLOCK_SIZE) blocks += needed_blocks @@ -1101,6 +1102,8 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 + logger.info(f"Accepted ids {n_accepted_ids}") + current_stopped = False for j in range(index, index + n_accepted_ids): # Generated token diff --git a/server/text_generation_server/models/flash_cohere.py b/server/text_generation_server/models/flash_cohere.py index f85c7722..b907ee08 100644 --- 
a/server/text_generation_server/models/flash_cohere.py +++ b/server/text_generation_server/models/flash_cohere.py @@ -24,7 +24,7 @@ class FlashCohere(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -49,7 +49,7 @@ class FlashCohere(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_dbrx.py b/server/text_generation_server/models/flash_dbrx.py index 367d3db0..d5eb1a6e 100644 --- a/server/text_generation_server/models/flash_dbrx.py +++ b/server/text_generation_server/models/flash_dbrx.py @@ -26,7 +26,7 @@ class FlashDbrx(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -74,7 +74,7 @@ class FlashDbrx(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_gemma.py b/server/text_generation_server/models/flash_gemma.py index 7259b820..9c00a056 100644 --- a/server/text_generation_server/models/flash_gemma.py +++ b/server/text_generation_server/models/flash_gemma.py @@ -25,7 +25,7 @@ class FlashGemma(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -50,7 +50,7 @@ class FlashGemma(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 8ea70713..796fbd47 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -27,7 +27,7 @@ class FlashLlama(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -71,7 +71,7 @@ class FlashLlama(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 48304ad8..b83f49a4 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -313,7 +313,7 @@ class BaseFlashMistral(FlashCausalLM): config_cls=AutoConfig, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = 
None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, tokenizer_class=AutoTokenizer, @@ -340,7 +340,7 @@ class BaseFlashMistral(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if getattr(config, "sliding_window", None) is not None: @@ -567,7 +567,7 @@ class FlashMistral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -577,7 +577,7 @@ class FlashMistral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_mixtral.py b/server/text_generation_server/models/flash_mixtral.py index 2ee35e82..587d423f 100644 --- a/server/text_generation_server/models/flash_mixtral.py +++ b/server/text_generation_server/models/flash_mixtral.py @@ -15,7 +15,7 @@ class FlashMixtral(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -25,7 +25,7 @@ class FlashMixtral(BaseFlashMistral): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py index 1119bdae..adefaeb2 100644 --- a/server/text_generation_server/models/flash_neox.py +++ b/server/text_generation_server/models/flash_neox.py @@ -25,7 +25,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -51,7 +51,7 @@ class FlashNeoXSharded(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/flash_phi.py b/server/text_generation_server/models/flash_phi.py index cb55f9e6..32b573a9 100644 --- a/server/text_generation_server/models/flash_phi.py +++ b/server/text_generation_server/models/flash_phi.py @@ -25,7 +25,7 @@ class FlashPhi(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class FlashPhi(FlashCausalLM): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) @@ -58,7 +58,7 @@ class FlashPhi(FlashCausalLM): weights._set_gptq_params(model_id, revision) model = FlashPhiForCausalLM(config, weights) - if use_medusa: + if speculator: from 
text_generation_server.utils.medusa import MedusaModel from huggingface_hub import hf_hub_download import json @@ -66,19 +66,19 @@ class FlashPhi(FlashCausalLM): from pathlib import Path is_local_model = ( - Path(use_medusa).exists() and Path(use_medusa).is_dir() + Path(speculator).exists() and Path(speculator).is_dir() ) or os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None if not is_local_model: medusa_config = hf_hub_download( - use_medusa, revision=revision, filename="config.json" + speculator, revision=revision, filename="config.json" ) medusa_head = hf_hub_download( - use_medusa, revision=revision, filename="medusa_lm_head.pt" + speculator, revision=revision, filename="medusa_lm_head.pt" ) else: - medusa_config = str(Path(use_medusa) / "config.json") - medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt") + medusa_config = str(Path(speculator) / "config.json") + medusa_head = str(Path(speculator) / "medusa_lm_head.pt") with open(medusa_config, "r") as f: config = json.load(f) diff --git a/server/text_generation_server/models/flash_qwen2.py b/server/text_generation_server/models/flash_qwen2.py index cb3cf6b0..59064b30 100644 --- a/server/text_generation_server/models/flash_qwen2.py +++ b/server/text_generation_server/models/flash_qwen2.py @@ -30,7 +30,7 @@ class FlashQwen2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class FlashQwen2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py index 33298e1a..e6350611 100644 --- a/server/text_generation_server/models/flash_rw.py +++ b/server/text_generation_server/models/flash_rw.py @@ -26,7 +26,7 @@ class FlashRWSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -66,7 +66,7 @@ class FlashRWSharded(FlashCausalLM): ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator if config.quantize == "gptq": weights._set_gptq_params(model_id, revision) diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py index 66698a3a..2ad36b93 100644 --- a/server/text_generation_server/models/flash_santacoder.py +++ b/server/text_generation_server/models/flash_santacoder.py @@ -29,7 +29,7 @@ class FlashSantacoderSharded(FlashCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -57,7 +57,7 @@ class FlashSantacoderSharded(FlashCausalLM): trust_remote_code=True, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.transpose = config.architectures[0].startswith("GPT2") torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/flash_starcoder2.py 
b/server/text_generation_server/models/flash_starcoder2.py index 68e726d8..dc5d49be 100644 --- a/server/text_generation_server/models/flash_starcoder2.py +++ b/server/text_generation_server/models/flash_starcoder2.py @@ -29,7 +29,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class FlashStarcoder2(BaseFlashMistral): model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator # Set context windows if config.sliding_window is not None: diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py index a46f86be..4656fd45 100644 --- a/server/text_generation_server/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -167,7 +167,7 @@ class GalacticaSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -195,7 +195,7 @@ class GalacticaSharded(CausalLM): ) config.quantize = quantize tokenizer.pad_token_id = config.pad_token_id - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py index 1c4cfe7d..c0e1adf2 100644 --- a/server/text_generation_server/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -24,7 +24,7 @@ class GPTNeoxSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -51,7 +51,7 @@ class GPTNeoxSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py index 30bf4aa6..c1fe03e4 100644 --- a/server/text_generation_server/models/idefics.py +++ b/server/text_generation_server/models/idefics.py @@ -31,7 +31,7 @@ class IDEFICSSharded(IdeficsCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -52,7 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator config.vision_config.quantize = quantize tokenizer = LlamaTokenizerFast.from_pretrained( diff --git a/server/text_generation_server/models/idefics2.py b/server/text_generation_server/models/idefics2.py index e831af89..314c0500 100644 --- a/server/text_generation_server/models/idefics2.py +++ b/server/text_generation_server/models/idefics2.py @@ 
-18,7 +18,7 @@ class Idefics2(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -35,7 +35,7 @@ class Idefics2(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/llava_next.py b/server/text_generation_server/models/llava_next.py index 3983bc85..effe8b91 100644 --- a/server/text_generation_server/models/llava_next.py +++ b/server/text_generation_server/models/llava_next.py @@ -18,7 +18,7 @@ class LlavaNext(VlmCausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -30,7 +30,7 @@ class LlavaNext(VlmCausalLM): model_id=model_id, revision=revision, quantize=quantize, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py index 0884317e..b28b744f 100644 --- a/server/text_generation_server/models/mamba.py +++ b/server/text_generation_server/models/mamba.py @@ -408,7 +408,7 @@ class Mamba(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -445,7 +445,7 @@ class Mamba(Model): tokenizer.pad_token = tokenizer.eos_token config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py index 6b3f29a6..8d8b4909 100644 --- a/server/text_generation_server/models/mpt.py +++ b/server/text_generation_server/models/mpt.py @@ -43,7 +43,7 @@ class MPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -76,7 +76,7 @@ class MPTSharded(CausalLM): config = json.load(f) config = PretrainedConfig(**config) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py index 703e5b58..5b84f4ff 100644 --- a/server/text_generation_server/models/opt.py +++ b/server/text_generation_server/models/opt.py @@ -22,7 +22,7 @@ class OPTSharded(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -48,7 +48,7 @@ class OPTSharded(CausalLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator 
tokenizer.pad_token_id = config.pad_token_id torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/phi.py b/server/text_generation_server/models/phi.py index cc4e2505..d68866c1 100644 --- a/server/text_generation_server/models/phi.py +++ b/server/text_generation_server/models/phi.py @@ -22,7 +22,7 @@ class Phi(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -53,7 +53,7 @@ class Phi(CausalLM): tokenizer.pad_token = tokenizer.eos_token config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights(filenames, device, dtype, process_group=self.process_group) diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py index 92c93542..d4764ded 100644 --- a/server/text_generation_server/models/rw.py +++ b/server/text_generation_server/models/rw.py @@ -12,11 +12,11 @@ class RW(CausalLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: + if speculator: raise RuntimeError("Medusa decoding is not enabled for AutoModel") if torch.cuda.is_available(): diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py index a887555a..f07734c2 100644 --- a/server/text_generation_server/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -15,14 +15,14 @@ class SantaCoder(CausalLM): self, model_id: str, revision: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): super().__init__( model_id=model_id, revision=revision, - use_medusa=use_medusa, + speculator=speculator, dtype=dtype, trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index e55a661c..6a0c812f 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -532,12 +532,12 @@ class Seq2SeqLM(Model): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - if use_medusa: - raise RuntimeError("Medusa decoding is not enabled for AutoModel") + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") if torch.cuda.is_available(): device = torch.device("cuda") diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py index 3f3cb965..8e0735e5 100644 --- a/server/text_generation_server/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -25,7 +25,7 @@ class T5Sharded(Seq2SeqLM): model_id: str, revision: Optional[str] = None, quantize: Optional[str] = None, - use_medusa: Optional[str] = None, + speculator: Optional[str] = None, dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): @@ -43,7 
+43,7 @@ class T5Sharded(Seq2SeqLM): trust_remote_code=trust_remote_code, ) config.quantize = quantize - config.use_medusa = use_medusa + config.speculator = speculator tokenizer = AutoTokenizer.from_pretrained( model_id, From 4494b84b18a3aa6218a5b9316a7725129665edf1 Mon Sep 17 00:00:00 2001 From: Brandon Lockaby Date: Tue, 14 May 2024 14:23:39 -0400 Subject: [PATCH 13/46] Correct 'using guidance' link (#1892) Fix typo in link to 'using guidance' article --- docs/source/conceptual/guidance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conceptual/guidance.md b/docs/source/conceptual/guidance.md index 0ce34f2f..a566c4a6 100644 --- a/docs/source/conceptual/guidance.md +++ b/docs/source/conceptual/guidance.md @@ -76,7 +76,7 @@ There are two main ways to use guidance; you can either use the `/generate` endp Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools. -Please refer to [using guidance](../basic_tutorial/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL. +Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL. ### Getting the most out of guidance From 27a5d6b5f90dda4cc26fe9069be5df4e88b17fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 15 May 2024 13:31:22 +0200 Subject: [PATCH 14/46] Add GPT-2 with flash attention (#1889) # What does this PR do? This change adds `FlashGPT2ForCausalLM` and wires it up. The model itself is pretty straightforward, the main difference from other models is that it uses trained position embeddings and that all weight matrices are transposed compared to other models (due to the use of Conv1D in the upstream model). Fixes # (issue) ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [x] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [x] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
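To make the Conv1D remark above concrete, here is a minimal sketch (an editorial illustration, not code from this PR; the 768/2304 shapes are hypothetical gpt2-small values for the fused `c_attn` projection) of why a checkpoint weight stored as `(in_features, out_features)` has to be transposed before it can back a `torch.nn.Linear`:

```python
import torch

# Hypothetical gpt2-small shapes: hidden size 768, fused QKV projection 3 * 768 = 2304.
in_features, out_features = 768, 2304

# Upstream GPT-2 uses Conv1D, which stores its weight as (in_features, out_features)
# and computes y = x @ W + b.
conv1d_weight = torch.randn(in_features, out_features)
bias = torch.randn(out_features)
x = torch.randn(4, in_features)  # a small batch of hidden states
y_conv1d = x @ conv1d_weight + bias

# torch.nn.Linear keeps its weight as (out_features, in_features),
# so the checkpoint tensor must be transposed when it is loaded.
linear = torch.nn.Linear(in_features, out_features)
with torch.no_grad():
    linear.weight.copy_(conv1d_weight.T)
    linear.bias.copy_(bias)
y_linear = linear(x)

# Both layouts produce the same projection once the transpose is applied.
assert torch.allclose(y_conv1d, y_linear, atol=1e-4)
```

In the patch below, that transpose is the `.T` applied inside `_load_qkv`, `load_col`, and `load_row`, and the trained position embeddings mentioned above are loaded as the separate `wpe` embedding table.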
@Narsil --- docs/source/supported_models.md | 1 + .../test_flash_gpt2/test_flash_gpt2.json | 99 ++++ .../test_flash_gpt2/test_flash_gpt2_load.json | 398 +++++++++++++++ integration-tests/models/test_flash_gpt2.py | 44 ++ router/src/config.rs | 1 + .../custom_modeling/flash_gpt2_modeling.py | 454 ++++++++++++++++++ .../models/flash_gpt2.py | 78 +++ 7 files changed, 1075 insertions(+) create mode 100644 integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json create mode 100644 integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json create mode 100644 integration-tests/models/test_flash_gpt2.py create mode 100644 server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py create mode 100644 server/text_generation_server/models/flash_gpt2.py diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index fa1f9f61..ceb25cfd 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -9,6 +9,7 @@ The following models are optimized and can be served with TGI, which uses custom - [BLOOM](https://huggingface.co/bigscience/bloom) - [FLAN-T5](https://huggingface.co/google/flan-t5-xxl) - [Galactica](https://huggingface.co/facebook/galactica-120b) +- [GPT-2](https://huggingface.co/openai-community/gpt2) - [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b) - [Llama](https://github.com/facebookresearch/llama) - [OPT](https://huggingface.co/facebook/opt-66b) diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json new file mode 100644 index 00000000..ca7393a3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json @@ -0,0 +1,99 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1835938, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.171875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6425781, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.7314453, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.68603516, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.005393982, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.31079102, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08300781, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.58984375, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.953125, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0957031, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8095703, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9375, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" +} diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json new file mode 100644 index 00000000..7bd15b90 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json @@ -0,0 +1,398 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1835938, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.171875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6425781, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.7314453, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.68603516, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.005672455, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3251953, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08294678, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5854492, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9423828, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0800781, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8369141, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0683594, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9711914, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 2061, + "logprob": null, + "text": "What" + }, + { + "id": 318, + "logprob": -3.1660156, + "text": " is" + }, + { + "id": 2769, + "logprob": -9.1796875, + "text": " deep" + }, + { + "id": 4673, + "logprob": -1.6376953, + "text": " learning" + }, + { + "id": 30, + "logprob": -0.72216797, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 198, + "logprob": -0.7089844, + "special": false, + "text": "\n" + }, + { + "id": 198, + "logprob": -0.0054779053, + "special": false, + "text": "\n" + }, + { + "id": 29744, + "logprob": -0.3190918, + "special": false, + "text": "Deep" + }, + { + "id": 4673, + "logprob": -0.08319092, + "special": false, + "text": " learning" + }, + { + "id": 318, + "logprob": -0.5839844, + "special": false, + "text": " is" + }, + { + "id": 257, + "logprob": -0.9506836, + "special": false, + "text": " a" + }, + { + "id": 649, + "logprob": -2.0878906, + "special": false, + "text": " new" + }, + { + "id": 2214, + "logprob": -1.8496094, + "special": false, + "text": " field" + }, + { + "id": 286, + "logprob": -1.0673828, + "special": false, + "text": " of" + }, + { + "id": 2267, + "logprob": -0.9370117, + "special": false, + "text": " research" + } + ], + "top_tokens": null + }, + "generated_text": "\n\nDeep learning is a new field of research" + } +] diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py new file mode 100644 index 00000000..0c7977d0 --- /dev/null +++ b/integration-tests/models/test_flash_gpt2.py @@ -0,0 +1,44 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_gpt2_handle(launcher): + with launcher("openai-community/gpt2", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_gpt2(flash_gpt2_handle): + await flash_gpt2_handle.health(300) + return flash_gpt2_handle.client + + +@pytest.mark.asyncio +async def test_flash_gpt2(flash_gpt2, response_snapshot): + response = await flash_gpt2.generate( + "What is deep learning?", + max_new_tokens=10, + decoder_input_details=True, + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot): + responses = await generate_load( + flash_gpt2, + "What is deep learning?", + max_new_tokens=10, + n=4, + ) + + generated_texts = [r.generated_text for r in responses] + + assert len(generated_texts) == 4 + assert all( + [text == generated_texts[0] for text in generated_texts] + ), generated_texts + + assert responses == response_snapshot diff --git a/router/src/config.rs b/router/src/config.rs index 8640ede9..989f0e31 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -132,6 +132,7 @@ pub enum Config { Santacoder, Bloom, Mpt, + Gpt2, GptNeox, Phi, #[serde(rename = "phi-msft")] diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py new file mode 100644 index 00000000..d2599f7a --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from typing import Optional, List, Tuple + +from text_generation_server.utils import paged_attention, flash_attn +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) + + +def load_qkv(config, prefix: str, weights, head_size, num_heads): + if config.quantize == "gptq": + return _load_qkv_gptq( + config, + prefix, + weights, + ) + else: + return _load_qkv(config, prefix, weights, head_size, num_heads) + + +def _load_qkv_gptq(config, prefix: str, weights): + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + # Weights + weight = weights.get_weights_col_packed_qkv(f"{prefix}.c_attn", config.quantize) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + single_size = total_size // 3 + assert single_size % world_size == 0 + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + tensors.append(tensor) + bias = torch.cat(tensors, dim=0) + bias = bias.to(device=weights.device) + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +def _load_qkv(config, prefix: str, weights, head_size, num_heads): + """Load QKV from a single, transposed matrix.""" + + slice_ = weights._get_slice(f"{prefix}.c_attn.weight") + shape = slice_.get_shape() + total_size = shape[1] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + world_size = weights.process_group.size() + single_size = total_size // 3 + assert single_size % world_size == 0 + rank = weights.process_group.rank() + + # Weights + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[:, start + i * single_size : stop + i * single_size] + tensors.append(tensor) + weight = torch.cat(tensors, dim=1).T + weight = weight.to(dtype=weights.dtype) + weight = weight.to(device=weights.device) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + single_size = total_size // 3 + block_size = single_size // world_size + assert single_size % world_size == 0 + start = rank * block_size + stop = (rank + 1) * block_size + b = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + b.append(tensor) + bias = torch.cat(b, dim=0) + bias = bias.to(dtype=weights.dtype) + bias = bias.to(device=weights.device) + assert list(bias.shape) == [ + 3 * num_heads * head_size + ], f"{weight.shape} != {[3 * num_heads * head_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +def load_row(config, prefix: str, weights, bias: bool): + """load_row, but with 
transposed weight matrices.""" + + if config.quantize == "gptq": + weight = weights.get_multi_weights_row(prefix, quantize=config.quantize) + else: + weight = weights.get_sharded(f"{prefix}.weight", dim=0).T + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + return TensorParallelRowLinear( + get_linear(weight, bias, config.quantize), process_group=weights.process_group + ) + + +def load_col(config, prefix: str, weights, bias: bool): + """load_col, but with transposed weight matrices.""" + if config.quantize == "gptq": + weight = weights.get_multi_weights_col( + [prefix], quantize=config.quantize, dim=1 + ) + else: + weight = weights.get_sharded(f"{prefix}.weight", dim=1).T + + if bias: + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + else: + bias = None + + return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + + +class FlashGPT2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + + self.head_size = self.hidden_size // self.num_heads + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.query_key_value = load_qkv( + config, + prefix=prefix, + weights=weights, + head_size=self.head_size, + num_heads=self.num_heads, + ) + + self.o_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + self.kv_head_mapping = torch.arange( + 0, self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ): + query, key, value = self.query_key_value(hidden_states).split( + self.head_size * self.num_heads, dim=1 + ) + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_heads, self.head_size) + value = value.view(-1, self.num_heads, self.head_size) + + paged_attention.reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + + # output tensor + attn_output = torch.empty_like(query) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + flash_attn.attention( + query, + key, + value, + attn_output, + cu_seqlen_prefill, + max_s, + self.softmax_scale, + ) + # Decode + else: + paged_attention.attention( + attn_output, + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + input_lengths, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class GPT2MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.activation_function + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.c_fc = load_col( + config, prefix=f"{prefix}.c_fc", weights=weights, bias=True + ) + self.c_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + intermediate_size = ( + config.n_inner if config.n_inner is not None else 4 * 
config.hidden_size + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + return self.c_proj(hidden_states) + + +class FlashGPT2Layer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.self_attn = FlashGPT2Attention( + prefix=f"{prefix}.attn", config=config, weights=weights + ) + self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_2", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + def forward( + self, + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_output = self.self_attn( + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + ) + + hidden_states = attn_output + residual + residual = hidden_states + + hidden_states = self.post_attention_layernorm(hidden_states) + + mlp_output = self.mlp(hidden_states) + + return residual + mlp_output, residual + + +class FlashGPT2Model(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + FlashGPT2Layer( + prefix=( + f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}" + ), + config=config, + weights=weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + + self.norm = nn.LayerNorm.load( + prefix="ln_f" if not prefix else f"{prefix}.ln_f", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = inputs_embeds + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + input_lengths, + max_s, + ) + + hidden_states = self.norm(hidden_states) + + return hidden_states + + +class FlashGPT2ForCausalLM(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=("wte" if not prefix else f"{prefix}.wte"), + weights=weights, + ) + self.embed_positions = TensorParallelEmbedding( + prefix=("wpe" if not prefix else f"{prefix}.wpe"), + weights=weights, + ) + + self.model = FlashGPT2Model(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="wte" if not prefix else f"{prefix}.wte", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: 
List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + token_embeds = self.embed_tokens(input_ids) + position_embeds = self.embed_positions(position_ids) + inputs_embeds = token_embeds + position_embeds + hidden_states = self.model( + inputs_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + input_lengths, + max_s, + true_max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/server/text_generation_server/models/flash_gpt2.py b/server/text_generation_server/models/flash_gpt2.py new file mode 100644 index 00000000..5781f55e --- /dev/null +++ b/server/text_generation_server/models/flash_gpt2.py @@ -0,0 +1,78 @@ +import torch +import torch.distributed + +from opentelemetry import trace +from transformers import AutoConfig, AutoTokenizer, GenerationConfig +from transformers.models.gpt2 import GPT2Tokenizer +from typing import Optional + +from text_generation_server.models import FlashCausalLM +from text_generation_server.models.custom_modeling.flash_gpt2_modeling import ( + FlashGPT2ForCausalLM, +) +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) + +tracer = trace.get_tracer(__name__) + +from text_generation_server.utils.import_utils import SYSTEM + + +class FlashGPT2(FlashCausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.process_group, rank, world_size = initialize_torch_distributed() + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + dtype = torch.float16 if dtype is None else dtype + elif SYSTEM == "xpu": + device = torch.device(f"xpu:{rank}") + dtype = torch.float16 if dtype is None else dtype + else: + raise NotImplementedError("FlashGPT2 is only available on GPU") + + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + config = AutoConfig.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + config.quantize = quantize + config.speculator = speculator + + torch.distributed.barrier(group=self.process_group) + + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights(filenames, device, dtype, process_group=self.process_group) + if config.quantize in ["gptq", "awq"]: + weights._set_gptq_params(model_id, revision) + + prefix = "" + model = FlashGPT2ForCausalLM(prefix, config, weights) + torch.distributed.barrier(group=self.process_group) + super(FlashGPT2, self).__init__( + model=model, + tokenizer=tokenizer, + num_layers=len(model.model.layers), + num_kv_heads=model.model.num_heads, + head_size=model.model.head_size, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) From 2573f3aed41a4e1dfc5055149f398c5b13cc770b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 15 May 2024 13:56:07 +0200 Subject: [PATCH 15/46] Removing accepted ids 
in the regular info logs, downgrade to debug. (#1898) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/text_generation_server/models/flash_causal_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 01b4862f..36351252 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1102,7 +1102,7 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 - logger.info(f"Accepted ids {n_accepted_ids}") + logger.debug(f"Accepted ids {n_accepted_ids}") current_stopped = False for j in range(index, index + n_accepted_ids): From 7b11b1804bdb1d39e4e3f2c9ac3a4f78cdb132b3 Mon Sep 17 00:00:00 2001 From: drbh Date: Wed, 15 May 2024 09:40:07 -0400 Subject: [PATCH 16/46] feat: add deprecation warning to clients (#1855) This PR adds a deprecation warning to the clients and points users to the https://github.com/huggingface/huggingface_hub --- clients/python/text_generation/__init__.py | 5 +++++ clients/python/text_generation/client.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py index 5ab10fdb..a8e67071 100644 --- a/clients/python/text_generation/__init__.py +++ b/clients/python/text_generation/__init__.py @@ -14,5 +14,10 @@ __version__ = "0.6.0" +DEPRECATION_WARNING = ( + "`text_generation` clients are deprecated and will be removed in the near future. " + "Please use the `InferenceClient` from the `huggingface_hub` package instead." 
+) + from text_generation.client import Client, AsyncClient from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py index 0e86901d..98c018d5 100644 --- a/clients/python/text_generation/client.py +++ b/clients/python/text_generation/client.py @@ -1,10 +1,12 @@ import json import requests +import warnings from aiohttp import ClientSession, ClientTimeout from pydantic import ValidationError from typing import Dict, Optional, List, AsyncIterator, Iterator, Union +from text_generation import DEPRECATION_WARNING from text_generation.types import ( StreamResponse, Response, @@ -19,6 +21,9 @@ from text_generation.types import ( ) from text_generation.errors import parse_error +# emit deprecation warnings +warnings.simplefilter("always", DeprecationWarning) + class Client: """Client to make calls to a text-generation-inference instance @@ -59,6 +64,7 @@ class Client: timeout (`int`): Timeout in seconds """ + warnings.warn(DEPRECATION_WARNING, DeprecationWarning) self.base_url = base_url self.headers = headers self.cookies = cookies @@ -449,6 +455,7 @@ class AsyncClient: timeout (`int`): Timeout in seconds """ + warnings.warn(DEPRECATION_WARNING, DeprecationWarning) self.base_url = base_url self.headers = headers self.cookies = cookies From b1d370e0628d86320c592c60584739786661a237 Mon Sep 17 00:00:00 2001 From: Dhruv Srikanth <51223342+DhruvSrikanth@users.noreply.github.com> Date: Wed, 15 May 2024 20:08:32 +0100 Subject: [PATCH 17/46] Update torch import reference in bnb quantization (#1902) # What does this PR do? Fixes `Import Error` occurring from mismatch of usage between torch.nn.Module and nn.Module. --- server/text_generation_server/layers/bnb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py index d27a33a1..ca39919c 100644 --- a/server/text_generation_server/layers/bnb.py +++ b/server/text_generation_server/layers/bnb.py @@ -70,7 +70,7 @@ class Linear8bitLt(torch.nn.Module): return out -class Linear4bit(nn.Module): +class Linear4bit(torch.nn.Module): def __init__(self, weight, bias, quant_type): super().__init__() self.weight = Params4bit( From 62b2a8b67ba7c25b8aea4bb031d469e071d92294 Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 16 May 2024 00:58:47 -0400 Subject: [PATCH 18/46] Pali gemma modeling (#1895) This PR adds paligemma modeling code Blog post: https://huggingface.co/blog/paligemma Transformers PR: https://github.com/huggingface/transformers/pull/30814 install the latest changes and run with ```bash # get the weights # text-generation-server download-weights gv-hf/PaliGemma-base-224px-hf # run TGI text-generation-launcher --model-id gv-hf/PaliGemma-base-224px-hf ``` basic example sending various requests ```python from huggingface_hub import InferenceClient client = InferenceClient("http://127.0.0.1:3000") images = [ "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png", ] prompts = [ "What animal is in this image?", "Name three colors in this image.", "What are 10 colors in this image?", "Where is the cow standing?", "answer en Where is the cow standing?", "Is there a bird in the image?", "Is ther a cow in the image?", "Is there a rabbit in the image?", "how many birds are in the image?", "how many 
rabbits are in the image?", ] for img in images: print(f"\nImage: {img.split('/')[-1]}") for prompt in prompts: inputs = f"![]({img}){prompt}\n" json_data = { "inputs": inputs, "parameters": { "max_new_tokens": 30, "do_sample": False, }, } generated_output = client.text_generation(prompt, max_new_tokens=30, stream=False) print([f"{prompt}\n{generated_output}"]) ``` --------- Co-authored-by: Nicolas Patry --- Dockerfile | 1 + integration-tests/images/cow_beach.png | Bin 0 -> 67246 bytes .../test_flash_pali_gemma.json | 25 + .../models/test_flash_pali_gemma.py | 39 ++ router/src/config.rs | 21 +- router/src/validation.rs | 24 + server/requirements_cuda.txt | 4 +- server/requirements_rocm.txt | 4 +- .../text_generation_server/layers/linear.py | 4 +- .../custom_modeling/flash_gemma_modeling.py | 72 ++- .../flash_pali_gemma_modeling.py | 110 ++++ .../models/custom_modeling/siglip.py | 565 ++++++++++++++++++ .../models/custom_modeling/vlm.py | 20 + .../models/flash_causal_lm.py | 11 + .../models/flash_gemma.py | 13 +- .../models/pali_gemma.py | 123 ++++ .../models/vlm_causal_lm.py | 9 +- .../utils/flash_attn.py | 3 +- 18 files changed, 1002 insertions(+), 46 deletions(-) create mode 100644 integration-tests/images/cow_beach.png create mode 100644 integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json create mode 100644 integration-tests/models/test_flash_pali_gemma.py create mode 100644 server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py create mode 100644 server/text_generation_server/models/custom_modeling/siglip.py create mode 100644 server/text_generation_server/models/pali_gemma.py diff --git a/Dockerfile b/Dockerfile index 175287bb..73a274dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins ca-certificates \ make \ curl \ + git \ && rm -rf /var/lib/apt/lists/* # Install server diff --git a/integration-tests/images/cow_beach.png b/integration-tests/images/cow_beach.png new file mode 100644 index 0000000000000000000000000000000000000000..d67f8a1b2a52459dedda0b4291d91c9e557be40e GIT binary patch literal 67246 zcmV)kK%l>gP)EX>4Tx0C=2zkv&MmKpe$iQ>7vm2Rn!;q)?sgq9Tr3g(6f4wL+^7CYOFelZGV4 z#ZhoAIQX$xb#QUk)xlK|1V2EW9h?+hq{ROvg%&X$9QWhhy~o`jaO1dJepD55g6j5$e4!FPP!Bf!_YIM4Dw_vh$S^A-aFBJnIUOq+OvczV+| zIPVijSV>li&xyxPx*+i**ACFQaAy0zc|jvC=l8OnsvwdK6aeu2@re+uJpFQ+5l!gNw2rH z*b&gb4P0EeHDwRD+yMrk4B3=jDM(W&6oB_L`lcK(a0~RVdA+svaryvcsH@ZsaBv8W z6)Ah&=iNP>z5RQp)!z@o9CF5D(ChX9002&9SV?A0O#mtY000O81OXZV1poj50RR91 zJ^>p51poj50RR91Mgb@Q0{{R30RRF300I*L0{{R30RRC20000000001EC2ui0RR92 zEC2ui0h?lQEC2u?07*naRCwCNy<5yB*L5bgzO{DMx%{`jv%5)(;zbfk$&_SC?n>0o*9figUkd-9%e8=0wX~JBtTyBoVVmDc?q1n7%(yjf;@O)4?JUg;7DUjvhEa3 zQ55fNb~pR_U(Y#Jwb#nSzSOQd=l`3UJfuO8&F=G`Q>SY0wb%OA_k9b$^*{bG07}2C z|1A-bnT6lv*Mt|CS@^5)=Vn&EN&a&XX!s6B1`2;gWTa%_XUosc?*;-{@#jS2cQal8 zseIXHW{mt5h)B)MjG4oqsi~&7uz00B{1*L}H8T(+U4|8}$R9Vl38P!I{)g-LdGG=? z0Oh9>75_WDX+-g6_MfO~;oSfwQ`LNt0F1~CstN)ZSEy!$;_|}lfYB9(3kshVe-QnB zAR^QLuK*aCr5C3w0TFqE5~(Sye5~mUnmIc2*XWVwr4_$qanb9q50^#xCL;eQz6kMu z;=)~j2aBt+Sc3UssOAMU*kHvvFfy;UahsKw%S1+m@PO%*SH?fhZxpr)k6t{u_~Clp zxPjMKClN7oxsrw(wD?&(>S0mbO=@bJ|CYiBpZ+05 zaHF*8`#afx3nM~^pfM7zuebhNYY5nBv6>o@0UX{r+~)kD;msN0K#4yQo^`(d{ESEU zCoh7txA&&M%KKxytIL-a=MQQBCxcH95j32JIdp{!W`+&Vg!M>NE<05XFB8PuX=X%F z3?3`uB8;0N#FxB7!h=%{ahCFp1i_7Ve>i~8x_85S@rXqnBe|u=8Xl&h46Gn4Hu3l! 
zFx)aW`rG{A(}pgVA_z<{#KYTCzA)SZVQ~=?Y5eX4ghfy2S9^TKc|8)?5bkXF1j{k$ zk;`rrS2joM4f(+ENrgMR=@BqaD<(kVs&R|_Tm(0?h+;siptrhlRUZxm)FBpZ} zQeJKN3d{^5Zz)C~1Q%HNT}uCq2xji1th3W1!6R?6k?HVnEj>w&>4KT{(R370Ed_il zj@GkNk@2H40Ayi5k||^J)q*IU_2FO4InWc*5=RW9>A)h=!Nc2dF0Y^UedHM4`SmlD zPuaA6P$Hap%AC4zDFv+@TyWX3GTIgYerbI`T^+w_~;@^MH+;1~dy5Yj3e)_;AC zg>c-PnYkDo5RN=fIjp1qZ@*4kA_Y!ieOO9g6&QrB^&d=&n?R>TM|`x)+mxr9^(XSi z_lc{d|FOY2z#zax*l}fMYNnx5AnKIVYusG0lmc>@+l;=$dO6=`a;KkO;re58jQou( zM9{p9&Vao*#qxMM14bOaN5}*KVwRA2Ye*)a>0z^bN#&Z`j&q|A=g zhnzNFvk_SgEyWj!#p=d_X`GH1B~HR|;!V&V%ZtG@M1!<2`i~f{R}dL9`uxWFgLn|d zdNk2e$+)qhaN4&%t2{{WBg`3M9OggRxNmcSz*>vpw1}HChg`Q3;+M%6q+T;t z6<23EI4!Ts5;s-rpadvUt14>D2*HAh6=cGy>dr3usDz0Aoe}^l67lsnv9;8a3iO^f zcBjf%UL+DBRT@KsIdX@u!MW+uC!z&JfPi_#MM=Z}1EwZ`2%C8ZF<5dD5x#PLE3a^&4(Cmq}9p zW&o24Sir(yrb>jlPYH54Vnm%+H8tZ5lyZQn;|l_MqDBTtN_;KChoMK9QeK}A+rDoX zmnjB`R(Ut9D1DGupc!#WZ+wK7&@W$0@EfI_l|H?!5Bhchh|Caf6vwL#r>hyMDG@1i zS-&wOe+e*~DP;&LbUP8&E|I~3TFnoWm`DhYh{U;%9t^Rmpg7}r8ggn2W1CLUI^;=rx^Z|Q>m8te1zqivu0EJtT&ggI-jQS=71A~lzm=h;3 zD+O$jHzVuYwy9)7MBu>)G15%q9Yb09OjG4Ktbz%@QLMUdT z;h}<5iQ>7bxhpu#p8Fcyh7=8~w{1?@+hq+DLHJmfhg5|$l4C4|;d}$ZTmppwAmXzA z4r|>~su34`jH5hy=xpe*?127iy#FaW7+F}6nrlZjRZTrt8&tiU7buDyd>XRI%%kl4 zvsBM z2nePL#!qVkP=ZE+RLmkG_#M!S5Kc#+CE1WCr9GeqC=K((NA2Xbc1}1-*0S()OUq0~ zRcTm7iJ{ElV@3{A>H(R&pobZtg%ukUaOB*#PoD%F?vfe_iIKsGm{rM#U2gqOX#C z&)F%Ds`wm3bRZ@OvjYi2ZiR4Q=WhoB8+lSQ^QK;{iz_uX=Xlapbl2TtdwE#E<#qdNzg5=3KG!A{- zmdwngK9*lV1`RB1??e`{vY_6Cz`-^GchdS|1mk9I*(@0|%|b8G`f|z!{WFXL zq=ax2DGrps88T6m3*@d(k`fOMxx0j-acnSSX0UOH;4x&fk%B{gAD(~`DKVLvlB#KF zC8lc0h+wMamJT;J_n=E=okd(0B{p1wN8`=8mM^Ilam&(@DhgIUrIWr9hZ@}Zz}zkJ zgK{NAAURXLVn9VW&@%xnl5mJYuSG$?OQ6_R0QYNzX{@d_D zYbF)O4$~mT42(oNt^1WKMRI^5@*hX5rlbUMSW5E}r0Z@Nxhw$_hAAQ`( z+itM8A|5dyaOQ-jr(@;qhe0h%V}^4exurf+d0{!hIwcj3?Whvp?l~g)qX8B;k&00T zKwiMHp9By|0*1q};8i`rYY2$(W`x{sBhsF0%@NyjKrN)46I)&-Yxl&E4$-yV?1*ZR zSOFZUY)Zvb$S+=J{tt8Yd=FDGwFb8I3oHGRH=n*>PSChd{{Rc+FBv5+Y0iz}o6E)` z=@g!PxzR_Rrwz+@M8WDyg_6M>n>rhVa}Q69*m~th+gVKY!CFjf+QwHKQrN!rD9n}c zM8pMwMeK&eb30MOyeI3tdDYfJXzCYP){|^NHgY>LKg*Iyh*Z&s7MB%wd&p=p>-kfI z_Wj`2f;o0rjf`!0dVn^N^84(}dPJkYJa&Ddjd1VEBn{~xvYf4@`ZaOl`h<`}#+*+s zW*wA_{l0YrCdH$QnA6C0XO=&fYm|c8Q*OSE3AhzWNAY4Clp4AT79B2Vi`myHdw`Sl zM-U#OnW}|jn9MkQ<-qlhKx5<8n|U==YrStx%{T-pg?=PO?QG|}S(=uR^IVc+fZAqCZE0R>YS!^xkc4Gf@LOup> zA&_a$DPbd3OYkdEXmuh3)FVa~ZPd_lf0UaUDK=#`sfq(rA_aymtNj+?vIT4fWw0c_ zlm}TL)(kfKAWChh?U;s7u7^&E$ik7G^Is;g8H~I)Q)4ixg{~wRUdgMnus8OqowrU^ zBZ)Dj89sKY6C6$CX-&j~-1I_zrv2A5%1}e>S6SF5Swd(T5hq0%ut7srkk4|W{CbNO zTgXGkEwnN1VZFM~Q6P|1v|Mwl2;DShw#-zFZTr~Nh~>Qc96A+=1rbwyA;cbkxCyQ& zg=E#xus>msZntmZW9I~&un{L8r4xxQb}FiWcH`<4jl}qUf$~8u8@txX5%MmEE-1F(r;DBC7ASoT#nO@YVeIc$} zkL{i+E+`5C4j3mFs>`($?lk5~LaCG3!;H+-dypXK7F5>n5fIGvei?=nu+XED7-wivJCOW{+)_Hxym3o>2feXg_M?o7>>1u2AoZRCsTYnjGgOU-M z&mg?FkWr|r4{J6->QQv73Zzn`$VAov9ozVo2Y_Xn8KsyvZtZ3WV@-QAT64wyELcee zR053i`*O2UvC4XtT!1jaW)}XP2>-E;Qtg=)Wnt9Qdh%J_UofR@1|mp0LS00+`}K%N zpi3A9sv>gq?ZUn+Fx!;NmKA!e%{3P3WW%5`iaMba-3_w{>E%7do9GY@G{LF539ts`CAUAeau7xgj8+N zz|1!>U|eg4B_l}&3R;IHZlOgEA!FQR9)YHP*CR5aKQW6?ODj>T7QGnIm* z)KEyg4OLDwsQmmSDqzQ{V~AtS;*z_>M;D&km?oMNgreDPBW`?_*#^cC=Yz?Tmak{t z6rX5pg`hQv`#V{Yb!4<>PQfmC2o6|LAeV2LlzihBXpzOnZ^z6*PQ8(PKCXYucwq_4R1H+b(!qes z5M~<5riqyzAQTI6)GX7z!{ZL2&XKtQz3`AKNJF*t5FZ_U0X52r&U$%{5#$b{GTLq^ zx`)_PlA>l-a)2-dP&;K^uHjAV@N#hb)=3s25H{!wcA@g)afB9T<08T0L*nF%H?LBH?<+{ne6dKxV zmBFkeuk;mzMM>(}@MH;`*)Zhy24KThdl^AT32|weL~ELeb%w?WWayd5;p;?T)R+k@ z!7m_^ybGtU5O+YDP$2`jV%@3TPF=g`?Up557O^4Zmk;14%Z=I+lP`TlP$q~ebtzH^ 
zl*RAZe1b~3d@O4ER0zGfS0L9gkjZskPm$P<6Ozb;V(@7)f%VN8*p%67woGhy~<`h$)|yd2LxCVxy!NjRYE|UT&5Ei73LLo_K~4CHXY+QOvRu=8-M9 zf($DDkFH5&U}cxTph|^QEcmCSioW0m6)NrcdS)1mAlWR43MosWv$4V>P=l1mVs_8@ zM$7GlY1fiX?R8 zW07Y=wsqbaoMX1063MdZL0XQ;dZPi6z{~`>j^Q8?8R`x;!`d5D&o$oQAexRS<@urI z8YN$Pt{>J@b{l9%*NAL|M@xa((x6{WV;{}FnC?^1zOSRI;6%}62`_YIS1=a=15UM_ zm4ni_jS)mCy~tv7FMh3nePqx55gvs-?5?P+h9%A0pR5JTME{Y-l*X%cZ71o5`)&Zx{wRU-%(4 zB`zH4ip>!6*5G$+g@h=%!-0*w_mp7Q+;quD@ETpx)?+6|t18HUll(3B*9{w@@}Ltd zTpCK|-COu6N7sb{;^_S*tjtxAl0z$PKs0F;;{gUUhB=_FWWtyV&1-hpK=&9^*M|2Z zZ{@(8)v;@Pxw;E2_Qsl{Bx$dE!i;Gmh0>`cgMo~Rt#7jiPrvBnvEIHLTB?>^E^Mxg zhh4bhG#G$MqpJ<-??K_EAx^`3wX&5N1+5V*PV=W7DY{Vf$;z&2?nw27_td-R68{^t zTFPE)fH`TMjFORyo_}JLbU5hSY5mTL3$grh5h+uVz!tCACY9ToRD1L3ykb|{z^tuH z3n@%C^+c9J8YS=I&A2X7tToQl3Dypsk*{g;);2Q<9q`70CNJ@K371rkM*LA( z?1SNeseqU`6(ZqMgAA2Qls@pgO2DK{$W%4JJ)?aVQb|<>bBIqID@X-F(*a_!!=yPg z4Y7lV-Muo$rwHqC;?T7m2wibrn5rF0;7^>tK1_J zu{Xg~z{6|(`b}v=FBJV@?cGW(dc&Hw<8Au=n9b>=m7V zdPEYHq1hpMyp)Ko`&d2FPHVJe12wTM^ARyOPcSDut*eR11gK#Sr;nk@Yn+n9TZkv& zJSC71tkg42R-*?W5i<)TdOvUC&W)sSA;V!nSe$S+s4+E;JkB8b2fR8Gi1XgJT^F*f z&JuZ|FmeYWcjoaZjdebdJ${0u=Nmtw`KRiQFRZ?1Wo?6162{dH_ zoni4pr#ro^Ut$?tN9o79o^vDqb+)NdmOa6YtYZMAweXhkjA>rU)Uf~ra!gn!f0SXQ zN4Hb{vS!uwzW?x@+4N zLz$U)*?Ql$THK>3kt>La*iaVOK!I*xVoT~WW`=Qjj3OEs_6H#i&6Vhs=TyGQ#;1PD z(x$)%%lxciXgt-)^|Y?_h?L1$P*4^nAHm>c<0*>3r^+;ZC!-OeP2s|Qi?0TzlCwbaoo`;D8tqR?w|gLP%e z@U={ah=qa>)L^EZa{@q3E$HZ?|4^Tts*ltyZ<^_rMjp%=x5>+wgGzefvKc7-0789c zefzyPizQAOe4uY+vQ3#`QZG~PA$03ObT+bJM`fwolCvd_M>9qOve5er(1=0_q{*L} zAt38W2~0%6OqqmtpdY)mHJene_ol=|lM1!Ki7}I72$BW(N0$+wrg|%|w{dq7C7=kB z$W?@@N+hVK7Qb2LxOUE^=@P`oEaD_L=pFNN zqh;;f)t#-&+f#upTO(p&7MOr4k=n7qfK65WM_H^NeL?4RJ9#7_J3$G@b{D7K6N6`y z$)t7yka-w6an4Cr7znJct7%;mDT6vss;1rt!nVwufSr?7%d@Ioc{^IQ$IG^f&(AjU zUDvnSfIwJyU7QX_(4d-5T+1QxnnnY)Xnmkt(4P=eo{vHt%r<08jW4^9T#k*;d4Sb! zKBkMNGkQ_L*h{M@!&ygcvdFs=sCNd(tbixar1KiHs&1OD^=iOGs;ybQQ&;CEZp*P_ z%$#&e>N^IUaOIpxpg61h=qvB+;hZDn=rs0WGU5;L9GFNr6hX=i0nMk?q;fEF>_kKW zC(Ixw5`lmuz|_P?YX*-1{;zIDWZkv+`K@_O} ziTkL%H)@iaWFl}09><1icESt^LT*upLp-@WX+i5grQpcGyQRL25yt$~QpKAI+C_E? zE2nf?bek!oqZN_?p%0|pgEAS3f;^ds2vNn%B+J&%g=@#FmIVaz*`$75 zgBAda#J)wu)__vy6yw{O_LN98-m;e%7m)TQHbxs{#{;R>pgQV}&;V>4uCR%l+)u~r zDH`SU&K}~RLOEk*;pxa}T*Sne8zDAc$-p{ezPPisC0MELuiB2?jNPnq>~$gnqT2EH zq~4x##i9f!ETIi0k`3;>_p0iR+|Pb?V`T9#BD{=u8)XS#Ow5@{0D=j_!6p??>q;2H zS-)-+zdBA@h; z6qvl+ktLeeFAyu((fKSc=pY#BTgp~%M7737v;~zU%Mpv+!s!r4BFj->k>t!v!2kdt z07*naR7Zv1`CvX$kyf5@#1&NCB>Ee1N`@Lv+mLw&_ML?xs2F%sDG``VNay0_!dnx^ zZQHcU6SwQ?ikWK-;}INV=1{Y7GPaL*9mGhAKfnm;)s0f}M-u7BcOVWl+e}#s#K*#e z(;y5oz#Q|Wa+6AClUfK84bsVRNMuZyo$$oD%1P~<69^L%$5GG0Ogbd~xi^!DuU#}g zS7DjdJZ;+O9-sJ`ng}4w#AKlw;b!3|`=&=?J zdc+Cr8=VF>4B$JZsZ%l*3f5e>DeZsJ6UoG?fovlDhs0?WrSoWws7Nj;m?p7KVx@NCZ>gLWo z`g$sF;!qD6a_MXtnE}RpfXTwJ?F8pfoJ@pkpsr+VQcaj@p?M`lOk%b0%&{8KIAJy8 znq}fR5JmDe3m`1PI>QJ`+9|bO-5KERnl5NV1kJ6!{2dsjxk}(XAq^ZcW)hG9A>&MO zQ5|ztiIec>juH9c%p@GZQG`jDAwBVCwvo@AXg1=UbDq`2F^4KC-DFO*b^(P!0-n~a z-jZkWz}mfZSU9D=8EN!3D6mtX-7G2TdmkPv_!-Dd@F6N~#SE$O8J%JLGknB{DQvGL zw9#BzP%-2q4(ti73ngPh#SSpX*qzq%N+wRGwVMb#CdVuRbzwNBT1?Fvs5lmJb-)>m zZmpRSOMt9BaqIm`ZRK_8)lmkMhdNB^$B~^GLZccFr3FZN#Bhe+VMG?Hasevg%E1W) z2(bL5atRYd^w^i3QUYDJ3Jvj!3f*^a;pw>;g_{28qQ|knNh671pA`q5|3$>2bm8&aP zF$=>nPwT3#Dnak|!N{Y~)DqoQDGh{5A0E+?#JlR_jH;RX&bLjg8i5@FGsRJH>|Dru zOq7k4N^O_OxVG2|8! 
z@TklVxe5>4;u{=N2Zo!mM5DwH1k{2<7#YkMREtcVs{kRJIGI#Dt=-ndZB?%3Xq`i< z1QzF5h)mg7m;;k$QcL({s_GTpK>pgAdDYfa>vh$3jn`GHO=qq*BN%x!nQdH=Yax~M zEQmNuB?|?^!+{1QcH(&Aq^>Fv4wD6iS;z!bIadiwdXvz|O#mZQi8xF{>f=~n5@qIW zIPx@(Q}U{9*R?IFkpp)~VonaS0;`=fgI7~UT3h8z)(zZy3`WT$!D=!PPRcvW!wj=; z5E5XXy%N^D@ar^-%!Te9@k_dD!t{I1Q!yzk5pH_t*YZFZY6#PQNMi8l`a}#PBNotv zZC>+yT2Gu*jG9pirj^uAoS+goW{!qb5?$G1TeL3{SmI;_Vj(56mdM(!^Ge=m<$c@v zR-uZf(@u5ObdC4!XYCTLn_h4lv=w$F_l!&e)Ui(MYHLzW98c@I0+=~q?*Uj45f?&Q z;_7c~M}<+(qpC9qBg-KKQAucKAr%vdpfYAA-ziBT(1mS{WuvVR>1?#h2H^}2&e>H_ z4_qM4dK-D_4o}(9oa%MHW{A0A>+m`e3B}OFK^)0BD}R*B9!<|qPy~?)gM@U-c5!FE zJFBL(a|k)52&1Z5h(fg>@o@t~wN%DH0vS%dCsI}IeCG|F;iS=JN3H6rZCkIMH-mVk zw)3s`ovJ%yO6N!UC2>_*BpW4GA(Mfu5}HrkY+BVK0bHmM8$!v}NQ7B9&MFvFY-l*W z(5N3e9-AvnJ&7*&M`Ou9#etz&H%M7noX7;Kv1r_pD#b+a$J6i+-yu~J*nKqtQ-WH8N zT(papS~WFmJ8e6!q(;OQ@9*4*k={%Jn~VCq z6eInfdYl|RZfyhzb0op65nfRi)Y`AE@TotBV zru$ob%1}I@gE1~jW4I?D^C5Yw#D$Ei@tv9v#&4bXUP*}~9WNWEW1vnPO=klUM99h# z93ugj5hr6R#?`)kpl2R4rS{gl@X;IwE3`PVH#X>@QYJL*uDwwe{Xd zvu`zmY8s{(ayK(Fa*w2-W*g3!SfL+0$7yOE*w`$Y5E4sBqIv4ru?b9IoJ=mFzD>=D z1Q86@a*#Q#`2-aVY9fjgGRrzf^Ym|cGR@|?T^$ z2S&PayA~pDDf3YtN-{JHHM8+oocYIpO=M>x=cY1qaArY%C}AKRap*;w zxl{^xGE((kwNoGNzjU8!CNJn38Pm;hHVlA&2E&< zj2f~aLwrclV2Gl>bD&m$kY-S@acaG@pmtITS0YZrJVvCU#Wtkbui4bw(5ZPC9ho9| zH2pg+O=Q!TTp);WZXEdlFhyT)i^Dd(Dmo*O2|8F8rlBf9AqqAl4n4f!1qG_i7cLP| z6_4a}XNZ2MOxdurAI_0^@0k&2)Olt@1>#D54T50;Q?CX#5>IJDBLxe464Nw4C8E1p z=*}ELAq{Fa@7cV;%wv;FK+i`}OUXbMY8SC3>!QL=7#a?P^F|6}IWUL`70AJ+j<=o6 zCZSX}CY@H*yq1dDF-H@7Z{Z;JK9Ks%D|KqE>ZTmcJvvPIJzx4P=A?MyXb zf7JO1m?V$GGxT8?5SB+UM^wkg%*M(}fa9>RkUN=+(KIxE-~=LEJE@o%#2nW#U?{q3 zT#Ps$4FCbhI2xklKvtdHilc}-l4~MMKfCE>gMLF8_-fwNCl8D`e;25{`=~U_t0YX2cel zO$Cky0&8W4ikg{U8ffV%(bP4IgrhaxX^NxPdz6pF$^-asP*jB#YMLEl9a$|htz;UO zCI@vFS3;GmCCr0_1ti*1%otc6Z)RDFp#vv6`*&$G# zb;ozU^T%KQ8=rT3*Y+DeX!KAgOH5k`Sp!oct%!u_%Cvj_nI|V=m9_BgxvlwpTDLkq zJX#!_c#;XZ`u#iiA1vDkOMkd*IuaxJ=oHC{K4O@e2NAay>Ey-TYPPlWoe%Cm#7;iM znAuy@c&!;=fqF|QD-XKDjCVZ$ATm9SM6DJ#t=zmqRY@&W37lgGb|TYSDkc{N;A&8X zcvICl9zu!2Dc>rVO?S9z50}ld^DA$v#zAdM)+#lgJ5OdNR?HGbG+PI=ncAklYtMuu za&ag%dhIx9)ai}ZI?+@-$kEfO4T3;G2DXr?q>Mi11t@^VxWI!(GG3Oq-iA93L`(Fx z-W4e!G8S2AWa^cC3x$~@B1lUPM702BoSG=CFt2K+>dkyh=oo4yAZLLLZJ2D$u0Heo z|MUOw>@&~2^o1|I_}GQX)o0#1!Mzoo-`%>fGrO=ot;laptE;=SbKBE)aa75ynoOK9 zOjW1N-qehR9T7fzeZDe$`0#M4IBNOL_iw**_jrL>gQ~HHSwSS$O=^C8clM9gy4z8rehEpRN}}vRv2m7jv^+Fn0pAi zvFoihXkOFy%x%||2-wUqD&bm4gc&5r*dZ)9!kC63e_?H@`PSQUOUF%nwCYYeYrM7I z)EnSM2!zgS$XX(^wlJ1q%Y%o8ZT9i#Wh){m(FW09Psv8|YG)izOyqRhkv85oGq9bBfhZTqkHJCuG#71h? 
zsFEssoqDaDwotF6#?%EH7_*kE^H05S>C?aX!$14|H^1|*uU@MIX9opR;!g0BJaW7lx6Sw1!XbA&pmbV z_QR7mZa(V@5pUE2HDrR{TbS6TDMkG}JJ|NMXd<3In_6JPz0o|x{szH7@c z>JY?9-jTY9>yQz9g2qBpz!6>IBKu5FJSnY1%aTllr^53}w(4p=siuxYh2R)Y#6iO& zPK0HjEV5%+rRrPNR;_8flT~}v*vYCrX}ea*6VzM6hx~|mRiKGwCk%8-_$*Z57ANYqceWqooa)<}@WR5jeB;yp9`Gdp5*|Zks z2Zs+;P1~lb++;c@k$KHik&9PnpL}e-eDN|*C({bGUo0QqedFa<{^$?>??3#Pe{!(< ziO>GS|8n=~7gyCR6gj;Km%+o7iA|h}QKtr2#RRa|aCnjuNV1Db`P9L|ql&@8CQMT& z^V)4!Zsxcau7m(%1f^GCZG#5$*+N0XL4(nBx@i5P)034yZo1>P>pE)-K%SP*TX0tDT+66`k&` zwlC{sZy}Wsbwp~yL}KPcg+|(PjjhMN{9FI=H*|UP2VcLZ(;vP4;luyq_fKxVbM5Mt zt#jwEKlj3u&%G!+SJZbWD_gehq^`}n!-Iq4qvHqKO{df4a%DtJy1li9uJcWM`0(iP z;e$^-^HiNIx{QfJi{rO+~)i=NU^4(_li@)(tzVMm9*=}841)d3K^guyY z8>x9SG7^GQu&8(DVd}e1nSE}!T5ovVD~UC%POF53s1}E1gMlne$GU(K>Y5u8}C6V_^8iti{tX|aB?@WH{o``@|wj-5Ps|BY8?QvKC;fAnYH{N9^~ z^1{FW4}S40zgb_nzA`9*gWr=u$b>wbvZ;scSUYb{oOSG+8iACU8lIeBzUatP+cA4$ zBV$TMSZWstsT0&rrWM!1^NJ@TbtRRPO4xDW9ig}JULPNCFtb*5*|Z1C?%|?2TCA4d zS|wlLk`ihdaGZZU1Qe_=Q##cLb8TuxUKK6@<70hSpdE0sRWIhPCgLi`*FL{bJ>vG}D@y#Egwd^h5Hm`i!PkAL@H{`0%{Z#VDW-mOhoW`~FGzy71$r#^G>$!Aw>E3OK3d#UO< z%W}0eNIjiNRk_J*zO!4^lcwp~uDN&n=I{OP|MKwGn>(|~Wbe{}LSUowiw~y@+D&V&MQXMHus9JBmWUF!*tdh>anS%g^-svbh1+ouCC(I%4Q8p9@bZ)c0%rMfh#i)XkiM$>JEb+?|V(c#Yl_aRzIeH-PA8880^wVKf3G@d}j; zY8`oQS3AwUOYJS*fA^i&UwQSFzkcJL5AGiO2PfUan>R8e$6PTp`*~f}pzXTen%L^6 ze|o(C)>B{p&C8$q;@xVGlZzs2t59#sCS5&!@=ITR?CO<2{onql_Ta{BXGb6xFI`!+ z-Qm&k6>;@MSlm>|`>ts_SJeboujkHP?99)%cB*<(2_G+y8Fa9}zg#ZgzSp!}=P#W5 z%760TKJ&R>YNqE~5@^RnN&q_ph@I2s((2yjJKxrKzkSrA(v>X_-ak0pU+T%SRl?oF z)z*y9PjT(~wVlh?E?&9**yZOgY(I0uT{yIwVM@k8%`$cMtU~R$c08#taUw9lf}A0M zEr5c=Qdc!vKO3o$S6#GryljqE-O)-<8o%=D<2h-OJ>CnlLH(4#c+l$DZ991Kqeo?W z0(i1}8IpzK5zjnEdc&z5N{PJQ!NFD~87>6W6NtTMyp;)_ZTg`h%ao{pN>9A0E@9qfVvm)RUc_(tzLciUmaavoa}w}E63HjR%&l8%p0KWDpIIF#@_aYXSR3u_784N z9bLJw{mJKF*nQ&J$3OP0NIjRz%<3w(GJMymnoJmACEMBFo=&G`Hl0ikmhIQS_Kp1q z2i3G*w(^_5^WRc~rS9O$hu_xszkjrU@Pi+}Y1&raw)d`m zaJ0OA|L%UvOOu)SC->^x2XEo!S3dc~<*QG8eEZoiUH{lGzU!_B?}gg&Ol0Q7Rl=}J zc;dKr0dtQyWk{AnyCt+BqU{4=)!DMsMbjNF+vCM@(dwcLzDd|f9zDYxNk^)Ifh5ap z>{_?MPQlDv(frLLe8M4~dh30eWay_)vlL4z3snIKODV9mUJy@f339}b!n`^qN~Do! 
zLzT&l$u`5tDpFOG<7Rbu@7Bw2y#3k-2R9dT+!(ZjsX?Oc3}ISTPN?g;qozAIs@o)pO-B zQQxt4tja8NU2GlQymt8J`>%ZK`#<=>)u%u4$U__5O|a_`|m!oGgXdH;u}@ zPES_d*0f$M@!E&CUbubo%hTOUm#*W|-jQ5j$F*Q8JaxiOg3_Ep!i5|w{$e#XZ{E!?r_nbG~KfEoi|TVu!dH=(SY`Eh5cAEC@mFQTnv9Hht`*xs$Pkd$N&E1mJy`mcQscwyw-}~)lejf=z>7jO5Gz;Xe!49y+}eet^M5Qf@ayAzSVWQ!lf~-fk$y-ji9RQ)e+zZ@`hr z&YAU_D9IY-Zke!@(vDF1chnjgdp~Pd^zgxZZ@qKx!NXOjtv59mSJjh>fX=H|Ykap< zZM~Vn5gZtzz9Va^&fB6rq+9Pz;4VIPy>@Pk4lnr!)5AN}@tqsr`G4;GiMK{mDH}f!0q(?@6%h~eEFyEcbII?Cm?_O;o)mP{oqrdc>3tz z&Xw~!-O4|H{aUNKY*mOHsgRQD%;Cb$3VHd1@{7kr2ljV<<%%t>*UnwKZtvads5L>&Tsd@I*LB|eXqu&J-aHY6Y*sUo z7^s1FKLYA+7_y2tTt;at8 zE5G_1*I#&vJZ(LE@8tdO{qVJSZaiq7zPR`B!O=5MJo)Yi_irt_KmN<_e)7pH@H(xx z-u>XtrOTHd9Nbu$dZR|I0xnGXv8&f+m#?|43lOQ8i8(maFa{2Z3V5?tt?{;M{i5lP zS9G#kEki4)n7Jm2U|S5VMwWBX!1k$7_MW14iqrGZH<7FuMuXlJdIU?w!Je*?*bK`P z0`dT2K%Kv^i1PI##D7rhq#&NJKCoJ{R4OHfc~EDV7aHx22rxCH=?k9 zEzI;Y&#WZ%)_gLPsjlG3t2Qk|6`)$e+-_9lm83Q9ykE2$#_+gMN{Bllw_G0Fd*x4# z?XB(Fu5O8U)t!So`v(VPcJ%OQI%mH+=H*Fktx20WI3d7$K#+_Yopa3W0%4HYjFq^a z?s5ChUwrcq-+S+;_wF1uE%`+=n|VH*=#T&1n^ z&Q(#j8c}0PRXMzBW436zlU2KD{BqT<8f}$2ubvQkAAE{3BqODn!DG~O`E0Dz=_%TY z->;buX<0`|M9gVU(on43kFzH-b1tiJ(y?83O3gNy(2wj;v?n4Bu6H02&VxY11xJZj zG!!5B-ddSB3|wCbO5}nEu8M1`iJR}e_`*x~KD=?`_O0X2DG|tnxoJD^bzaG~W0lEa z;}1@Jqvk-aViMP?vA4?Xa7nMccmM9;@lzM;XRqycUHfp~Pii)(>wPMZr(fm>ksbE_I3{sPfpan^XgAl ztMpbXy2Iii z9P>9qNas7FRp%E?bJX~wRnxTEwAT7mgCcZ`!{Mpury^Nm{b4Zfv7lgTK3B{6OtGl$ zz0xfIiS=+)DU@7+Co`K|lkdgZ2T?WL!#y?^)4ofCU_(yo@RKRetUOk>+tB% zi@STUn$G4;(?Z}4O=lOj+~=Nu_OoC5h556e>vkWXPG(>s=0Nany>C1gogFOO`^U?> zhbQ~T?V{19Gan*aDkrjlfvLzW;~1ZTg-`A|C6!+UpOCn5!{1x7oQ@e?BCQuAjS?*w z{R@KrqR&blnJ?m`(YPy$k<;_2Rcfg{uX*TKni8RAo&vc;BfBun9neIYjh65IptC@N zjvCxN!O|kh70gEBKp?Hm?dmn8cxjb*`E`(CNLq&Aa=DpIUZLUOlhNg9#d$U);O)l-1K_|L*Hw z|I<(X+Wg$Lr${8YLl*HS<>m3-`wIU4?Hg~s{HV3wY{)iKmY8<-@CuOxA<^>(R}6?f91da@BisU z>@WY~>%aTE|Lf7gy;;o{x7^Ra^yv%NpE;gAM)i&(ZHUa9SM{pPWqZ=manl{I+LO+= z22u%WWSab7g%4jSZHH#DhBUC52M+H^dtzB#Xd6t|PIIW~nZD~&jEg2x{vY|uKl)E& z2OxehC+bEV78l&cviOHBS}ue|uW3Ae-jqR%5g=1+ITWM>)-cE^P-Y~+ zQc7$Qgn|JKW70~zgLl--+T+>&4R`ZZyZN%-{6TembH=TMIo8@?v20hKjIdqN`I)TL z9&@Vr^`^MM*K$iQH*>*jd3bnPP#qsg?fAYqi6aUrE zKer=#=l;EwmxGn6+U0Y5i^byMqB~l3qW+gZ^UO0l zM#=Y2H!Ba*_)OD0=pr59v2I9Je8vKZ)1dK8eM~A|HXiU<(v1m zC-+``@7{ysZo85x+e7WZ&IcwvxYOt=7^WRqt7&N$R)&h_n z?mwJ4Ot$B9Szg?#-@DU&=9AZ6fAz1A4vv*87Cty^Zrt6UIyzjoUdfR@wl_QPR)?K` z`q_{F`rrL~+ZQf<^H2Wlg%_UnPk(&t@yCAWzxb#1@xAL$d`lQTdc z-rb{w@M$0{<>)xH1&yah4_uYhDPwfOf8N^4I+%J?1V_Y5LPiricI3sZG7_V?Kazv@ z{LNSW?VmOW51Ydy=TFF=Xm`9j#n#r0Y1*1rv#kpkE-Mj>RZR741{PPfPhZ{JakMvc zufO{tk#t_q&nNd+O|ldSM2E-_XQj@lBh{03HlNJ5rgh~`Iz2pD9yXZoUPN8rdw6*M z?!&F~`>0k`EnAhm@x8A<^SRGnc=~zr^8a%7E-|)d*L~kwYd^m4JFj}&y3fA%_U&dj zyG2s6=!jHA$3P%C0RjZ}#2zFAXOd9{0TLt=&w?1pI1Ug5L4bIWL1ZKdAS(fk#1SIE zi6mN*MN%Z2&3@j#k9wZh_u7xORtD!()j6lCo3tB^0;*AU>U7oq_Fj9f|Nr;DVklon zd9SvYZaIJXaq5?g(1yN~*i9-wpgV1SjyXSjV}E}BVtRhj^olO25*RpL^v(hiiO61w zpQ3EZMvqeXzN!c zDU*-O9mWkV_3CX{)sGxvQGvc6KIx zJ1J~c_;CJ3JpGADbIpOl@x&r#)q>2V8jhJ|(RG7~Ez6>)5P(#`8kdraW+EC72bR>A z-fDXB`aB!z0ZIdwA@LfIzb<|O5=&HNhQZhpu#vDgv?&Aytb+{q_M4Qy`5WK6m^B~# z@Xy|O`lvK6ajcJD|Kv~p*Zn9j%n^%u>57A9GCA{*N+E+iV^~$Y%uN+(85+y{8si9C<&?qDWn-` zGvsy+t+m0&#;??QA&qHSxnN>7zBEA*)Qt(=Mvp*~=M@VQ6 ztF})U(?uK8gSVa?K6!R@UqAlYzxd?Ge}4b|+rzzabAJBE|HtpY`OSa*$*=s{is-i8 z!b3osoxG0Av)I-3Y@S2kra*_2>@%%3&r+V`1%i>WJG^k{vRk*Vtn#*`-#4FCy zEo;J<)iq-)N8K3QEYp4oY}_?~#`KfW7itl-xM;FV$%MAa&gk{D&LQcB*}dYGo8 z@UrZWsXs5`OmcS4kcB95a5z-X^YhoCnE_`i9V;S0Rh5OcDP`4BQI=7p=Kzk7rK(2d zVCaiV!EmOWl7gTiR>_f#1>@}7Uwe}fiywcul=I`ho;X~BNdh1&NCfDChX%Zd%9)AN zA%+TVRhIk1a(c0N_sLOFjSu#Zd^upky}i9JKKk)wboXd7ctzkbHLpI=ci-b`NNY9L 
zy4j!#7-ygV_|KNdC!CkDZl{Y2&_o8rmI2GG@p#cqs(bJMoB!T^8^o3a6aMU<{GMOF zT+C1RM#Hvg4VgJ33HagYqo4oOQkoxsI<1#qeD>V!Jzf^~X2Z7{9AD)qtp4KmQm#gq z1HMaD+`6}oTakH+n+Ns`vc zV%X;_s_YCA>R>eTMM1_`=dAM>NfZHc>U(C+tOHw&#(U1$u5I3b|Lat|C({6mZ0x2=Nt z(GP#{&a+3t-NkJA`~T}dDjq$1>zm)cgk`T(JcKCA^OuX)pTurXPV1(dPEU>wCkJ;P zo_2CN`|*Q={onh?|NFoHAO5GNdU)R$V|_*8?K}7B_$L=HemJ0*W1PFSZ$P&c;=K4p2H{Z*9Z=)Y%bOHnb18FfWNjlH%0F%v0Iids*VD-ilS!Ut7UY5Q< zm1cgP+J!;1)zBD=XuR`9RmB*c^Q#z4GGKs!a8wKkEm>bymGusZyf0P|MF3#tvK$h* z%otOc&FTmD@BQj~_RXh{p1(RdK0P(4&JL{iec!(E@a{Lh{w#F!)3e3+od=JeKI*e5 zO@8+A=LusyJxj&pzxs#&Tc>tl%y0e5y(g12KYejfXn{DJ%|3Yf&+GsGzx$8=^S^iR zjW@43w4g?%K7GYu?tJcJcy)5lSq$1moByvr{@@~*x~?6*I&0Y--Z2(gbRp&8-FKfo zee|ur`fGXd(xHlIOtNfKI{f+tmnWG6Jx;}V=?{Y`8BsyaDgp$|b{O21%VuqpzZ&1{ z;Kr}bXV>F#z03-})b(tmw>OA7mpdkG&Aor|TE|-fd%9Di?o_DOkx^W$saKDMLuD`- z2S)S0&fhgJAGSY){w&yP?2DtKG==S1$sP=b6^5TnByg2+rL`6?iAicf1PHu12UHo& z$PpzY=yZD8ckO66;1rwXqVE>YIadq>h(Vnz%Cb^nW4$qm03ymN&bWauSd`EWhC{O6 zI0s;uvkI>MwZ>axS7iIX&s`VJ&Zf@!2lwuC&X4ZyTZ2_OZo94z>FuvQ8I1-f$I|u7 zyZ7#$P16s5_NguQFD}jo#LHznZOo6J&s&BeK-Vw7^YuG--ROrOe){;yp|Kx)+SmuAfDBo?;>|O&b*}BYRfC3e zDoYc)HuOCxbbaXh+=X;{dQlDr#uy}S`bf6;%b&j-@9qEm)6c*7{Ppv*-j)8tpMP@a z&i%(v9+b!iBrK;B3!Oj^`1mYlJb!1;RA9};VsBIeH#x(>-ag1`I<^|3siv?zekGw1 zfqEHoh*2k>oiv{uFYM#@c-g-5&O6bO+dJaRfKsEJEL})hFZt&+pt2zgWLQg41{EO^ zK&x@3`XXIH&1fe{Y%{P+6=5?aL>0a)c@1tQvToeguLz&p5T2bdS-5PF!;LA?jf5cB ztcQ2*`!^*YR@DFm1r%@yksu2eR>Na_{jT}qPV?#F;)Rfw4mdVc9VtpPTTI3ybZQDT z;*5Z5N1Ag&(Woq(g_#v8^=({Er}O2-#l_Llou`l9QjH<>q3=Z0xw5Dn8OH(^Wi%NA zD6=4$Ra_jal6BsCKvK}cT3kgR8>0%G84#G0aCY9Qg0se2n{sx}?eFi^bpxOmXXj1b zot@5yM|SUI(f4Ql|m0+1j87Dn@>oSOlyB9X4iDsUN@yixbb%?{xT zA`6!}j=CGLcuSNLuG33)0~%Mjxhs$nZVW`P*Z&5Q=f$kUe-MX~Y? z7$Olt<5xZ=Uma!?xMR&CytG8$`9>$>rUZ~K6V)|mat z#2Onx08L0dsLGUeT$V%chgDfPH!O?Ez*iQWfyiW$fBRp1=Rf$nzmZb<{!hR72mj;m z&3dr&Tz7dq;ivcZn%K3c&z+`({t^I)ecR4vFC`}wR?Te8rzeY-^Spm&@Qp`D=d+lb z*%-^FRI3$7c;|j_1?HVLF_QOGhdKXwGN# zq6@QL1hleHSvc4qJ$>@z8(;t0y`#HC#Io8PJsBl~sz78!SOKiDz9=%YCE%P9714-@ z_Xbf2NkoL12vjA;jzvWki2$$~ln6v*m?eZDoSpSC#z9$T7H^Z5F0&g}qZpDwyKI_) zx7A==w{48M91e`3g1`)l!Wl`i&%t}hu}|FGDf;og8@dPo-!~rr;a|Ks>2Z+ZZPK@>0qZ=Ypfxg4%E9)#OVa8tRq`6hM*Av;ER+{$3t;PJwZ zNQsqzk=9zx?KfiyOLi@&EtK}){H^@_&f+ig^pmEOJ4cTn92RFM&-=D*BfLJH7vA2p z7GShkEEHhg&Q+2!vYIwXm!v|m0r96r2z@b;Te9zJ^D{Q!|vQFs+`O+Zdnag|$Q zh;uH=0U&zgM3_O1rLGTUSsG_aMI`4G5tKvDNpj8#;G7c#B)@7ht7?n^K#egXxhRT= z5m3v*1AudTL?*-#vxE@J!V`!^1&ao?u;eW|a)f{?mJ~8C=hHsdAO7I~9hY|h{+qw^ zop-O2G#j&F6jCY zCX-1@{ctqQ!bDb7MASLE%5lETFV@h97!x8@Ww~6oG374gm}NW}h7jQLCk}^|C=($W zR9L-F?n9m~+s{9HIUW~%7MJMWP+wj!Gn_a2`7}k)gZTRV`A7Rt-o?Ql0IATj>H03@ z#S**^Juf@3q<1QU(DuM>FL-BStV7L;B53iNx3YWQ;btz`_8sO5<$a|}y~>u~nA+^- zE+}3tUENk_z>TO&-I0Or7}Kx4$9OX%b4`)H1lnyN&^OeH3c%MZ`8NH?&5c)^enNBDK&L>_x>Ffu+{<+5s4%u${`tJKrzJ_L$4xbQ30ZJ1#_)v`q)|L z$(WFG%$&0r=VD@Kk%f�YS}~Ic2Rotvj5DW?Fa4mhKNJv6*%GtiiTfem=8*?fuF0 zi;pi}eRTBf8$zY3QK0EEjr-)$l(r<(Rs>_mgs~S&R8)r#ye}f;wT^v58Fm$wr`s3v z?IZqb-VM0HJHIqb>aA$-^?%$DWL&kOuJ+ICh=8j-Z~Gj_+YYSK!*bva7~&8fh~yZ8uvSGe9uCXl(0FI97Z%MSXI3Otvl3*g3IG8MA_?cL zVvS#ojdM!YImv=ztS`wJLzw!WQ?DYSUDngtY|)=D>(uKT5BJ~t+5-VFD<#ni&LLS< z08CCXrz9zlszDS`(7Ijrp^qUNKtl+^O;a;cOlfF+h}n=s&c+hyDkGB>A!l0l*!JxO zVb12sxTrMd{j$rvY#BK3@9*1wnxB8bcc18}AWVvRUQ2G;`-9?OJUH~zlSHMFl(Fst zN7#SkYhX(Rqw9OsWoVYH|A5y3TOJu z--w&Vhi(ifuS(>r*qbf<2X3?v5H`T`+b!62=q+3&l_=oN*fe|Z;*YDsR;Fx%Mlp_L ze_UEf%Zt~iuU@`*aq{uYi>~MW@wi#e1cA$ea5BCaRmJ|np>Zb0=&kYAj zDS3-IGjj$5CyQlWM?+>rK4~SG0^LeD+{B_ZP%0?zwR%XQf~K0 zcQo#@L2?0ronZQ_&3yABg;#wgy*kQw5)E+Y9)yeK35|&YV5isR9VNFt+P3$D`>`*a3<$0tXZ-(|d} z5D=so)!TIfl>nhlDJt+qo4cMYSe5wTz&^a=VmKRd_x_{FMP=^q7o}H|m+AP!bFmM< 
z{_SQy-`}hBqBy@;RD(Py$Xg_WHbf=UYjOACW5&yT4cG~-xUNBQON_a3)Z$jzxY<{S z%`x^?obUF^+dWNns}{Wq^U}*a`3>cVqJUmP>NWyyE?pe59kR0(!-R-m;q1V5ixqeF zR8+8mDl*QIDGXC;+xDVfET+@5(~I+y^YrPtmSm$*osU22@w_qp5*yPyMXv@mZ+E+j8 z4(~Pd^Fc`qC1ulhabRWWps*A&i;_EdG(5c9lR;nrMUYD&)Oy@=xsR?d=DX8#*zxLZ zY1+5e^frR#Do=Y8j!Afh%Y}`6qDx`xhHvRwS-JK^03ojI?s6F=4*=#edSIImin~{} z8(7BMa*D4BA67#|+ZfW2Lf`SD^lo6Z*J%k;$=p3ZnU_Kv_nPJMH629rbZtC}Ch z7!qzJCCEuqym<90#cnd$7hqd>&RG>$vL;FIVu%QdRddR8`K!!1lQF7{2%J-n zfnxv_0jyjZ1ebN_q7KS|Hv-IE5KGRdY^-5sVa_pn=ObqXT**fOz#2pVYfXq8Q&|1V za$Y4tYfMpc_JH1bRZS^5XG9RtsK{_IKuttzjBCo!^immTRR)F+?u-r#%zdQcD79@& z-R$M3<6#kV==)6I45huw?+sGvsgFSg?>)uZ9NGW?AOJ~3K~#I!mBS7Vs=#`6Zr<59 zJ+zxQW&E=A)SH)~%|D|s*2Al_6a@wzhou9J^Z6Wd91Ta3 zQxZ`$k`j|4WsWIyUDvkdaJZT-I&X5{D^kdesw|8oqC|$Avk1yLDX+`#&^Kg&DGLD+ zr5INr$`k{O42nVDH5aqRvd?|w$*3qS#TcXJ$`@@@uTLXHLK|YwIRg+t4l#z5V-^rB z%L)NwjD!YsJ(?m!BFmhew}w`NYfxcOQjxCf5BA;us63vA3^ofGP)kraV+jgx3Xi^U z!_i2qG9~65>$BJQ9zXN0WKkf~v@vn0DxWip0GR6GTVGEru{&OJl6Dv#ba~Lx_E6@w zvC#F>U3KrTg6UU@9(vP?w0+CQ>*Tx3J``?o6)&M0c$unqqc6L8>CxSk9KF?1d=pUr z3(g07c6uTqGQ*;WS+KK)>kOxj_Q8ytObRNDSiqE$p@Adh7&4$Q2OLA6v(=nZ5RuGW z4681L#LOWmuULo{kkFWvvauc!RgxeZB2Y*viwH|JMp2EoKC-rLUshEYLkO|-&PY5y zK6!mnpDmJzj0cF2GDlGhl7MZD*&1Vw??UUHv&J-i7emf0%$j1B%tcYg6cKgh8&_rm z1i=&{b24BQ!5Q>}!dbJ5uwlWv>n4S91|kEZF`85CZCR>7T{lHJOgT4QC`&`ebnPM& z#^v;2JUT`~AY+^;#+YMZMsjx^Eg~0aR}Npi)|cEEh~q9T`z!9|8-07-zC7>FqV$GU z?K*F0cc8lNC0u(tZbXZ3x_&lh^tXQHZXES5sgGW_nU>3#h3a097cy<*bScR|n{hy9 z=x|`!LGX^e18)uFlzIdrYfK7u)d~~g6-}-0ySnK(voY3NudFG=VlXi1j9F(k6G{rv zS=aTUsax+0kzg$sW*{K*iv^%jP9a2VopVLkEKgp&TK2jKS{5`Z(NRv45K&WDd!-Vy zrr7nUnh|54nzmD*kQfxKF)5`OqIJe9W#+=W6jK%mIfAel0D~gx-Ov#^G#X~yL>cp;{kcAfgmw+PMJZJ&`aAiUEhfcXI3B)4LPO6&N@}? z`>t*pG7eBBW&mrA5fOvPDl6rAQ54qNoI>dg0uuSU?# zjt4~-K?@xkN91!3&iSrwFOENxSU-4p_p|3OMVO2g5g{9C@U>t6t}lni8d|%T3|t12 z(pm+)Nki4`qwoqlbIXF*rn78~?6iGXljCI$8TJNLq8c>Tl9ilOa^)y?p$#FV)Tf-GQ$}mNL1RFph_yCl z?n7+5wyXvqc3HQ^6`2(Ptucg_42IARN=pcRj6KkD(R$-UYBR&or^iQC)AU`N4C10s z5wr0wiXGU>mz5wQdG88KZ48|Se*Ez6N1uL?SV{6K>2vt>{YUSAi=3x5O8AO5eXZ-j z4R816;dyO2L)_3@u9Gz`p*;QvD3@e|E@h< zH!ZFk`pld~korbf83ZS>-h=)$@AC#n;QrsW=!iccZoHeUz z&Ky!s&N(Dx5OS6jsg}1h_ z#u66}x_a3(O&3JO^jR5rQUc`eeA=8|EKU}wiBfrZ<7hM}S9o2t1_aECYZ`NEx}bo| zW#^6C8&#Y+Cst8m&MYBG-NvLwIkzE6Wbexyqc3b#6$ZptMNTorm@?Pe&+8_{%&Kjl zCY29}$LGtUf`W8l(H7M8v1!_(9L11)1&Nteo%04DaNHjc3SVNM+R&r%QQ&+3#_trv zeI~*cky2%Qx7fW)Tw>yGayRihYyfWy8PcoU%=+5=RW;u>`EX;i)N2T$-agsixEXDw zyzVGJuK%yrz5Y#L$-2hfo+IeyJVI4%69kEzF@=GW^!^y%zE><}%V=@v336V|&s8~cdA!UiFM?^!3Q!adgh{lpwiY(*t-gMf2@#17W9$G`9 zGAyZYmrY$y=Swt>*(sW;ggbkCeb>G|oxYgGi95HF8EntGwD}8=~5#juNtf0s&A)Btos6;moD?!r56<3kOMyu4_UBPqStF zlxeZHEWxn-w=&HfT@%(kure>ZaHuNg;| zx2nya(G^Tv;kplSbuO#{qgOgZ+#Z~*(z;Q>4$DF{7T!9I6ORujc&CygLlY^d3=Axa z2rSqKPN{X?0w|#(XzUw{CWf|cmntBledtxy8dEq2#>CKBYlzJJygod7u&BF_pI>xM zdvv%5ptJdWqV5(=91MKuLs|K%D#^gvtUsQomkXWNdE(*zo|z0Nro5QWiLe}0 z&fCCwRVwA2Q&Q_at16%pL}mmvYU{3DnUh!vL|4^)PN}fY5m`gcvh#Mt%xxE&KCFm* zhKw)D#bVaBT=;Ta1@CMDTop8|9L8KYTNWiD8bg94qJ5u+zOxNYagMn}YmA31gGp48(^@40M zUv@7~=PzEJzw_231Y9oai|O3osZ!Qy%^?@G>`lhikQ8BA*D(sK5s{+sXe=W2G2|?(xRAsWL-NiQS*zhV#~gDqmI`N# zf-x9VR-is|&Kz>PtRs0cWDH9I3W)W*9+i4$r~`+}=#fd1bu^p|i{W5=c!;I%>cBA# zilKMLIc&?t!9?DCeD7ubGK4&N@W>1&OuD}9-_}=x+i$pE-I3YQ>s=xnbQ3m<8`Aad zGk5E*{-sUU>&w>eTf%E_>&=(iXw+{~nP8)3p4KXeWIX5W zTv4}qKI=o!0V^|D)fA$rQ2_5884Jd?ZB#%7j4>BQxkA-j!eQYn@5I`ejW0Y18&G2@ zrYK5HA481HAw>3|j%ZSq&f4){K!}6Nj*99|Sz3##LZGA3-f%n`?(G3W4slTUR&tIR zs7xs=<}*qB+^tpl%A}R-e(8iX3msL-dxCQFnzOS72+sAMx40dIEEb@%pmYu+Z}W;C2$SD>5o_Uk0%HPUXYx4)h{puIOKrfsm64u^hk zP!MoR8Y1;clSl+M04SIlL=;d2q0hN+02rNR%ptHQb<4I#BoYPbOCus!mQ~C-F0(Z_ 
zYs0IHloUsmH#v4eE_!@8R$!hkLY*z;=n0PY$0gx(kv@D;zntY>ki|M9&iZ)+vw5iN zJSa7Rj7H-e5=e?E5mgFg3)9Echv2JW+eQN8$cD@zW^bLdZq;~nWK#&cM3A?8#walfe?fEc-YdA1Xp>7sJscHVf*`q*->2 z>HAdA=WQ3j`nTRV{P<;a^yb?ur;7sLTyuJTM@NNiw#?;X+5o$6#7*tgueuwxq1&D% zgd1}KyebcFZ341HF7b$YR^yGQ$zc}VAVT+}eIvUdREQPGC{;&lG{qKjGu&IzD0 zXWy)__g5ex6UF9@xwvML$MDGzJ~ z$~ke!oLIU(rpz&?QB{yKDi*#{qZ~Vx^7y=;&gwTF-uu?$quh1vGNz~ooQ4i!9m0Hi zb{gBcNDgZ1CSXpIQ%-zmG&$Jc+dtSBKmfkYA-N5c+2pWqu0MsF#@rpq%vO_pXPf-e zM&{N}*=@YvOjEc{H$+&2*e-qU*U!>c%WRX4x8=CH;b7Vt5vZ=c(g3!JskErVfCQjf z(2-@$f+~P1qbDeQku+woSxruPP-0eO0p=9gMAj_goI{Yl??ywH7(&Qx5;EwA#qm6x z&U04SAD6?@o-fkzyk`Nl?zD+7r(xNd@uA%t5|Ws=~=Y!&VUdA2#$wCB$KnS0ISB(D`H4#Q1~pY1UYid ziI7NTU~yPjgD3>bf(XWy{W6~|;~)Ouvu6*7CGf@hiFNk}g>#My@A`V4!(6x@j)w_p ze7Gl6_8HHo%hThtZ$5kQaIcu7MDT=y8x5vaG|yFb#Z{E*mHqd|Zu{k^qZ_tQxM4fm z9*%E~h;G_THtC^PHsUX#lEAIH&&zMborec@3gfGm?$!HQ-~ZR2+LJm}#;Iln1Ij8Y z#>fhYOxX5dJyB_lpeV%1fF=ouA|i#vb+1W)2o;h6u|zDI6NdybL+NZA(zK3EkWpz4 zM}}EGK3+@%jtgYg*RwPWq^M{`>s7Tz2RWLLzGy%Ebn)J^iQjwh;KBQoqkT8<2555k{+*}KR%=QK zQ`SBKP2kiyNO=WX!fTO}DAGNFyD_svDN zOHYQ)GUFxq4%qU2rY5+x9!SsQxwE{qftg~$w1A_t_ zO{$XetkJV3F9YmVlv$c+drpqH8dwmxSaz?c-6CREQiT*#&!)}zdKoSj%V{(POP>^h zQj~fbj?daIX<~7bOJ}_IXVY3$oG%U1a8Sjpi6z7gAl6uN4y`fCsH(6KA%PHT@T2o;Z}iPy{T{iaDYxA8x$PzQMd{F5MH6Gbm@ZCE zr!m2M-}u(s-~H{qS0_LF=l{>0C*OK{_nlAkAfUT05jV#!tKjNQvC(Gw?@eFQ?ML=6 z_%i(x-tvYx?#h;qvfDSkQb=9q&sgXy-kFXyt5RGi{p~VK~InLEYukDNP&G^vpyd3IHL{x%^9DVka`5*n+Kl{#a{Dw#V@Q43m zNIa>$tl&(dU5t|lPaZvecK4lcmE!^Qqc1-E(?9s*-+k}hUw!Yl|IVrUOoXrk+L(<% z)!l7UMBtjHxmhw^Um>pkxLG*CNVCWC?ey58y5b2dhu!+n;xB| zqNv(Fhq%5vbP2j;xM;NQa$-oFmDM0BXv`w2235m5Q_-FFWlMlz?p)9B|&$52!5C|-cTTbZO568HSowh{5TG$8=CleYNijQLnm$(EmfnTFDGTol*9T@rMT!jb zE?6q=AB>$fkDfdx@BfG2{ry*q^!9iD#*;@69@sl0Z=Qer(fQel^S&}jBlK?g(d&!n z|K}ghQ~dFdKK}I6j|hE?R0B98|IWb2+2BP3R&BxZl) z@9Y(Q*Pc$>MRG}C;1B?sKF7Su4K&k6bI{W!bxcf{vSdz%2-ks=LZU;DheI4VkSuj4 ziw>et%+Q6z$c6WXw}!+JIZLQamK1Hsrm((l`z&gWt*k?1Lf0W83-=)uRXH9M14mq< zvlc|V5EpgmGc-}6Qqw0=xwAJu9G53ey{y}W12Z`It>5`~3}NoW`RU2)WVSnT- zgtpHSO9nKQMx`@E%fJGTTvoE@a}F8ik-Dr2AhCc#X;42b5S})s{6EzNMuA! 
zj=>jxSdelbm^wBoc20FvmLc>_lEf%RL&|Lz17MdSGpNz53ILHsSQ!8m41zaEh!zc~ zB2iU&=ws8Rq2s;5hz#{90%FPFOj!&{w6wxNI`6%j$RMaqA5^nJMHOpJS9g)+Y_=GA z3_0%&3sp77E}LZ=V$%yald|kh_U}vv4*6(raQ|@faygryzWQ(f!GCT3~cFfgE~Ah7^Fil>3}3 z=ME;L#cT;+d|^Q~1p$SkWMjyc!!B~&)os_;U2gj%h@lTDCjzjBKtz=LD8j%fiXIS0 z1HY{MMVHUpybMVREJDsg#yX>S4ks?cn+N5q+3<^H{`Aw2{{DaYUp#p9hI5{Xe*V#i z%o0;XqQm`zljGMRww1$S)@o4PCGI&D9pEUv1~OY6a|U`J3%n3*@}x;r@_3 za@3wRp|T>3-6lvK;2;4T_tRke$(3Lv70%U-6f2pJ+JiDhNIE9P~a z&QcH5b>a-&*&B;$(97-E7{}U0yVuEeeC=ogn}Ns+^4>BtgvJT)<(^|!h$$xqMPOD1%&NRju2^><1VBOU zI_7a@Hm1V+96AKBmQq|Ej@-epF!=wo_HNCRWyg6~ewn$}-c@x@pT13Z&-7e?0Wg3; z5TbY$NJ5lMQDI3DexMxTP=x=0e~ZJv!cUISgB=cq{h+W(Te2KBX$po2fF$MyOm|OD zU(T(n_GPWi^uyY9x_f3o%9noY6Q@ttsmi@p=9k}vhGCv7G^O*w=IzDv?eotr#?yTn zXWz{>ni&n5WtCH6S}DT~QSaWmc74^ItmMwElX;%r|JkFnfj)k=d%km4ZDU3?dVV== zYm(#J+4-Gs-LM}#=zCp}L`XFg0h1YgeY^sZx$VYj=XFbOmxlfV6q#qcW8>5KMB@p)wRW%lPUVngE#p$G?i#s4>rhq9(2>e<*2RwclM zxrUn&i-@PJx9+^=vY%rrZlw|_RU@i4R#Ji@hNziqk34h09o>AE}z;w5{Kh%c?4_Qfyi-Cj1ad1bo4 z+7|@RXh}(Jd9GAQsv5UCwHy?&o?jOxuySW8Ib5mGXQ)Urs(6 zRj?Bmi=YS#fs1eijm3HPV~ygj!khl+<3~?S(p2%}eB8|hVF{}U3))TLjz9a!N8kDJ z`zb$obV*Ul?W10g6AL#F$6=g6mdH@1qb}dSdxB?|yWQSG!aesL3Df`Ie-K~7Ke{ro zLHvT*Dz5UgKKC78*bBUT!v0Ntvx7kZ03ZNKL_t)2Eq=lJ^oxqT&uLS?^jW?7qK2>< z{qj5a&R@U(!R}`r@!Gt27!iboc)PcHyPv(3N=VY=79PQ(Q>kOMpeSY(x}V93nViA< zg57R@z7M0&6mBC2DR8vuhq>%#pB*)V+#w(*9E^Vsfl%QGOYFwwcA^q|)Mq!}57xxK z&z_%)XqWo^Xct4h9QbT+r*m8sn!UC8)rf0^fE6N)a4!+$!2}CRNyDuadUkeM)8_H% z?!!kH4pw$)X{_!QY~ALQCzpTpum2P*k1pzG=hJ$_3rWGAO3L~&N}tpFiGTu$%z?B zXc#Apqp4VNCFZ7tC--39&&3HU9FWV|Pj{sR_loL%y06Y>xlCp? zsCp$2CRSCaaQA~Z@A8VB0t7iYzmd_kRhst0>9%^OJk+TWl@b_}_NvusHqdNQgT0BT zEM{0E!mvPzh(eeIlz>Qw#7#r$k4PtS*)kyG%&S-WL5T?|tq`r${<&bPy0 zIKQmS+$Zk4*sRnHqeY}HrImSbUd^S1+pW7_ee=$p`(J(c+rROq1afxdJve%g(WD=jN1<1mKAU;DshlI2U-lIaGV^i9-Ut9 zji)Kdi-7tr&!wi+XJe5NSQbhi!C^)o?C#BI#=O)3C^DnxxVh1<-0a@CzS)oV@ssn1 zm;T`SI9t?Z;}Xpfuw563=o4qJcaPHM_&OwShtY{kxwJa%^~P`g!5@C-H-CG*IiAL8 zwO$30g*#*2Z8WVS<%vmye!*Jr5TX$D#XA>Q8zEfnuV0vZ;6<%qv3b9$)>r8sI5;VP z9=g0L=D)U}0$<>ce4beS`QnSe7U!a0l<4s4UzN&y_Q{7lU#44k*JF`kNKCz;1{ovu z5>8QqSOY}v<~2x&WvcFw1a8ClZl=k=IrsfpHi;Qpyd52IOZSu*r>pz`8J)e2!wHNLcYJqSS0z3uRb~eKE^Q|LJdk>*n$5Z1ne^ z>_2)+)l6h{Ii)w>{mtL`{eMg+w@dZaH88+n1(}3}MhM7->Wi$`S5+DK($DE2J&u?7 z7JQX0M2k7et0@MJ^A?97qL=?M1zxqDf2l}4UN|wnM9BY=VhZ;w+i+S$q_4i_SMBNI z^QvNc`H=v=pj-VyE2?!KKiil6wq8H3l#-})>XSi1ob+$~>Th2!=i<*CHak)ca|%%P z7>f~0HJ>XPY4KtajHuNox4A|kR&xb{u$cH3BLX1BYm+!eI0d`Ah!7~k;w4Mgg9kr1 ze&LOi>lOO!a(KS8(Va57O+OV+9Iz)OWLc2-O>{n}Tqzj6DT_t*a0fA!}-{rOpOc&}SN{=2{T zAK~c6V8P4;zG4}1Fi`-;1h~YPFJE4Wa~z*zcD$U_67FOc=H}5tt$D%hJ-%S=#9#=a znFw4tc+-{1=(3dbm&qtsyYnl*nwN2dc!8J3=QU3Y7wVPWy1%60`&@tUi>NK}r8sCY zJB#(F7xmHh1#UZOXuG%GE1O>~o?fBa9({1_W>3%x}XH=)D zx(7H*7V)4`&E1(I$-+piL}uxVx-{mY*}-VWWj{@_2k_3n2j%7g&77%p#B$HBx#;SmZ- z1f$PIy|s717Y>{qRx$2wW;J3irPf;A=XoAx8|E^VX)c8TVI~iMachQKxH}jrB~?v; z2sd+JfP^5dDx#{QDgq`K5oy&7PAtk2FQ*l~=zm|ZTIeOD054tsS9Iem)qcGESkNmH zyjQ09FX?0BiY}|NIlgiC{?Ai4?0oO@@lj5}81`Le%5i>iar5?3cXDl0>Sc*aHIU53 zJOboY&BEM^lQmo&Yyk(=Xm2+t>|mz4%m&sRxp^|Jn5~}?b-Qm3JdP~PIA^_snp$MV~J|*X$CVTMux8hs{CwUcB5^_ z@L+OVU_|C7>oyA}=d`pS8}WPBawlGioKW44kB=|*5BZ&M{>u0N;VhZSn7GmR@j}*K zIhx69)F{CRAU(nlLbGaC&F3=DrA$`M>O7ZPYprFhrPNw2in*E3X7%c=TMov>3Upy- z1p7h4#>{fC`eeb?Wy{RWoO2cyZV_s#%92ycBB}}ogWIXmo*Z{?eY~cWa$ zU%htg?(ykxc{)U(TGW;nN^A&QP$4-;4j9ZF6pefk?M$L8xmeriMi4nw@|IxY%|JT1 zG18n6K}PJH66zZEyWe^J+jqK8{;G^44f7n#Bs|pm^m5uHW~Ql9Ayivh$6C!igh;>v zw4eqdS5*h^XO94qc=77Z>LP?llq}Hx>O^4XT33j-Zs$AKKnM%|?AiHGK6vop+4jBf z{o$M6`0gCp-6*`a>mg{aWjpSNX&R<-vE5$o_hTu=Ou&NnEVkc1s#$Z@Xs`?uF~i)$ zqt;4alX4*_bpz7g4)tVK0VKAea 
znlcEj*D~|vv?lV18jBZ(M@wLXKo3{^RqO%@5aARdDmmv>%ImK0QdSiaG$tg2g;@Zw zlv;Pgu-)w_pd!r7Oi7egTc4x*5s08dq*W-);lef?E@IEt9DRwrS*AO91 zG~0myfF}LB3T0^pRFz_#`S$fqCthuycb6Z1^7w;?PtLIU{qO(&z^naS_q)r>i;LY< zW~YK|~o2$OLjB!h@r)=mrGLEG^X0@4VF6wwZ<0M+*Nnwyzf zgu}xFAh#eAc4iI_iwb~<0Vj&uRs#)YW?U6nwTNXbwFpWF1*Wt^I+pve{mfuEQIn!H zT~feao^0H`#n}_N)k;M$&F$G@a6=IU3?PZBFq1P&H8YF$%!a!#I#m_1h+&+9LO(e@ zyVDOA$UQo)?_KBL`=vL2_d9Q`WhmIbb7u`M zH6~^nXBy3$2VoGmZ9WdIyeKydp=x2az%vZCK(z?K30B?WN~ZvAXqWo}iAP&UkP3j1 ztb}@YarVjMXOB*&TVMaR*c|=j;iJ=G*zYc>0&g#4*eP~;gCGl5IGWsyK!Fw=sj3QrWKcUP7dx=! zS_-$QX5L!HHc=QYeRMlH&C0TI2esTr{mB~}~wJZvdg zM(VnD6Nw6gmtME!ZX&K03PB{AjJT~@H09&t<8j#UxBCT0b0<)33D)3tb{e?=ZeUap z&1Mb;i&o%t7*6`UdhLz#pZ!o$E~WHYX(}FMu(R{a_{*O{@bgCxUjOcYc>A}0@8Q4t zAHqgYwK0)e!wyF96-%KJ{I8-RaJA$69KY2T3B=XXd71MN&J=ImIsUC*e&^(eO~;S05#Yj_Qhc4Eac z7cciFlY6+EM>vI5VhD(qbL(KI$wZ}D2Of~QpfED$g%Eco`&A;U-Ugyo*Hv&SwRlTL zrIZq*QTe0oLHw{mHIEw3;A&MRF+sr1IjTKr5kb|3>w<}zcU@*~oQd!@SA;T9tGPQR zh%kYhJ-ij4rfS4VQ#*ziGBm-hU8*f|P$Hs$Bf?k+AQ2_F$8gHR=pFAEv_*2z@VRA>y5@ij(N>IBp=y?Hh9YT5ICG*_? zacld++X>4pv$9z>+`e`F=AGLt(~hTS=kKF>>ieGSQDSp1s1MI#5_FKXUg)2f;baOlt zJP=?H3XjE!pl#fPTP0NYhH{DuIkf}Bs)Y?TdK&NpR!k5bF=Is#P?WSv+zDr4EE>2xu!>73n*YCaa*02B0!_ED^@4opD zSAY7?pYih#dJ-2BkeL(wK;V4IqZNoSdJzNAc=vRGD$tOnSiGem2*5qyMglrLzc?-O zw}18fKYagR5~rhFTv$1g(%qYD*E-mCJdIjWSVPH+D~Be|DLRQHq;POquh_%Anslrx zK~bw03KkNHE+Hq>8U(PRRyPk1GIt9IQ>Tn5407gM*H>SA1D}nTx8MEd19XL?k_Qol zbIR_`4}v4Yns=#NHDf|vZ!FB*2!NS^%sd1jl1`GQtSaDfo|r+v?CwOWN(&1qM8w10 z-Q1bGhA%R+2!r6v5NQaih@4t23gblJFsEuom=>08J5VHu$Q@=KKrM&|E#MAL90&rG zqG5zCaEg|J8x(aRgdHrzh=>e~@b=hokg+g;WvPVb{EwG1lLAD{5Q5ni7KFp7+Cm*j zSGQh6(j@tAuNQy%N4@`iKkUP$l}(Za8dukkPTqKLr?;Mal*>Uk*WUPh|KQVq@m~ja zT}XAB_oKV1O0~v2Ub-yd_WQQzHymOEUJzHcm9Yds!kin)jl>)b>@UtgeE(;+?|@b5^ujewAddMMTy;NBCTP znnM5xcbQh1L9rV=JBh%Xv;>%qPA7n@&vNs6#~dUgK78Yi&0n3??)DpVzFi|`uTCU{ zLR2Zjmz7wl1po;mNP#VQE|U|dlv;^aGXkrKFi+LPO&K){^_yVwh-$8?ZYIp`*5w=? 
z7H;jBWOg?ZS>|d2KqgF@Rg@W?VPYiLR!Pb1ZFmC$0*u(Ep4VG4hIDZA&CgM zqAggUo;sG8ZGIA~^yb&`;E`J)5fP|SDRb`n?J#b-TcDAi{|!tv|hy8d_naR0~u zd_@D1nsviel*P^Luzq7|qXXRCog$nM;?}lC2EfE%VwehLjdfxL0CtCwGviNw@S}@i ze){|~8|=orH`Mna5SZLk(2WlHX3En2yPuywpWPbIgFFEVBBFt)R%0wo)UP_Q&y%@F zVmi78W=bh}#4yzo0U!m3030mb1@@&T#nGD9T_qA5w@0_{y*ASQ`^T57ZW|cpu~4uI zKxWlESdbH|$UIpzBo!#`3hMh~r;lylg;#TB|KvN)YpYoVU}E zQ)ULSxS1-0#er#_S$Hm`)G8vIZWT^h6Kg7FtsCYbVh%LWEtsWPC8xYf08of)tVCE9 zBKvWaB%7}DYUI&nRTeXw<~anV)UZgZMm5~Qu0en-JWL`$R8<&F`O&A(-hTIM4?q0zcAQ_ko@*)Ne#qKq?bLnM%gyW8Tvz|=Uwmpr4L)6H zXWTsDsn!Z=c}ip!Oz2jUGYcbvrl}Nb2m&!&1k4bj>b}z0bb4~sZC1Wob;q~v-Ms(K z`Q`AxDJvq0c(7GNI0cDl1V~6&S$ts}hUcuPtO0k|YQ@mL(fe)_>zg^Af@0I>T}1;j zA`7!-krks#By<)sCcgK*-#fGM(;xncPW$6^+Rk&Zj%CDhg9N`6+T3m;Gdjeu5P=LX zE!HX^P$ueB6F~3;c7{v$=*g$IUjGWRhNvVG&X6R$GLwofG1nU3dh_}RPo97LYyyN> zNfF$Zj!kFRw>Ym-YuB+brz9x}fKnV_)d+G`kh=(L5>=_@*Bw2%!kKBc-rRiaTiwz6 zdT(Fr`aj!3d`<~{?yH$B#uy8bCJb4v8+qN$g_REiZEs5 zoGD-AQkjj2L|CaUCMq%93~lZt(uhi&$R=X5^|k)i>*->5-0|gbKD7-y zt21lv^J?88lvyQvpw{9@-}-z1IHY_0Z~s`PU8mrbN|$|{!aSBE{{>0geTfw3#h54% zZVnd+kyZy0v8}T*BH#h$)LGciA3R7THAsY6Nwd1EmDS6{z%w)@nAL`76Y zOa!V5{4pmj>_A|eg{ z&H!))*e)B5JidfklW& z1=BQzd6yDJxVaLAyNa-&hDDtL4)SJ8&}jxpH5eSk3~sk0$Aax!c%PVwlu3eASePYC zF(>yVl7->aan89fxi;v-91&EQAzVtSR0HM_8twt=K!8K_`1b2EJyM=FS@xw6r{W;i ze!Y^Ml2;`rA-5@^67uQhjc@$U-yfg+Z2z+#X|ZYCpB$}f%fwzbLs!eaD-}v>@tZdh zs(ZEYz+6if^u@={*1fV2MRXt*W!9(#gi=aYacXfHv>Ucljq|}CpH0(#zI(Dd8>XzB zRkg)$=Dg}VRWJtyW6h$lsPkOS0V5CU!5(lzMAIO-nK4VwQmdnS4R_Jw8?SZilV=}% zL=p7*I{|`Cy$VEHHvuzZgrdgc$ZA4fEh0FG-BQ-HP9S21g_~D`PvaoKs_R+BBUBQY z-5nqeSa@|~kUOQEl_hDSNaj9U*_W{lqvo77F(Vl3zBe}}Bo?AY#?H&=O%`5j)ud&f zOD!BwRc6WRW~NL4a;M>5=IDgFE=8z$t<{VusV49`ch#(VHE$RK+`aKGa?&oZZ7vbf zC+wa&GeLxEM!eaw5VNBA@iL^Tl@UUY4|-tOnkW)tVk5jlmKrvoL(A~J*ka}Q=0Fwj;4W>dH`agZcHx-2Z* zXC?>>2;rnE1k{GbcnLE$>qTkT1>GFml7%oIj$rG$H4ua!kNT@qbLd`84<1R@g(u?ay6a-r}F2Z$QFAm*Jz z6AL7>XT*_2779_%Op$v{nYm9XrBvN(t<0jLwG@G;g}cg0q*SZsNt3Q6fk`DNLMMr@ z-^sb_^ELGxx+V6L>jkm7#XP1Lj z4<>VXBdMTEo5%kE}jh!se<$KGQe<$7NP+2*|N#(CaP0g`bT=Q#>>7lVCv zK0Up(ZH?KiR;%Fws_w`l%BxkEJFz+oBZ$UngosGis4O&$lL#}5RgWMi_@RA1aIDOX zj;@{LE|sX4+l!Ar`0)6x@5Bt!q)Z76AmjzmqrvV*L`e`t-fsS+1ga5c21pW>EX*v- ztu=RdcVh`cm{q`4lcYW(l!ciYkd{r@2poiBwQ|BD+;R{U3n6Y<_mc-TA1jJ*CGPu` zFeA{wV1XM~QQf2z5h~m|GR0$*z zF^h>o@k#Z)Mkp+$M9}g1r`Luv&PR;{nsuS9CeU0*kni^Mwv>~N-o0~!pqH2Xj`Q>L z-L6Cc5eyQ5!+F0ShsxWzes(dR z4t_q=uw4svgZ=Yen#B!H+e zVD8sb4ox_OygA4t$UKSxSVNB18)0HVRoD?uhaksbEM&XG3qXyKW+Z8j#vS2CiL#i( z%A6=7C0DWj#wR%d03ZNKL_t(i7+?`)5rT`Zy3VN#bD4(0%;ssHYN>8KxF!n)n!jNR zs>2Y|Fm1O!4^ihjIa5z;Pd@y~-EaS`ygqTKtLp^r8vf+zqus^nJceqD)LpyvT9@;U zn`OV*M3&SoP1( zo<05e;pLMLk-E8-p~}wE*=SF<^YcAjOcK%xV_3>Fa@)e1PtS2*GGe!!WtB*bZW5Us!=kL@EMi0`<8eKZJ1h13huEFpiXv59-_ojsm<; zkQxE0IuVh`l3UDemDi{y!ESn$wcNHRB2CC}V1%`u-{LzIK&4>vM%hcEa!`S^|E}Y< z>Lv>d_h>QLrGkYRL{@AjFX1VyLK8e{ovV2T&jVjxoL&Xmwi`g1>pba7#18>A2~1Wl z9zPf^pGfFD&+}}eJ(y|OqKpx)qHQ@u?kt=mo;>YkK{*9A77Z1=^`|?4}Dk zy7l(G_iXn|$De*M{N#Tb_wzW@=`K$9*v&M|HhBclF_^i_$tI)Vlsip%dwIrCVzFQr zVvbkM1cQwnEDP=p2#bQ}TD(~Gs8+3-fy13V5({@7gKDj+NmYVSOKrh<8nrl)2m(T& zF2?QI<2qdgb=zT_s~n|FthLU=Zof&Hf&%2>1Sm7+>FMJ~wayXh(3@|*i_KA=j~{&a z(`Qf4%VeN9-mLmQ@3!0hzD|e+6d&EV{my&8^yWAJgB$l=*W;U|Ti3J}3Jx!}TXcU>vDys++fVL?VuzR&S2azz^x{rkqqrZE2!a(NYuAU#i-QB%y*i;WsVKs|PP@y;5113@=x^S= zcK>Tx;*%f!;K|dcrP%D^QP)C!PU+-W6xpT@+TD8N&ENR`d%ymDJ-+QrI1nBB?d1R~ zrI4iDDVN#9%S@J!L;A}p2O)Ui#AfaU3lAe=F-Wl{z$ex8;Nd4UPQdwQbF?}-3K7L3 z+ycVGtD(#$Ld=42t92@6Kh#p(j8e`?mBNT+4j-rSv(KJ%Z~f}Cbi;s^q=3TBy;_qF zdYz~Wa}sU+#B7sS1DK+5Ivnl%ljO2!=gbYxsd-K`7!u8`m_SAWkEVtJ2O)yO`AX1* 
zXj-3_;=3EP?DmXDoxvPP#G^GA)=rP;$)5Vuq%Q7kadz=^mR0tao&;iM6#<7u>#K;Fr~Tf?T>$8l9&LDiqOix0 zpO$$BQv@BYuZtqs$f(0A)h?w?clYi4?|nC2d+ji&ya-;22$#s(yE}-K85G5d%RWum z+_>@N;o~;53MZ996jnVDN!d%C>ui#`Zga8o=g)RMf(7$@R>}m5QZ03TmlG#th8f(& zgHnRKR}Zfg)$3d>JSp7V<~eG%TKB{7>}>b;?|ysAsZa>jh^SVZsCnTJn!BS|Ekw-l zHc@z3JK0ExxOtbfUz@lgL}<26Od&bpN&$nW2!$`PZovohRElud<`6+IE6{@){hdUU z2sepIi-%%1_i$Uj0+`pvZl;D;lS`}2+IXI z?KsZDtV!J3qfsTiXLa{dhn^ycs(F}OtsDWk3XqZm%A(8-BjY6m1Vmn|7n|K9!kciE z1fsz4eBO`GgmzA)PaIT@W3|~#w)YaXRBpU1Z=4u_h-&QwwAnZ~R(*GT^6;mBvDpO3St~n|wm{~*`)!M2t$l)Z! zjErE0aZ*~Z3-32Kn90nH#LU8sL8@r@s)aC1P-G3NQzu%WsqZ1QpT-LKqSPlGr}qKmFw9HO-rqO6j^xUFy5eZ5tLg&+urZbwMzo%o9VC)*7Vg`rS8j z*7yJ7$J2g44U^xv_qYGSKU&{=>sGb5zwzz=>c9Li=*ee}X;|gsJNLQYNE0uFQ_w;g z3kV|?8N-2P?@mqz0w+%!b1S8cp6XPaIwXlSQ|%f-AqmdX`R3;4n{VFv>U;0L_w|eO zv;Fg@5C7^frVoFVWDHnJX*bSgyWi~x3tAtoHRV!h9A_ZbIR|qw8mAf^J4wiDa;+zC zoxFCByCe1qP1WRKZX^JcKoH@GAkI_#m#$wx=#UO z4pb818f0$mP_bGW-I#CwoqzV9*W;TaU6FJ|bldCKerx;W2Y>u$fArti%g4hw1+lxY z*ZpoP7nh?`X)^)Ad>kS$ISmz^;&M0Lk?ya4_C#*pk=%D7?CgXZ&f!8~R-<~(9fBIQ zkQx)-NkwtsZw1Y@j*806Apl7$HSt1LlV*TDg2R=C!lZf@N?m49ooAcp+;v%riBn3< zX*P0q69FN086xDd${FExE}Z}(5hR+4SwWk=ABTOdwe{BFRW)@vgUHQ_m2ntFL_|uI zqvLf-StZxq=pMZ$4niPFo=YJj)hxmy;cfr`b$1IYW%h`dPB$ecymG# z32mz%6ZfzE<{!MfIr->+`fvErPq*`UIq<4Wh5hmQ9LUAI@6jdkVN5BxQDO$`*`9y= z;ptC4{Pes3fG_XriunrE1Kbt|C6%>I8f z_NGskWLKWhS?;~={@#l%Blpaz$|{wl(Ne9}(!10%tzpc7FMt`?dh$~MSwncgla1NrZrO{Mx%&TE|Pmu!_r&ZW`L}sb6>`h)<35lU$A>7 zMkG2|1#D!fc^k*0P?YIF<{ePcJd#Dl36-O;S_qM%apH(X&Ec(eIDGi{!(Qz~s?zC# zg`_=L)%#(W%C>Ay7rQ$?M`gr_2q~nL;9LWvcANKp{D;EFd%0g!!u#>U9k*?>yD$?ihY)J_go*@LyTUM z_`LW(G*@;QHFGZl=V(BOW;j>ko2lgmg%(tUWlG7GyQ??8`0-!=vK3V3JGXAU^N-%F zS5>U)w7U857|Q0XA}}N%@WsA@B4S9@s;S!-PmUfvB+J0@t+&6pTJCH)0xA^y&9B~V zvmE#F#Yz7sU;avnY-oxwGN~`5`6ugN|IP>nfm}xc4540TUKD*PW=LY93}nOuR2)@J zg)OUfx$DOnib`45^|mb`2Igz&?JtG9uXhiByt^BopGxjnEfV*Z#Q-*aQA1WZS)bp% zyGK?1=*7vV6O5^AdlZX^0T2NTn3#cKMJA0z)nY{yQdPx(A_b93Rs^nMET#0@wiH2tKIbB0 zX5HD?N-55T-(6SC%!NqEMtv?bF!j3M)ayq;9mPaP6)n7?Cy$7xGzu|#D5OiCac+Sz zgRfmxZnjJIvNP#W8;7%ohyhApj1p#6!qgnxdF@w%ruOx3eC6A}_s+GQ`tkGUf`l>F zQMTQ-Vj2{r(FozRZPfs)27z9D^dYos%^5;<>-JqlU{e5%k-q(%KY8}>z3=_`zx~QP z-}=H=-XTUb2Z9d5QU390P-!wk!cm5WfC$-z1`_Sodxz!Yj}XnY_;6mCjhtYjhM+*m zArSb_ph1-aN}H0k-7y)*{oODB>35Fq-~QoW|K<77^UbCuW+ZJYM1u1+0}-O7i09|s z(aUwtf`m+buzRhE%K#855Jn?1E$Wj5hDC*}ygu)XApplvnTh6B+7ghe6p;l12p9|! 
zlLEw88<@jJ)C{=_p<+%5K@FR_+H5yNjUd+crQLSEJW?xW0wAtcCL)u>nvLoorUPSS zl%^Qa9ZuUH&ZrHfBM+p5y!bFP)6t#~u9(Z)81N?jFXu%}8u^Hm($Q!tGze2Tiga}M zp4_|#_il9QE>}MpH0lqU9YGFc!Hg|j-@lDkKe%`6kH7v--)-M}`S{t37t6hUhNH5BP6s#8sXdzUq zy0H*az$(W5oh7IssC3JNy&D#Yee@XSqFf4q9C8sr?0ae34NA_`!L?3#6hMyRSU@yVf8Nak{6t^h|gKcJVK=1?!2ciQM8Ytv|Z?o~Hx zk_e82Tbt=HU$`E_nEGMxNuuCGcmOhNc6V2+<^F0}`tH$_PhXs#K7amVv0o&VGu*j# zJGz2o7$hg*`n7}A!QuAVqpFJST2YkHfWvCFBGUjsjHae4H*eki=l}daKfG~+N-t8F zy=Wu%;#5;KAOd1??3tE)VJwaWs*2c??%uw;irOl-y$H&-FXz2=3al6c7cdbLvj(=k z5EiVtUOzvQ((kNRP2`hjAD(Q^`_f9bW^eJ`|M1T~|D^|?e)y?o{lWMD_V~q#ccWSi zj0$QMgZUhuFj6V&7l?>fBxfb#Kp`aSAu^lEX5FWh0#*W6I+Ls%n#e08HUR-B1s!DQ z;R*~z)xd}%jQVP22x$h`nkl-loei)$05Vk(K}0e#FrP!Ynhh8t5n>=537KYIOZvRs zC7rxK!$GYMM62cFb{Ere3L)4**a)pO>p$dqxe<(xtZ& z2r0&|ef{fy{%=0c)SYiODFmV(LRc)CKm;o2N)~1*@SlF?+kI~Pl3iS|bZW#?=0GAw zB7;B(1UfF6Rn@Ev50=>K;QFeG-PSN5i9`Y>3`l*^0&IqA)}F8XlP8%GIUcXO+-*e4 zNr+@fgr$_gK~({**}L}nukOG7rQaop-~7%$`)~i7|L*wcMJdI}si9^=B@;(v9yGk7 zt_X;f2m+?sN#7D8DMbY;P-`YY4ji0-XFzC#!Xj2oJ!%dR3>y>R&_Ea%IE>s?0#wS- zUptfD>B0^@88zs@j43!&!{zA)k~bnDt8>5+0ZdVeiGUnkg$M@9%s6yN1ctyD21^Ry z)94|m52xxve0g3{;+WcBRi=N{mdRgjip_S(fA*4p4k89J@n?w9n6PeEcW&QUEa>sk z`SYW*^?9jMvs&#e7WL8bmKlSo!weA&2>|H}Z@>M$m^SNUlM?H&TrNvh?JSppxuAkU z8L_`9B&I45yN)#zJ7QNvT#W_Q7fXzY?3!SV$XG;F6acGOEpFWXAw-bY04lU=%Sgda zdnjVL7!sKl%Pn@B7+JcrNC^FQQ8gh|Db}ji^_>)nAp-jofdCxhVu{NaFJDR#0tgI| zF%c#-1tS0is}n6!=){URGBf5ZB8uqqKwwNB@Ld5=Reeb_`5lwIU8h{jukxLOB(V|0?LPaogu>}7}w%h z4;MLj0L$Xs19ZfhykakQHEP0)j%IVUZ@OZuFghMzv6l$Z6>%E1ug{_Wcj9z6KT55HfhSo*FAAW~#vQxp{y zG7(Ww%_RdOnGJ+~LMB8G1)-GUum>g&a#>uF5F)wII|l%$DaQSq-0YX5kHs*NMYcu6 zwkQ#n3B*y}i$3P%tKp+)Gaz`5gG$wGmtltGMcsyN5 z?qdpFw?)JhD^(LIA%sEo(>-5HkIXI-;)P==CB^6_rcN3~Q)U(sLNpuZwSK~gNkkFQ zX_Evv>Qj<~puHYMM6Yy5fsoJeyu0KeKRR?>v9p;O#9#RpoO-UwL;O|mnm{h81`rSk z%-FK=$Dh17esNj{NQss;qIQui5iL>`0T6MJDiIJv2=w~>*M9z!AMCGI$Im+{<;Jyx zRKS_f(pM+Os7m!3l`*ck5^kcuFPnGM=kW(In8x^5Otl@g&5ftlt$_kAxSfkU%s z9Byu^?Y0jg5VMM^2!Kja=e(PlXTMT1R*?|;KBMqq8tF)({-MzgkM&>{v zTE>`C3bFKk>3bD1uz*hcz>q2cAa-3L1c;tQVu&2r75?kGE|PQZ4KP*9s;W|oeO~}H zYw1<1)n1iCCmLeRYNa?3)(D{}^f|X}OB6U(#M~^Gq9vD1%wWM3Qc5l*kqa5pAj>1h zA#k8*S`#+`nhHpVL5}*30RjUiBq9(|D=vH6=boAOcNfUPbWEynxK)5Dit0F*r>k#u#JLK|dB%CC0ej_G+?Pt;XyMjyCdN^a8*S5HoovG$4s# zkQMVy(4fNVqk0&0vR?I_H^~MH~e%tw*n}mPnIleSuo&7*y zV?c-;n?>`HZl_HR53A;55oFxb9Q=mM8sgBSu`{qXatvw&*o&EDCR3N`Bt#<-m!1rwmQqj~RAC3E6(MpU7pDVo`Rx!X zrh$4t1(oTDs9FeNkQa45;>ex`Q);}S9(!{fyO%+vX?!Uj|LdTaVV7-;he~1WzsJ`B z@9{5O#c13^I{Rbbk>GC+gou}1z3oa#Fq@L%|A(8|R1)}q54UgJR8=IZQrc;jlDl%} zK2*EqbldiuKvpm6le6v9v+d&WO)L4~Z+@{{F825KOifk$R*h|u!hi@c3xsH*rK3~> z47%3_@6l&X}>AR|`^Iihs9 z&80W!3Up$HATQ)#YIgSd2x8Z|MXGc_jOf0d3sEB_vD=Z^qb$!{Z`sD zZi4jZUag>N5er03z1VEleeOfx$SgVMt)Z!wLIaA>dJ!G^#Nk`<;B%&V@MdvfH+Efb zgC;-KvC`;MX>u!>RTDT6Gpm)ZXfG&6Vg^W~bT=KJte1y^ILrcv*_|Mu@Bh&V0D~#I zBCyK`rWgr`L70Y>fr^Q!ngCMQ^a z1kN_^2Ghj)N+abp^?0%`z1RS_828(gl&sqsjiTFObMnGqw0I;`V1W3>7vA~l4}ZWM zH`}ZLr|XmI`hy!cuGdw%bLX}hsO$QSdx*tyb?w@Xwp}BW7QA))PKq_5l_F~70{p6= zifZ6GfTDs(1PtU>HD-fY>b&Dp|ckT0gIh`~ZN&dC4(*$jzp-GA-Q{Rb}|e^iQXHeI!I5UQGp5lO+Vnudzl z(_lcmcmMvA#~-?H)b*RUfr9(15FoL06O=|5ju;~kqKqRqXL5TAph!SKIh~v2-n*|?T@Pvaga#%!nuZfW7$j}g7i~!3Y}EkV*o9v+Fjzqd%W~(n z*Q7l%J7bcXogmxLJDAQssgh@A!Z5$pkza2vMg|Nwgs+0%Wxwne|9&A z9RwPEw>K8|#Cz8c+vl|#ds*&^vKT0#0(DuC8PKWWh#|lzD@jCERn0T$nB~VJPY@Av z=}YNjjKgk^*r(!Ce>0Dr)A^gnTCYcEQz3-QfFu7fbV;y8r@x$x)Z8bKC;-yz*Ja2K zgURoOr6H>L-u}{Xe%O(sKZ1ya1}Ov>;k=FqnOY~WeFKx^B2oyI2z+@MV<5|FGVq^F zLkON30uf(s@r;lo?!WQ5FU$JrvJNNfZW|WM*rci+Ayr6&^z-n+2gwqRA;{8U;)za{e7(oj6Wkd{W7p4-@82x9;KqfHQB{@-c{x zyP2sA`!%e_UD)U6|E7lg$9YStujE&P5vrXp)WI 
zPQnO)Kmai?6BCh08P>L_3c?`_APX=AWg=&26Ok%U9~d!A&wO~LxIr{6hOpR8t3y3r z6XSN1-i)Q+egQv&XMqC-|%$I)73Y<`IQ%arxq@OU)!mn;+t~B8O z!``Z2SuI`(!_$jjI;I!r5=F%A4_{rWBcOnc-eT(OdFQOU0u-5$yAJ_)wDWU|0L<>U zDS-#c9V9Yy@oHouQw2mO_Vt*{K=_9VA-GlUFsPrWCjSUDvDYFQ2?uRn_KX zyO$c~00S-cf`Edf5wWOk+rIBhF7oWzi(VweFrZOiN$3|r@waSvU;X*P<_s7W(&`3K zbAEpM^k|)d0`hv>^`b`H_gT9_2}Q**dh_~hK0cg^vx|lXtywIH43ef7 zbi;-ztq*NKnNCy#8Ay* zjLdZLy+mC*GNj5Ecrio(D5a!D6#`G-)M+CSu2kFr3P5f@y10Jx$-}xmdx@lVQxV3& zI(X(~52#+4Za3}m%abCe3MZ#W+s(SJYqLS5Z~hjS%B(AX_7`FNvkFHuEYMT1)V%EY z7mtpfoaGLe-C1vvJ5>cmfKoKa>gLTmjG>6m-}DN-W(qAsW-+UqWpNQWV6om*q;M(L zi+tl7zrV9O2m`jQ3?8dCi~xs=GD_sTP1|T}21tHrh=vMH2QZVTq!8wT^CG(u_=wt( zpdP~5orVm|l+l>X4d)3-Z7@V+;=%6e^2r|M$wy2(nm>(T>cYJ+D_6)UxW*JBqH!Q^ zaRM()(gc!9KtL?j}LoD}Tjaz;3tGyHeAaGSZKIxCvt#SSG_^gvaC`zE<6%EH!zw>Tk&TcUn+*;KNpaAwNO8C3K`*lA_1AW3SI5lo!GR>mx;!M~yMYUW7^C3?r&40V+b^Fg;`l5JE6jWM($7!K7V>#n<3XKa~yhD?hGBY_RkkKkYEM z^@a!)X#l}7Qwo6xPbL6!S07^5Hq50wee(46H{QZo zu$^b{`HcDsc59xkJ}ciNfI)0*d5ieNDkY$-z&@Krb#*tm|N4UmZ@#HBTGn)3F4l~f zKacz$0&(bL}2b)7^;rkq=+DiBSKN+t)Oq7Y+r zwPW%wn|K&r7@2V(7ijX%Z*1*l`kKrSj$m2@Bo1aevGp-NszxDXNf2b7+5q%eB!_c zntJ#}K%!FIs3gqWAz!1b`cjICIOnXYLpBCfwNf%t7%kL>r*NSCjt@V(Y#xDIS`Z=; zhCm@EL}13rRF_;N*QFar;IhjsJFpN(ozSl9eB&@pUWur#YyTE5Ju+U6@R=Bq*{L8^ zrTNB;91}6XR}{#Wt0f|DHrunav#x7#T+_`aYB&uM2YC-wLVWk#ce}1dxHNm4-yWL% zxw0?0N+-BziqX$puXOvhH)E=s$b^C*2nr%(%7j%)-}=@!4X}vX?9UaB!E^;L8Yu(} z=}TY!9Yj)7H&rr2G`xTRjYYG#68;fzTGGtxy!qFeAe-|7V>*;BN;SgO%tX;Zi1Yo<&Qu5L`3G9%0nKFjc{O!0RmeJP*s?s zRY_7TDS~@L#(*&r8l)Igh*e$3lsHCYcA3QK-c5Jb*Hke^$4wHE|F4IJ&%ERF(aM!A z?ofzN7i(NykXcbsK~l{kiX5IlJG!|l20pdbDUgu!B^+01)}%(upxX1~(I+2$`0(v7 zd{F^hVHajLva{?)v#M)cP57V@$+azE4BBc66u^Mk?!EPo_HV!0K6-DjPTGQi6;q!G zf9?1F@b10Wh$viQGU#M_ex>@H$z`FL-F@SAZg$f8v=(R;C<_7JyLT@H&N+MSHMIpe z#E(IGd-Rzfh4^3Dr{i48Zy*QTq*Nf_)uAG}PSa%!@2KcG5Z25bdIv}fplU+_2|yh0 z-+z4&7a!^I!z_KM%9IJ~6o@b|jXGswrXdiK+xH*d&(eU`?^Y!Uzs+M3fTo=GWs~-#* zOYT#Q+nfWlMGb-e_V51o!CP-}U>gwJt^`UKgEn7%&a3MSDw!e31r61a7pj0pz=Rz& z`>%cBtDk)GZVDXH#3TgHMO5Lt-~BGf$`__CaXG~>hdla&J})hX;3`5)&F<>%o1fpD zJgc?D5-5O4xpVh!452UOYEq`bgk{{OUaXe@uZ(YCp8ZQHXh6wcZp1FgjL&2pQ3c z!e+Apu-%=74?&s3h(^#W5Yj74zIhoof4rvzopbi{Jymg6zWelB7gWA!gkn}oLBz#k z(KJnraj+X6(E-Edb>akKAOHkCK0Yy#okh~VJbLu${Wrfrc)7&^w?+XqJU%%KF`$bQ z_NA5WfB!%H&)4p}zRe}33QR$Do4fOEJ81Y41|&ubA@~!ysN#)--M8O-aOd{Ty;W0J zfd>_#aq7eY2RjDZJvdARH7lyS+DTFPC*S=K>gCem0x)~5ry^;p+$W8V34A1~K%g9M z-?>lBDRJBixCbsFg+KtRa`7w9O&#atS31A>z{rp6G;bOpMQS!nd4Q-g^y5PxHi9#V zsFVWa;6M}6p-&pNUqoSeRDcly9FogyF4$=*F$89G4EAD?VhqGCs0$;B6h0H^J};MF zU2FLzRn-uja}wrDRBxVM9Vt#&mU#sAdGM=cdhkp=HBbWJOm-2m*uwDLfV)^Tv0c-=}A3emduP0RLCdMV~o!`OJmwbfM-o>l`0AKfVDG zMu_{E>K&d*!~!W$O0ixvr`z?1@4qi1DORcs2qM|()k^&E!$&{<=`UlwljCcrf3?5O^Xql`iK#^VCW>bt-k!^=%*jO1f+zvs`y}8-M(?{ z3va#ox%>B*yLDjTf=#6To%(Q1PtRLW_{ZP~HkTbwh^8-F}q&1=3!L2N(xep>;8p9a^sVXr^ zDI+T#<}t~CErf_;0l`EB8Uh7ELX9yXn3OIxRSbjuIai4g#f+FHf)tL5C4&|-qKH0y z`n0O5-Q8X9rF>uIqfr>Lhq?6`0Gyw%Q>sE>*J8PNrW_$fql?H?7uc}2$T_!dSJ&0q zStlZ!&F1j%uxXlU>@dYtGYjKK9K;$K4G9#807zt&!ruNKQrcU>uRnR#Znt$65P5uf z z0GP_4LTST9T~!Q}DG_qXLQFQecnhFlz?2qLx)gcY7}aTWwsBu;P;)H3pQIrIF%Bg3 z+5F%C_0lfD?R&|w+Ug2O;|5I;kcq_1L_r66BR3%l4*MUP#c8?+%!H=yE6#+0I0he^ z#|xP0_!)Q%^QzO*Yi!pL9nX5>#*I0q48T%Ks)LY|syur1D5bQ&zn`kqEULiK`&tAx zPyj2r1TRrc4V0L1+^5M%4l5=0A9bcX!qx)lrjNwFIv}GfHA2FwH?5QsV|2B6_gPX^ARrDGv^W4Wbx~s^BZMjt zB33CeBhy9loj%DGWXyD7Cu*{P0Dt$>(^F=y>v|l`f{2vT`pay$8zQ`Q>z3D@XQyX% zT?Yo$L1CE?T(s9iL$}?6J5Ljzo}Ds><#Oo<)c;|s-Nyn9uOMq33KPZyZkI=GV3Lns zJYSz3Bd14CpWeK2c=P(6bZ3#HX))mUKl=24`QQHC&pvu)904%a3j?vXFU5NZGcaO< z(zWZ;W!0by1sSM_0jS7O@i;A^6dNgn?r`NiOq>mM5s{D$RFy!>Zm``doK|6{!75P? 
z;rl=S`Ct6xqd?rmu(wk`c&XsTGpKp@2ApfLa;P*EieaTJzOLn*%P zue-Gcv`E%U{?)s`I6XVtU9QH(om@excU?C@hql{oO37DibDG;UA<8+A-NPUQ7DDLz zOyDh-f@utlgoZ|d9MCz-fnrSV;ekvPiJhq6;$8)SiTnWhEg$PVa5l5EizWc;6tPlD zM09X)FkxiXHUg| z4jJDSvW*#?Ct6wzF~(@3pL^~8_Qli9`5IFF z^WQxD7ytUtpPXzdrh$G{K#^2&yKMuZihu#4MJw8GOS_@Uhz^{tjH9N6Wh zR7(*fGGEFMVO|giJTeWe)(gs12<#tV!r`-1xGpbX=VPPga_QgQkD#O7T}(G2Rj&JQ#Vxz#LfGkeE5?e{!mxf{`LR$H;>PHqJUD|*vzyLVKFPY_!yJOrJaGy zK*R$VsKB^_m@+tr;Foop)*^z4N#9>00)ho3MKnZ4Tdh())06YI>l`FYthu3NKIa19 z5Q>^Iy*%%Zx6gj{>9e9r^rJci23RfXo7eWezLjx`P?sgF)a;>R#an9u)IWn2h(k(Wgz z##q;hhyoI#r4)!U1Y*KL4&U_n=xBFmr=W=Gd6!d6ecxAg?X>!F{YVp9egxhOL!YXu zL?q|VHz1xw%z)M$&*7V(zY!#GE>ey5k?NT?%YCv_4>T3 zlOQoMAc(8>l|o2$T~GSo6Xs@EX*v{U3`WxoD_j{LB#X3&8H`4Kl$L(M4Z4r|K{C4{o_A)qQiw6c*tV^zR#oMEDiPIUNMJII2@yd} z6ckjeSg8~=#A2*ytPP$&f8KT7-rioSD;MEe?yS^I0Yo%b0f-eKR#mEp z(;rg;bb&7*qWx-VW(-Pac0F&yOiqOpXV&B1PJkwIgDVO-PZL>$*`hsv;#!jc=W z28tmNnuaiTS|A)ih1#xF)s$jN(NCn42VEY=zsqvDRMoEQJo?V3j6R`M1 z4(KH=A1DBn+~wTWi&ZbO-Q;?A|JJp;YRHSt2QTx{*|}nG;zo}|h^ek(&Dfv!?Is3d z4io|)k^v*AmeRJOrK%dFV5Wdf9Mud;k6?tzJUIWUs0~)%f{KG5Fref12^7tgQf2Gh zb!+X{9X>faZHoe{l)ja9als765CT(lh?W|71w6uOXmr4FZ6nM8FE-^bfA-#A{rKHb z+Qs>!Rko6`D2oL`gR(t7K3X2ur`s;XxZUKF51#zu!^a1!AAS8RU%Yqg#+{po5nP-d zbJw{uYw&x9;u?A?rR)0`7j>!v1EQ>&S~Ulzl+xppJp7>~T zi90$%E&IL)01x5QWX3b8|CV>hR}it&>7UkJKFjo^d}X9RuTv*3zpBD03a+YX7pu-! zB^sv{)RjX&Uw#e|^p#!00Qp=kR@{We&h9o_Y*rsXdiw17i{1U}-~Febr;vX0{s+JO z^?S!BXKm>Lad+AL*|)#*_UregJ3Bghc7DF;az><+2GlrB! z5IKg?ht~{D2+4&;LWl-XR1qx)*AH+sK#y^;=YD$ z22(_eF{Qde4z9kYszr5B79dYd3V@p@e`=)CN2%N z^zCUernqjq=imDQmhR5s-WT3{c;P2k>Vn%0MK>acDt>rYH(2x1HQ-0+`jLts`3+A*Y%a@k`L#mZJEnE z`s;Y-KIxH6bBUSrhRv08uCVj2C_$KA13@%ESRKYIN1dDEP{^NnwP^-Ev++B@&~K48;zo6W|g+&-H$bx{?Y1^M|%WvNM;N)z3-s$6KFM5TOP21(-f+7gOgHO7+DX`-r0=oCJ1Eo=m zxC#X@?MCR^_U!mff!A%{i&4%@VYS$4i&ls#C3l_lEtH&F0E)y#$Lr!;GXm1VTv|}O zu02a3)~T*m%_=VO*nk9qI53$qLVtFI*KftN%4z{f^Jcp_>Ne}rw|%>LSf#)J@lTd@ z`u%Ty{g3|O_ix|6Q6&sSRSH!Ifg42?DGX40V=DndDZb4o$(~vx z#MUucHlbVR0v2&#R|)b{-JbSrtgJzfJcF0Wx&a#dYltPU~0 z`NjhPDh8!!*O$|iv$NB6n|mi2#*|V_KrDdk^YfFl^~ve!(aGtv7su;u+v^A_BQvN7 z*LN4TU)4t`0E(y~Ldk93o`&T^@&Dqh*u5GI-?e6apACyvZE}ilBfAsTTKK$hN z&FlAW?%%$7{odW%_ir6u+g(U0>&=D%O4n9ZLeeT$1`sGvB7?yJyYD+9@}=Tpv25Gz zcC)P)HK$-;UDvB=;E-ZsLJ{3;H&v=cj2V4K>vz|4y$3n3P5n&xJ8lQC001BWNklFL??=TE@2>pDP;Av}2Vb0H=X8S$VauNM(PLB*DNd_~iKX^yql~@Z)E>y!_JrdvW^_fbQ(95RYRF zq$X__igF+^5l{}Pu2vxgH7G^ZEKAO%xM72tGR43QO5SX@sagaCAVZ|wm#)oBSk=|u-hSUpA}Nv)QF1ZWiqdx5 zi)i3D$LCK28V_q#Rk^svygQz@<5LyOOjlfyCb%FX>I{?RzmofYKpTazH#hL|V;mO> zvoC2O=Nw`Lye#cBb8lm!MO81Cjc?H$lkXuv%<*Qb3J7Gx6zjUl?K+EX`uzO(q|ZXB zLF7OdW8A-XonoaW!zkNZz?c{jo2Jo>A=VIT;Kl@13lahj%I;39R~4TmPm-uO8eR;f zNdyL6MA%C;>@VQ%O*K#pXWMP)o~?iWJ=wINsv?1$7-3~|e8~OI?%u+`n46hpm?&ARQ^n-)yK zj8qu0^cy1UL==hZ9VDQ*;FL~_eg7Aq{{6cjAJlmBVDF9lcW&RfcI(FBYI$~WZEv-x z7Ad5v*{f?6P%X^Z^<5xTv#xFX+{M&X)ne)@>bh>*)?JqnDUMZX&e;dCUJ^`)vhTB3 zv6I=C*ZnaDCjd__hE9L;mfuTTGwb_)sv;)D_e53!FlL|pTGCv`Oy((gl@ZBrqsi0 zw+^pgztMMn<*3ntr!A(&9CD{xR55d(11mX7!N4Q`z>7c~_?pL6<1ijx zH}x`0R&K4hoSr;?{@yAf7_8T41lH7{UH4E<{^Rd{1Y<5sGM54P)Nc+Wyz%&a_L)JA60vt`z%O1 z%OxW&s;b>?HkIf>;|@fgCdmh?G^IY~Xpf{^gTjF-ToS5?Lm`fBX6y_ix`g z+&|dc*;y={HdJz(0>>0N5UWy914FPFV%u(&gUd_Dm^iSni%JnQ@G1Ib-Ovg8ZtghW z_r1?k&(898+wSe{j;fJaWT*&Z10+ zSQ#{s;u{K{mY>AzM`AQP(HN2z2^Stb!-p^}l5_i4fBjb|+mqvyBK_+(`CxDLrO&^4 z$h)2X0w72{16S6Jb)s~2r5047RuHr@lf`W>7bkI>r0z;W)s!MB@W%= z1_5&yCgK6me{pL^|(zxkE_<9k2& z>mU8{!{?{5TDlh)4_Z{{VFoQ*DP8VNvU7a>qxBsr3D*#++S)FJ5DmLhM6m5TD3W## ztHo-ycWr02TA!bFIrqW(2%Ks&RbnxW9J7>C3WUf^v8p&kA`%r<>xmJy&-(I%r~m#Z zzo;WuF;uAv$a|~BwS(o2>-*PMiwAGqyMON{5qIrYa_`qIgiuu#amcy&RMpkQnZtxb 
zz0!^h=m;|hE_rm!rC%%?L=-IuD574BdU+6I%sKP;XBr)NkeIvx9?!sqjU@38U?w1QCNsnGd2=mB@CM^*{dhAMWkMygAc+j-W)cv#5ze z9dWU{+TYs+GeA}?ri6q635bhQE(#_@DBU@Rni5IQMA7q$qRp5I^3ys2haf&h8Ec zI5|E#+N?R0x~`wKIe+q~>o(V_^o_55>7RW2_aEH7z6h97Dk6)juB)n)%*-5;nU+#Y z&IT3(00w94F_W4WDNKBRz9#15<8|Nvzh#~4ujSWy*Pq*3zsuhHT;|M~`#3YcWMUJy ziIX%-y-v^Z{@)OPIHu|4CN z@p$IUnK@@)ewTH59$u_{womChk8O>PMr+n@J>Td1{d~jj?(T=&8=;dRA(@%ybtOZo zxFKQ@w;9_vJo0j0Q%W0OT8ZuF6Zg+w*4s}9JIgUEugr2eu2Tl8AF8Tc;JOL`a@r!k z?u9UQOy|z-c^;$nnVfTsgd|nY;TT|e}FPefZ=TU*;RM4V11n%DcAlv1Hm6H!i@a~Ouf`&!ma1=;B0 zMT*)u3cUB_Da_1CGZD>So z<`YTE$n4^9l;fC9a{@$%>#{0rYA*~a1zz*Ett`qh2Lxq8GR$hK;9Yf!1yeFO>mW|~ z3zXF8q=~ce+d~Q#$EL1W$ShhIQa&*dAR<_Qd_cmgvWl9LDUz`6PQ|wl>&b(ILo3>; z211Gk2)5<%xl6NWFMal_UwGkoHQc#(@OZwwdGpq{UjE_Z-hnETC+~{8Cv-h(BCzFr zd1yH+^dZKSQqEwA#t6Uwj}JcB^)s2wNGe}dBAjw^6CKRH@#0<%O-TP7Vt~BOcA*f z_UyfalX|+fb(V~$?ezA&yM3m{v(!#e$kf)%NXtoe=m;`_CADl2*8&hcU4B zPhGnF+_N{XT)OhBzxr>YCIw0gfWqEUD&Hwfc?hxV=8yKKQ)y}i%Bq$3YPRURIF6I1 za!CDr5OE2xj~xOvv$ky}nlg(Q@B41`_|YL*uImcfWrG|>Gc(XI4qMwheLnz#aP!*R zclzNE{?6CF{QQk4lcpS#p^<^CqttX->n`ghGb7~F(9n$!u*^C!#m(o{iDPDFSN=^o ze`R>zyix2=|EMPhj9d=J>2Li8BQOj@&RPEA z0k`s9Lh-1L#fq#3Rq@lU-Tn99yK``G7;<30mE#Cu%%yG@k%fgx(NY{aCqp$OWJltG z9Ret7fherWX`3z|WHU3(#AK>yS2`2*cZT<5IC&5RkvEsofjqShXm{u*~oF_Cev943kqbI#=h&8n$?RS@GwTj?CZui z%{UGKxO?_25~XNrHjFW4eEr?KZ`{21)H5I5_T)GN5s52n!EqeRw>afoB%U%zu}Bd{ zH$&2THzQI;)V}ZDzjyEI<*U=lv`n|!B+tZr=|A)=r zq+&K+@6Dksw`nmbmNV*{a~m(h$xTZL$qMX(7}!~6VmFL=vFs;XJCn)u@bLb8Ij^j8 z=(%Ps=-uyhP+N;!m(;y8ATusTlTk{}R+ z_fj>?EZ#dJcFuWU^$VAQq?y(p!o2fsoKAUncQ>by#(q}&J9pndd->XfM@O%{{^qy8 z`@^^HK1jwTo5L6YX&L1Qw+>&w_09kE``@~8?ZOv7_rm$J+kfTBr_J=-?yPlyI!+ol znKWaxVGLMkMWtVlz%-g70>pmRFOMN+sT*`WESH+17iN)cz^-a0+um0sQi?_`r7-rG zQdlh!)ceYNKgQA3wHgo$I;owAld!BG9np3&beSEPoLD3VhI?ZctdS1OO>1X!=*mtTp$Rl znib9B*u^y3&i>Bs*`uS!NUp9uu{ajOQhjcUg;K)+Ucq@WB+8&_v9x2B2MG})fs?{C zM?*+sH+IX_(Zg}|*wTm=&9ZakD<_^r#7iyCmBLODC$8?sn*gx8JwxP*oyL)zd$>sF zu1*PrBI&5@cx%9T?~IgaDt z>vp;gNf8XFXix;qnjotsO~HT-7>U5f7@E4)sBs9DuU*?Z?+a|B58=%_hd;XgzrXq7 z%Rl$ipZ%2p($L?oNE zP|jJT9MDqTc!HhWAQ%+JaUh};l|{T4g_u>%SV&4@838os^80I6K*IG*45VfX80fTGo{39Kup#jA4jj3}F$*m8Jli0CX~$2-hM)!c|o%K|@YzT@(3IvEGF0EGu zu&R?1fJa$XA0+>#|#VyO(14vYDt_a}UbX=;Q9 z0_+LYF=2{hwO8-7VAI4;HVy`qsy;UA%hXi~$nxdJ%$FRW7EKQ!=A4hMooltg31w)R#e( z%0@nA@2fJ@m*-~NRvQQ1Uv|fdI3nJFgEpl(P3sXc#NM(2j?PV30FtN$GNV#SV`{1; z?OE_J3J?KON-?FXszd}Vmr76pwwyx3m~|L>$r?|W^C7u{2tvS2O@J{Z+iE5`Y8OG; zovPj;sYq%=XNB*DP&!rB)JTmfCL<*893zBr6}yC*AjcGgn&voyM$`N1&L&&V zZ#B~`R|N|@hO=zNX>$VZrFAVsjDgG=#*|V6Yu_{y<(M_*oCY)kQYwF2z^FI`nMqGN znb1&yfG6C#X0m5Uicw6E2;BtP>3VOeCq^g(%|Zr%L(mkWIA9q2{z1$MzzB5cS4T(3 zXLhG%T8x{%a_lF}g3RI^5<6GWYTmg@oG&z211oPpWLh0}heykc7cNlK46FH&RO)vB z%oe+?M|l$T>vsH5+or3jNS1UKlAB}=g)g_&wu>cXP&yAE&T59{g40r z-~Ql(M_sjbw&KPU8J&Pn%#7HO&~gZarBE8%V@L|*)v8}CG{$UMd|OQ$ao*RB5hF8U)|@MGnzJKD zfZAcsaQlOU-}_>)Cfp=820;z=+6%u-3P8iiivbVGPJbNFwfpX0%p$oO2th;|cCm zH~(D5teoR!GvvKbDV>01w2-ZsSvB|lNGP*ud;a2;oxP8suaPq5Y(QFK+-up0z(6(S zoXymUFs_rXC7%?=L9>~f_inK`3d5qs?3hG#O*c5fqACL68X`!5LK>5*<&=iO)wKcJ znz#Zd^b6ITL+mUJPBSPv_BoD2Hxz}G2*kTe($0)(Zz7IGob%3ml$KbK&^d>Mo_%?R zlmXAm0$$C~oEIWmu2%2ge|YWslT7j8-ow>uSgyKiv~f&z+djR!%YE0@^_d+ySlvfZ zGq#+B?TK?czwr0}#!r6g1|fPDCuFujpuhf=FFkkTxo`ZJ-~awwclu=y0KRHs3lmj3Fuw{*XHb@6KH$5P)ghs8%Q15H;wncG~1S>chS;Ml_hW3Relrt zDPU+x_K1K03HJKb{M&Qx$OXqey8NE6s!ojF#C}^nx^%26I5gDO_OD{oZ>J~F;F{HL_i_Dxg z8-Qm7_Wip@$A@#ev7>L^?;iBCt%g7Iq&Ndo*GTp4SAOu) zOE1cYIM#KQ;+n|1?sCi#V(#Y0tK&leM6P6#A&vcN75X{qK#+18g}`#mx|ZVwv922@ zytVB>SXiK>bId|gmIl|$gJh)B*#wcZDzYy(vac&J6DQtR%#u|NjF_vs_Ep_ZW<$s< ze%4N^$!xlPCWeqx8e@L<_PwTQjl%td`x*y`xVJt1yMOB!KlhnWv=tH+K&!+UHJ3_B 
z(wq?W+?k!Td*?s-W1oEI&fVYm&ENTxmtSY8R5cgq7YkR7^>y3SQ*boOpGENqN*y8WHnb+2->X{i0rDSABNC(sFBPJ$&j3LLx|lF6~QqPVoDJK z6g)d;tO|sjd{b|2Z+q{DVd(pQ=+eQfcaU-R+B-k@Q!hOI+$Xu3W(}r^2{CJkmd8=N z_f<_qY}r&}9Aj3^2|*34s{972TfCbHW@<$XjWLz2(nen0OiqP8VW!N^SFN}@p+*8V zTGI6PdwO>@KAih&JNf;4Y5$D=>D&1hjK{%V{ZaU@3 zVf(QsXWxD0{U3k&@&t_|VB5~SE^FGDMc_25RIL=b8iGO|Jpr{1F@wmsI?g$#FmN8p z#+;U#B2<+ZC$1qUu4slvPQ0spRE;q>@5`ohYimNnA&gR2uCAT03<+6?!Ak2a03Z#+ zu*Nqds_JAisj6xWp{}dxbfTtxh|VQICh>U;UDpr8aCr2%nQk2&FJtU?C-k|`KL6F9 z|I!mzE-?nn5K?B6k_tSD=Bz`pqQH=yqwAN>Tt2tsb=ce5S@mO|lWJx`a_yNj+h6*r z&p!3h8@KO2{Il=9y6Vz6_VdMjHH^Nh%rjexieSP>PS}7A1pt{DEV;%rH!&L%fW|m1 zOmVWiU(aSi%{h;XCH6^zF((9|c1v>_hR(IEY9_+@q7$Ep$jDJum6}4T>u-PW&AQ$C z;%7fOX`G|X2%flVn!2hG%rrwzNmC*H6hs*miIF(x9OJMtJ)Q7T%sE$8b$Oc>r>O`# zx1Qxg7?~LWonuuqMKMWk@T!cxOS2mnx2Cvw?s^r~-dyO>p*y!v{nD%Tvk6tLjxp`D z@R3V&X6BcNeknj z|M|`TQ@J{uJzDmUj#r0|ms+qC&Jp6?cH4KWTlXH+btAsgtWgbxomAerzF+xDs;VKP zz3ti0eeq{nftOx<@ueTU!*!hjLd*mx&c6?QhK{ZS8Uj%f+0TYVSoHIcJ8tHjb6f zos>f=HW3kFCIvbT-32h`q)4R0Fdz~UJI4TI; zhM98##=AFP55r3SqPew@7%`5Wrhe>~DRhF#fif{MA!M;pDo8ox!gi`>Na%_lgfxbc zghZTXGcf1Zt||m1!aKL$8;AMs&R*TjP~pPgeBYWV=hyk%5Mkn0$ zd{*pMrs1i^9~VkRTXgr+%$`EfpccyR0J@SS(x)im_;j-(=%vfDN_0!-c5 ztyVdQtXgWHn4N2yq3?4HMctfCTSDvmZiu7!dO&w)o@ABYdhkP3b1XvC)Uq?F&z;@A za`F6=*Dhbbayyl?z?>2)&GfME$~X?a96gN$ZggoTM2 z6@aMtKboq}fG3CxP_+<4Rr&I8bmGQgc<<)B&wTVLW=2BiU17L|m=Q(90qAPgHErv> zGsF!L77PfWXiRH(EE1T8Fuw8D%{Sk8TF^V5U zzIf^CFaLvIzIf>pGt;^&8bXX^5rIfKXJIygvi>22oG~KQ=Kuf%EJ;K`RC0M|th&|W z__*?J3_)0CTid>BfZW^f-uc#_y!7JBuiksS%x+4~5n*xlFeoD+6&Kn1$rN);s?K== zm{wIgYj$?`?jJnr`!4j0zVFPKoiMX7S?4562)%dy%F{~YlNYmMj znO5CWv$9B4H=3b1Ok)_CKnT%*g%J&M5`>EN*=v{n_TT*4m2*2bt`IZ>o+igG7mMX` zHJi?ycL?ZwDK$5M z1=|g#=DY*Mq`Ab7Pk3kQm`i-nu@GX4X?}EcaQ|L18|IHz$A?|QJGbv`?`~hZeC7Q9 z`L?pEqA}|Tm zky1=pKRjA|=cVud;UE3k%?HPdOs-;apjlJQIgR5OWAxtVlmM`5n+yAUFMQ(1L2c2m zZr#3j@8B^4=GcWWR8plBhHjN(NO2@2TxW9Doqohp)w9Xg4vOzq%a}qtnfj^{R}KAY z=sRLAdPo8O7jOcTk!CO;;iTC#pH#lZZKwa@pZvWW*DgRBlIGH{<-IRIHW4}T%z{Wv zqQ!WgbBrOS?3^#?uW=l6jyXpnhrlt7DF@0h-PzmP-6Nzh46AO*OyWEcd0%PHap?Pg z8N!%j?)$N7n(1_U`~ACHTeJQB{jJ%yunfaWb5Kpc`5V9S{+&CY{?rTj^KZz7%_?jx{oHH=zeT<2nr-BPJ14Cx1>xMVq_{qhM{%uG{Ov#dHlI;yMMi7*l6@pyiOVPV5uN!n(z8kWA9?eAS^ zT8Uv8LKfeG)a;rJ1jr|5GqLqMfq)5t03qjGC>_N-yCETNQs*aTSvsjkj@b&`o$ekk z|JAR3lH4$k27KlAzBy|cG&zWwgIx0EOuDsbO-s6hzh zFr+YOig6r~*;Tcf#u&hI>6aJpr(1hfGYMH&tEC9nZQHh!t{=L^B8`KYv3PdEBH~?H ziz2ATk=Y@$LN--MiAhq9VRgLU(m(%|UwZz=lMGspcQJ8_IHqETz1nOEAPaY znW=o`Rn;t&4R?t;SyWX6vgK+q_FX$^d{v8cO*OV literal 0 HcmV?d00001 diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json new file mode 100644 index 00000000..037e0b16 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json @@ -0,0 +1,25 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 2, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 54901, + "logprob": -0.72753906, + "special": false, + "text": "beach" + }, + { + "id": 1, + "logprob": -0.011009216, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": "beach" +} diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py new file mode 100644 index 00000000..d4e83c9f --- /dev/null +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -0,0 +1,39 @@ +import pytest +import requests +import io +import base64 + + +@pytest.fixture(scope="module") 
+def flash_pali_gemma_handle(launcher): + with launcher( + "google/paligemma-3b-pt-224", + num_shard=1, + revision="float16", + max_input_length=4000, + max_total_tokens=4096, + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_pali_gemma(flash_pali_gemma_handle): + await flash_pali_gemma_handle.health(300) + return flash_pali_gemma_handle.client + + +def get_cow_beach(): + with open("integration-tests/images/cow_beach.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): + cow = get_cow_beach() + inputs = f"![]({cow})Where is the cow standing?\n" + response = await flash_pali_gemma.generate(inputs, max_new_tokens=20) + + assert response.generated_text == "beach" + assert response == response_snapshot diff --git a/router/src/config.rs b/router/src/config.rs index 989f0e31..d27b1136 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -100,7 +100,6 @@ impl LlavaNext { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] pub struct ClipVisionModel { image_size: usize, @@ -108,7 +107,6 @@ pub struct ClipVisionModel { } #[derive(Clone, Debug, Serialize, Deserialize)] -#[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] pub struct Idefics2 {} @@ -118,6 +116,24 @@ impl Idefics2 { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct PaliTextConfig { + num_image_tokens: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Paligemma { + text_config: PaliTextConfig, +} + +impl Paligemma { + pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize { + self.text_config.num_image_tokens + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] @@ -140,6 +156,7 @@ pub enum Config { Phi3, Llama, Baichuan, + Paligemma(Paligemma), Gemma, Cohere, Drbx, diff --git a/router/src/validation.rs b/router/src/validation.rs index db832042..ee48f705 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -565,6 +565,30 @@ fn prepare_input( inputs = modified_inputs; tokenizer_query } + Some(Config::Paligemma(config)) => { + let mut modified_inputs = String::with_capacity(inputs.len()); + let mut tokenizer_query = String::with_capacity(inputs.len()); + let mut start = 0; + for chunk in RE.find_iter(&inputs) { + let chunk_start = chunk.start(); + let chunk_end = chunk.end(); + if chunk_start != start { + modified_inputs.push_str(&inputs[start..chunk_start]); + tokenizer_query.push_str(&inputs[start..chunk_start]); + } + let (image_uri, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?; + let slots = config.get_number_of_features(height, width); + tokenizer_query.push_str(&"".repeat(slots)); + modified_inputs.push_str(&image_uri); + start = chunk_end; + } + if start != inputs.len() - 1 { + modified_inputs.push_str(&inputs[start..]); + tokenizer_query.push_str(&inputs[start..]); + } + inputs = modified_inputs; + tokenizer_query + } Some(Config::Idefics2(config)) => { let mut modified_inputs = String::with_capacity(inputs.len()); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 
7f0efded..9035f6bc 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -13,7 +13,7 @@ grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.0 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -40,7 +40,7 @@ sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" +transformers @ git+https://github.com/huggingface/transformers.git@b8aee2e918d7ba2d5e9e80162ae26b4806873307 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index 7f0efded..9035f6bc 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -13,7 +13,7 @@ grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.0 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -40,7 +40,7 @@ sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.40.2 ; python_version >= "3.9" and python_version < "3.13" +transformers @ git+https://github.com/huggingface/transformers.git@b8aee2e918d7ba2d5e9e80162ae26b4806873307 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py index d137a500..8de6ead0 100644 --- a/server/text_generation_server/layers/linear.py +++ b/server/text_generation_server/layers/linear.py @@ -10,9 +10,9 @@ class FastLinear(torch.nn.Module): bias, ) -> None: super().__init__() - self.weight = torch.nn.Parameter(weight) + self.weight = 
torch.nn.Parameter(weight, requires_grad=False) if bias is not None: - self.bias = torch.nn.Parameter(bias) + self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.bias = None diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index 43b90bdd..ac6fd0e6 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -99,8 +99,13 @@ class GemmaConfig(PretrainedConfig): class GemmaFastRMSNorm(FastRMSNorm): @classmethod def load(cls, prefix, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 weight = weights.get_tensor(f"{prefix}.weight") + 1 - return cls(weight, eps) + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + return new # perform the multiplication in full precision and downcast after def forward(self, hidden_states, residual=None): @@ -111,7 +116,7 @@ class GemmaFastRMSNorm(FastRMSNorm): variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) hidden_states = hidden_states * self.weight - return hidden_states.to(self.weight.dtype), residual + return hidden_states.to(self.dtype), residual def load_attention(config, prefix, weights): @@ -153,15 +158,11 @@ def _load_gqa(config, prefix: str, weights): class FlashGemmaAttention(torch.nn.Module): - def __init__( - self, - prefix: str, - config, - weights, - ): + def __init__(self, prefix: str, config, weights, causal: bool): super().__init__() self.num_heads = config.num_attention_heads self.head_size = config.head_dim + self.causal = causal self.rotary_emb = PositionRotaryEmbedding.static( config=config, @@ -238,6 +239,7 @@ class FlashGemmaAttention(torch.nn.Module): cu_seqlen_prefill, max_s, self.softmax_scale, + causal=self.causal, ) # Decode else: @@ -295,11 +297,10 @@ class GemmaMLP(nn.Module): class FlashGemmaLayer(nn.Module): - def __init__(self, layer_id, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() - prefix = f"model.layers.{layer_id}" self.self_attn = FlashGemmaAttention( - prefix=f"{prefix}.self_attn", config=config, weights=weights + prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal ) self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) @@ -351,30 +352,25 @@ class FlashGemmaLayer(nn.Module): class FlashGemmaModel(torch.nn.Module): - def __init__(self, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() process_group = weights.process_group self.tp_rank = process_group.rank() self.tp_world_size = process_group.size() - embed_norm = config.hidden_size**0.5 - self.embed_tokens = TensorParallelEmbedding( - prefix="model.embed_tokens", weights=weights - ) - self.embed_tokens.weight *= embed_norm - self.layers = nn.ModuleList( [ FlashGemmaLayer( - layer_id, - config, - weights, + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + causal=causal, ) for layer_id in range(config.num_hidden_layers) ] ) self.norm = GemmaFastRMSNorm.load( - prefix="model.norm", weights=weights, eps=config.rms_norm_eps + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps ) self.gradient_checkpointing = False @@ -385,7 +381,7 @@ class FlashGemmaModel(torch.nn.Module): def forward( self, - input_ids: torch.Tensor, + 
inputs_embeds: torch.Tensor, position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], @@ -394,7 +390,7 @@ class FlashGemmaModel(torch.nn.Module): input_lengths: torch.Tensor, max_s: int, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer @@ -423,13 +419,30 @@ class FlashGemmaModel(torch.nn.Module): class FlashGemmaForCausalLM(torch.nn.Module): - def __init__(self, config, weights): + def __init__(self, prefix, config, weights, causal: bool): super().__init__() - self.model = FlashGemmaModel(config, weights) + embed_norm = config.hidden_size**0.5 + if prefix is None: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemmaModel( + prefix=prefix, config=config, weights=weights, causal=causal + ) self.lm_head = SpeculativeHead.load( - config, - prefix="model.embed_tokens" if config.tie_word_embeddings else "lm_head", + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, weights=weights, ) @@ -445,8 +458,9 @@ class FlashGemmaForCausalLM(torch.nn.Module): max_s: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) hidden_states = self.model( - input_ids, + input_embeds, position_ids, cu_seqlen_prefill, kv_cache, diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py new file mode 100644 index 00000000..91c709e4 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
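+# NOTE: rough data flow, inferred from the code below rather than from any
+# upstream documentation: pixel_values -> SigLIP vision tower -> linear
+# multi_modal_projector -> projected image features are written into the text
+# token embeddings at the image placeholder positions
+# (config.image_token_index) -> FlashGemma text model -> lm_head logits.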
+ +import torch +import torch.distributed +from torch import nn +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple + +from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) + + +class PaliGemmaForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = config.quantize + self.vision_tower = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + + self.multi_modal_projector = TensorParallelColumnLinear.load( + config, + prefix="multi_modal_projector.linear", + weights=weights, + bias=True, + ) + + self.vocab_size = config.vocab_size + self.config = config + + text_config = config.text_config + text_config.speculator = config.speculator + text_config.quantize = config.quantize + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + input_lengths: torch.Tensor, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused here + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.text_model.embed_tokens(input_ids) + # TODO This is odd but apparently pali gemma position ids start at 1. 
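+        # Assumed effect of the shift below: during prefill the positions of a
+        # sequence of length n become 1..n instead of 0..n-1 (and max_s grows
+        # by one), which appears to be what the upstream PaliGemma
+        # implementation expects.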
+ if cu_seqlen_prefill is not None: + max_s += 1 + position_ids += 1 + + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype) + image_outputs = self.vision_tower(pixel_values) + image_features = self.multi_modal_projector(image_outputs.last_hidden_state) + + # mask where image or padding tokens + mask = input_ids == self.config.image_token_index + + # insert image features into input embeddings + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + input_lengths=input_lengths, + max_s=max_s, + ) + + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/siglip.py b/server/text_generation_server/models/custom_modeling/siglip.py new file mode 100644 index 00000000..f17d6562 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/siglip.py @@ -0,0 +1,565 @@ +from typing import Optional, Tuple, Union + +import math +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import ( + _create_4d_causal_attention_mask, + _prepare_4d_attention_mask, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, +) +from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig + +from text_generation_server.layers.tensor_parallel import ( + TensorParallelEmbedding, + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + self.register_buffer( + "position_ids", + torch.arange(self.num_positions, device=weights.device).expand((1, -1)), + persistent=False, + ) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + patch_embeds = self.patch_embedding( + pixel_values + ) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class SiglipTextEmbeddings(nn.Module): + def __init__(self, config: SiglipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding( + config.max_position_embeddings, embed_dim + ) + + # 
position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", + torch.arange(config.max_position_embeddings).expand((1, -1)), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = ( + input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.head_size = self.head_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=True + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=True + ) + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=True + ) + self.out_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + # scale post matmul + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + 
attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(attn_weights.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class SiglipMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( # config.hidden_size, config.intermediate_size + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( # config.intermediate_size, config.hidden_size + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SiglipEncoderLayer(nn.Module): + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = SiglipAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps + ) + self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + if output_attentions: + return hidden_states, attn_weights + return hidden_states, None + + +class SiglipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention( + config.hidden_size, config.num_attention_heads, batch_first=True + ) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(prefix, config, weights) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +import warnings + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, + mean: float = 0.0, + std: float = 1.0, + a: float = -2.0, + b: float = 2.0, +) -> torch.Tensor: + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. 
+ + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +from torch.nn.init import _calculate_fan_in_and_fan_out + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + + +from transformers import PreTrainedModel + + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SiglipConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, SiglipVisionEmbeddings): + width = ( + self.config.vision_config.hidden_size + if isinstance(self.config, SiglipConfig) + else self.config.hidden_size + ) + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.xavier_uniform_(module.q_proj.weight) + nn.init.xavier_uniform_(module.k_proj.weight) + nn.init.xavier_uniform_(module.v_proj.weight) + nn.init.xavier_uniform_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.xavier_uniform_(module.fc1.weight) + nn.init.xavier_uniform_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, SiglipMultiheadAttentionPoolingHead): + nn.init.xavier_uniform_(module.probe.data) + nn.init.xavier_uniform_(module.attention.in_proj_weight.data) + nn.init.zeros_(module.attention.in_proj_bias.data) + elif isinstance(module, SiglipModel): + logit_scale_init = torch.log(torch.tensor(1.0)) + module.logit_scale.data.fill_(logit_scale_init) + module.logit_bias.data.zero_() + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ + Args: + config: SiglipConfig + """ + + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + """ + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + hidden_states, _ = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + return hidden_states + + +class SiglipVisionTransformer(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = SiglipEncoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + r""" + Returns: + + """ + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + # NOTE: up until this point, the code logits are exactly + # the same as the transformers code. The values evaulate + # slightly differently in our encoder layer. 
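+        # The sharded SiglipEncoder below returns the last hidden state as a
+        # plain tensor (no BaseModelOutput wrapper); post_layernorm is applied
+        # to it and no pooled output is computed for the vision tower here.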
+ encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + ) + last_hidden_state = encoder_outputs + post_last_hidden_state = self.post_layernorm(last_hidden_state) + + return BaseModelOutputWithPooling( + last_hidden_state=post_last_hidden_state, + # pooler_output=pooled_output, + # hidden_states=encoder_outputs, + ) diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py index 690957d0..b74b43ff 100644 --- a/server/text_generation_server/models/custom_modeling/vlm.py +++ b/server/text_generation_server/models/custom_modeling/vlm.py @@ -11,6 +11,18 @@ def load_text_model(prefix, config, weights, name=None): ) return FlashMistralForCausalLM(prefix, config, weights, name=name) + elif config.model_type == "gemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights, causal=False) + elif config.model_type == "paligemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights) else: raise RuntimeError(f"Unsupported model type {config.model_type}") @@ -24,5 +36,13 @@ def load_vision_model(prefix, config, weights): return CLIPVisionTransformer( prefix=f"{prefix}.vision_model", config=config, weights=weights ) + if config.model_type == "siglip_vision_model": + from text_generation_server.models.custom_modeling.siglip import ( + SiglipVisionTransformer, + ) + + return SiglipVisionTransformer( + prefix=f"vision_tower.vision_model", config=config, weights=weights + ) else: raise RuntimeError(f"Unsupported model type {config.model_type}") diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 36351252..c029d8f3 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -133,6 +133,17 @@ class FlashCausalLMBatch(Batch): device: torch.device, ) -> "FlashCausalLMBatch": batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer) + return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + + @classmethod + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + ) -> "FlashCausalLMBatch": position_ids = [] speculative_ids = [] cu_seqlen_prefill = [0] diff --git a/server/text_generation_server/models/flash_gemma.py b/server/text_generation_server/models/flash_gemma.py index 9c00a056..53bfd064 100644 --- a/server/text_generation_server/models/flash_gemma.py +++ b/server/text_generation_server/models/flash_gemma.py @@ -3,12 +3,11 @@ import torch.distributed from opentelemetry import trace from typing import Optional -from transformers.models.gemma import GemmaTokenizerFast +from transformers import AutoConfig, AutoTokenizer from text_generation_server.models import FlashCausalLM from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( FlashGemmaForCausalLM, - GemmaConfig, ) from text_generation_server.utils import ( initialize_torch_distributed, @@ -36,17 +35,15 @@ class FlashGemma(FlashCausalLM): else: raise NotImplementedError("FlashGemma is only available on GPU") - tokenizer = GemmaTokenizerFast.from_pretrained( + tokenizer = AutoTokenizer.from_pretrained( model_id, 
revision=revision, padding_side="left", truncation_side="left", trust_remote_code=trust_remote_code, - use_fast=True, - from_slow=False, ) - config = GemmaConfig.from_pretrained( + config = AutoConfig.from_pretrained( model_id, revision=revision, trust_remote_code=trust_remote_code ) config.quantize = quantize @@ -59,7 +56,9 @@ class FlashGemma(FlashCausalLM): if config.quantize in ["gptq", "awq"]: weights._set_gptq_params(model_id, revision) - model = FlashGemmaForCausalLM(config, weights) + # TODO hardcoded + prefix = "language_model" + model = FlashGemmaForCausalLM(prefix, config, weights, causal=True) torch.distributed.barrier(group=self.process_group) super(FlashGemma, self).__init__( diff --git a/server/text_generation_server/models/pali_gemma.py b/server/text_generation_server/models/pali_gemma.py new file mode 100644 index 00000000..d94b9526 --- /dev/null +++ b/server/text_generation_server/models/pali_gemma.py @@ -0,0 +1,123 @@ +import torch +import torch.distributed +from opentelemetry import trace +from typing import Optional, Tuple +from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLM, + VlmCausalLMBatch, + image_text_replacement, + load_data_uri, + split, +) +from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import ( + PaliGemmaForConditionalGeneration, +) +from transformers import AutoProcessor, AutoConfig, AutoImageProcessor + +tracer = trace.get_tracer(__name__) + + +class PaliGemmaBatch(VlmCausalLMBatch): + @classmethod + def batch_tokenized_inputs(cls, requests, tokenizer, processor, config): + batch_inputs = [] + image_inputs = [] + max_truncation = 0 + for r in requests: + chunks = split(r.inputs) + full_text = "" + image_id = 0 + for chunk in chunks: + if chunk["type"] == "text": + full_text += "" + chunk["content"] + "\n" + elif chunk["type"] == "image": + image = chunk["content"] + # Should never receive URLs anymore, processing should be done + # On the rust layer. + # This avoid making n queries per TP + # if image.startswith("https://") or image.startswith("http://"): + # image = processor.image_processor.fetch_images(image) + if image.startswith("data:"): + image = load_data_uri(image) + else: + raise RuntimeError( + "Cannot process input image not starting with data:" + ) + # TODO do_convert_RGB should be on by default ? 
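+                    # Converting to RGB guards against palettized / RGBA /
+                    # grayscale images reaching the image processor, which
+                    # presumably expects a fixed 3-channel layout for the
+                    # SigLIP patch embedding.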
+ image = image.convert("RGB") + image_input = processor.image_processor(image, return_tensors="pt") + full_text += image_text_replacement(image_input, config, image_id) + image_inputs.append(image_input) + else: + raise RuntimeError(f"Invalid chunk type {chunk['type']}") + + batch_inputs.append(full_text) + max_truncation = max(max_truncation, r.truncate) + + batch_tokenized_inputs = tokenizer( + batch_inputs, + truncation=True, + max_length=max_truncation, + add_special_tokens=False, + )["input_ids"] + if image_inputs: + image_input = image_inputs[0] + new_image_inputs = { + "pixel_values": torch.cat( + [img["pixel_values"] for img in image_inputs], dim=0 + ), + } + if "pixel_attention_mask" in image_input: + new_image_inputs["pixel_attention_mask"] = torch.cat( + [img["pixel_attention_mask"] for img in image_inputs], dim=0 + ) + if "image_sizes" in image_input: + new_image_inputs["image_sizes"] = torch.cat( + [img["image_sizes"] for img in image_inputs], dim=0 + ) + image_inputs = new_image_inputs + else: + image_inputs = None + return batch_tokenized_inputs, image_inputs + + +class PaliGemma(VlmCausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.processor = AutoProcessor.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + ) + + super().__init__( + config_cls=AutoConfig, + model_cls=PaliGemmaForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + @property + def batch_type(self): + return PaliGemmaBatch + + def get_layer_config(self, model) -> Tuple[int, int, int]: + return ( + len(model.text_model.model.layers), + model.text_model.model.num_key_value_heads, + model.text_model.model.head_size, + ) + + def max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 5394feb5..f0db89b2 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -15,6 +15,7 @@ from text_generation_server.models.flash_mistral import ( BaseFlashMistral, FlashMistralBatch, ) +from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch from text_generation_server.models.cache_manager import ( get_cache_manager, ) @@ -80,6 +81,9 @@ def image_text_replacement(image_input, config, image_id) -> str: logger.info(f"Found {num_features} in image of resolution {height}x{width}") return "" * num_features + + elif config.model_type == "paligemma": + return "" * config.text_config.num_image_tokens else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -193,7 +197,10 @@ class VlmCausalLMBatch(FlashMistralBatch): max_truncation = max(max_truncation, r.truncate) batch_tokenized_inputs = tokenizer( - batch_inputs, truncation=True, max_length=max_truncation + batch_inputs, + truncation=True, + max_length=max_truncation, + add_special_tokens=not config.model_type == "paligemma", )["input_ids"] if image_inputs: image_input = image_inputs[0] diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 0830656d..ae60fa63 100644 --- 
a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -116,6 +116,7 @@ if HAS_FLASH_ATTN_V2_CUDA: max_s, softmax_scale, window_size_left=-1, + causal=True, ): if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") @@ -134,7 +135,7 @@ if HAS_FLASH_ATTN_V2_CUDA: 0.0, softmax_scale, False, - True, + causal, window_size_left, 0, False, From f691a945aac1eb3f1b339366e1c943e68438893a Mon Sep 17 00:00:00 2001 From: phangiabao98 <60313144+phangiabao98@users.noreply.github.com> Date: Thu, 16 May 2024 15:17:00 +0700 Subject: [PATCH 19/46] OpenAI function calling compatible support (#1888) # What does this PR do? Fixes # (issue) https://github.com/huggingface/text-generation-inference/issues/1887 ## Before submitting - [no ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [yes] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ yes] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [yes ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ yes] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
@Narsil --> --------- Co-authored-by: Bao Phan --- router/src/infer.rs | 23 +++++++++++++++++++++++ router/src/lib.rs | 8 ++++++-- router/src/server.rs | 3 +-- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/router/src/infer.rs b/router/src/infer.rs index 85e8775e..9646deb9 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -990,24 +990,28 @@ mod tests { content: Some("Hi!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("Hello how can I help?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("What is Deep Learning?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("magic!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ], bos_token: Some("[BOS]"), @@ -1060,30 +1064,35 @@ mod tests { content: Some("Hi!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("Hi again!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("Hello how can I help?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("What is Deep Learning?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("magic!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ], bos_token: Some("[BOS]"), @@ -1141,24 +1150,28 @@ mod tests { content: Some("Hi!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("Hello how can I help?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("What is Deep Learning?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("magic!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ], bos_token: Some("[BOS]"), @@ -1200,24 +1213,28 @@ mod tests { content: Some("Hi!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("Hello how can I help?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("What is Deep Learning?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("magic!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ], bos_token: Some("[BOS]"), @@ -1245,18 +1262,21 @@ mod tests { content: Some("Hello, how are you?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "assistant".to_string(), content: Some("I'm doing great. 
How can I help you today?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("I'd like to show off how chat templating works!".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ]; @@ -1268,6 +1288,7 @@ mod tests { ), name: None, tool_calls: None, + tool_call_id: None, }] .iter() .chain(&example_chat) @@ -1412,12 +1433,14 @@ mod tests { content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, Message { role: "user".to_string(), content: Some("How many helicopters can a human eat in one sitting?".to_string()), name: None, tool_calls: None, + tool_call_id: None, }, ], add_generation_prompt: true, diff --git a/router/src/lib.rs b/router/src/lib.rs index 96a9fdf6..85e18dfb 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -546,6 +546,7 @@ impl ChatCompletion { content: output, name: None, tool_calls, + tool_call_id: None, }, logprobs: return_logprobs .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))), @@ -881,7 +882,7 @@ pub(crate) struct ChatTemplateInputs<'a> { #[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] pub(crate) struct ToolCall { - pub id: u32, + pub id: String, pub r#type: String, pub function: FunctionDefinition, } @@ -954,13 +955,16 @@ pub(crate) struct Message { pub role: String, #[serde(skip_serializing_if = "Option::is_none")] #[schema(example = "My name is David and I")] - #[serde(deserialize_with = "message_content_serde::deserialize")] + #[serde(default, deserialize_with = "message_content_serde::deserialize")] pub content: Option, #[serde(default, skip_serializing_if = "Option::is_none")] #[schema(example = "\"David\"")] pub name: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub tool_calls: Option>, + #[serde(default, skip_serializing_if = "Option::is_none")] + #[schema(example = "\"get_weather\"")] + pub tool_call_id: Option, } #[derive(Clone, Debug, Deserialize, ToSchema)] diff --git a/router/src/server.rs b/router/src/server.rs index 6b51109b..52652b72 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -990,7 +990,6 @@ async fn chat_completions( ) -> Result)> { let span = tracing::Span::current(); metrics::increment_counter!("tgi_request_count"); - let ChatRequest { logprobs, max_tokens, @@ -1162,7 +1161,7 @@ async fn chat_completions( ) })?; let tool_calls = vec![ToolCall { - id: 0, + id: "0".to_string(), r#type: "function".to_string(), function: FunctionDefinition { description: None, From 1687e00bfb4354f3b11386600cf86cf6b2e0c6d4 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 16 May 2024 16:59:05 +0200 Subject: [PATCH 20/46] Fixing types. (#1906) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? 
Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- router/src/infer.rs | 181 ++++++++------------------ router/src/lib.rs | 304 +++++++++++++++++++++++++++++--------------- 2 files changed, 256 insertions(+), 229 deletions(-) diff --git a/router/src/infer.rs b/router/src/infer.rs index 9646deb9..04328246 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -4,7 +4,7 @@ use crate::validation::{Validation, ValidationError}; use crate::{ ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse, - HubTokenizerConfig, Message, PrefillToken, Queue, Token, + HubTokenizerConfig, Message, MessageChunk, PrefillToken, Queue, Text, TextMessage, Token, }; use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools}; use futures::future::try_join_all; @@ -373,16 +373,15 @@ impl ChatTemplate { if self.use_default_tool_template { if let Some(last_message) = messages.last_mut() { if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt { - last_message.content = Some(format!( - "{}\n---\n{}\n{}", - last_message.content.as_deref().unwrap_or_default(), - tool_prompt, - tools - )); + last_message.content.push(MessageChunk::Text(Text { + text: format!("\n---\n{}\n{}", tool_prompt, tools), + })); } } } + let messages: Vec = messages.into_iter().map(|c| c.into()).collect(); + self.template .render(ChatTemplateInputs { messages, @@ -950,8 +949,7 @@ impl InferError { #[cfg(test)] mod tests { use crate::infer::raise_exception; - use crate::ChatTemplateInputs; - use crate::Message; + use crate::{ChatTemplateInputs, TextMessage}; use minijinja::Environment; #[test] @@ -985,33 +983,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1059,40 +1045,25 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi again!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hi again!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I 
help?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1145,33 +1116,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1208,33 +1167,21 @@ mod tests { let chat_template_inputs = ChatTemplateInputs { messages: vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hi!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hi!".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("Hello how can I help?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hello how can I help?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("What is Deep Learning?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "What is Deep Learning?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("magic!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "magic!".to_string(), }, ], bos_token: Some("[BOS]"), @@ -1257,38 +1204,24 @@ mod tests { #[test] fn test_many_chat_templates() { let example_chat = vec![ - Message { + TextMessage { role: "user".to_string(), - content: Some("Hello, how are you?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "Hello, how are you?".to_string(), }, - Message { + TextMessage { role: "assistant".to_string(), - content: Some("I'm doing great. How can I help you today?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "I'm doing great. 
How can I help you today?".to_string(), }, - Message { + TextMessage { role: "user".to_string(), - content: Some("I'd like to show off how chat templating works!".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "I'd like to show off how chat templating works!".to_string(), }, ]; - let example_chat_with_system = [Message { + let example_chat_with_system = [TextMessage { role: "system".to_string(), - content: Some( - "You are a friendly chatbot who always responds in the style of a pirate" - .to_string(), - ), - name: None, - tool_calls: None, - tool_call_id: None, + content: "You are a friendly chatbot who always responds in the style of a pirate" + .to_string(), }] .iter() .chain(&example_chat) @@ -1428,19 +1361,13 @@ mod tests { chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}", input: ChatTemplateInputs { messages: vec![ - Message { + TextMessage{ role: "system".to_string(), - content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(), }, - Message { + TextMessage{ role: "user".to_string(), - content: Some("How many helicopters can a human eat in one sitting?".to_string()), - name: None, - tool_calls: None, - tool_call_id: None, + content: "How many helicopters can a human eat in one sitting?".to_string(), }, ], add_generation_prompt: true, diff --git a/router/src/lib.rs b/router/src/lib.rs index 85e18dfb..5ae861dd 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -11,6 +11,7 @@ use queue::{Entry, Queue}; use serde::{Deserialize, Serialize}; use tokio::sync::OwnedSemaphorePermit; use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::warn; use utoipa::ToSchema; use validation::Validation; @@ -440,7 +441,7 @@ pub(crate) struct ChatCompletion { #[derive(Clone, Deserialize, Serialize, ToSchema)] pub(crate) struct ChatCompletionComplete { pub index: u32, - pub message: Message, + pub message: OutputMessage, pub logprobs: Option, pub finish_reason: String, } @@ -533,6 +534,30 @@ impl ChatCompletion { return_logprobs: bool, tool_calls: Option>, ) -> Self { + let message = match (output, tool_calls) { + (Some(content), None) => OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content, + }), + (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage { + role: "assistant".to_string(), + tool_calls, + }), + (Some(output), Some(_)) => { + warn!("Received both chat and tool call"); + OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content: output, + }) + } + (None, None) => { + warn!("Didn't receive an answer"); + OutputMessage::ChatMessage(TextMessage { + role: "assistant".into(), + content: "".to_string(), + }) + } + }; Self { id: String::new(), object: "text_completion".into(), @@ -541,13 +566,7 @@ impl ChatCompletion { system_fingerprint, choices: vec![ChatCompletionComplete { index: 0, - message: Message { - role: "assistant".into(), - content: output, - name: None, - tool_calls, - tool_call_id: None, - }, + 
message, logprobs: return_logprobs .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))), finish_reason: details.finish_reason.to_string(), @@ -569,6 +588,7 @@ pub(crate) struct CompletionCompleteChunk { pub model: String, pub system_fingerprint: String, } + #[derive(Clone, Deserialize, Serialize, ToSchema)] pub(crate) struct ChatCompletionChunk { pub id: String, @@ -589,21 +609,20 @@ pub(crate) struct ChatCompletionChoice { pub finish_reason: Option, } -#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)] -pub(crate) struct ChatCompletionDelta { - #[schema(example = "user")] - // TODO Modify this to a true enum. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub role: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - #[schema(example = "What is Deep Learning?")] - pub content: Option, - // default to None - #[serde(default, skip_serializing_if = "Option::is_none")] - pub tool_calls: Option, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct ToolCallDelta { + #[schema(example = "assistant")] + role: String, + tool_calls: DeltaToolCall, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)] +enum ChatCompletionDelta { + Chat(TextMessage), + Tool(ToolCallDelta), +} + +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] pub(crate) struct DeltaToolCall { pub index: u32, pub id: String, @@ -611,7 +630,7 @@ pub(crate) struct DeltaToolCall { pub function: Function, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)] +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] pub(crate) struct Function { pub name: Option, pub arguments: String, @@ -629,15 +648,13 @@ impl ChatCompletionChunk { finish_reason: Option, ) -> Self { let delta = match (delta, tool_calls) { - (Some(delta), _) => ChatCompletionDelta { - role: Some("assistant".to_string()), - content: Some(delta), - tool_calls: None, - }, - (None, Some(tool_calls)) => ChatCompletionDelta { - role: Some("assistant".to_string()), - content: None, - tool_calls: Some(DeltaToolCall { + (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: delta, + }), + (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { + role: "assistant".to_string(), + tool_calls: DeltaToolCall { index: 0, id: String::new(), r#type: "function".to_string(), @@ -645,13 +662,12 @@ impl ChatCompletionChunk { name: None, arguments: tool_calls[0].to_string(), }, - }), - }, - (None, None) => ChatCompletionDelta { - role: None, - content: None, - tool_calls: None, - }, + }, + }), + (None, None) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "".to_string(), + }), }; Self { id: String::new(), @@ -852,7 +868,7 @@ where state.end() } -#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default)] +#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)] pub(crate) struct FunctionDefinition { #[serde(default)] pub description: Option, @@ -872,7 +888,7 @@ pub(crate) struct Tool { #[derive(Clone, Serialize, Deserialize, Default)] pub(crate) struct ChatTemplateInputs<'a> { - messages: Vec, + messages: Vec, bos_token: Option<&'a str>, eos_token: Option<&'a str>, add_generation_prompt: bool, @@ -880,91 +896,113 @@ pub(crate) struct ChatTemplateInputs<'a> { tools_prompt: Option<&'a str>, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] 
+#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)] pub(crate) struct ToolCall { pub id: String, pub r#type: String, pub function: FunctionDefinition, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct Text { - #[serde(default)] - pub text: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct Url { + url: String, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct ImageUrl { - #[serde(default)] - pub url: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct ImageUrl { + image_url: Url, } -#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)] -pub(crate) struct Content { - pub r#type: String, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +struct Text { + text: String, +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[serde(tag = "type")] +#[serde(rename_all = "snake_case")] +enum MessageChunk { + Text(Text), + ImageUrl(ImageUrl), +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct Message { + #[schema(example = "user")] + role: String, + #[schema(example = "My name is David and I")] + #[serde(deserialize_with = "message_content_serde::deserialize")] + content: Vec, #[serde(default, skip_serializing_if = "Option::is_none")] - pub text: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub image_url: Option, + #[schema(example = "\"David\"")] + name: Option, } mod message_content_serde { use super::*; - use serde::de; - use serde::Deserializer; - use serde_json::Value; + use serde::{Deserialize, Deserializer}; - pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> + pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { - let value = Value::deserialize(deserializer)?; - match value { - Value::String(s) => Ok(Some(s)), - Value::Array(arr) => { - let results: Result, _> = arr - .into_iter() - .map(|v| { - let content: Content = - serde_json::from_value(v).map_err(de::Error::custom)?; - match content.r#type.as_str() { - "text" => Ok(content.text.unwrap_or_default()), - "image_url" => { - if let Some(url) = content.image_url { - Ok(format!("![]({})", url.url)) - } else { - Ok(String::new()) - } - } - _ => Err(de::Error::custom("invalid content type")), - } - }) - .collect(); - - results.map(|strings| Some(strings.join(""))) + #[derive(Deserialize)] + #[serde(untagged)] + enum Message { + Text(String), + Chunks(Vec), + } + let message: Message = Deserialize::deserialize(deserializer)?; + let chunks = match message { + Message::Text(text) => { + vec![MessageChunk::Text(Text { text })] } - Value::Null => Ok(None), - _ => Err(de::Error::custom("invalid token format")), + Message::Chunks(s) => s, + }; + Ok(chunks) + } +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct TextMessage { + #[schema(example = "user")] + pub role: String, + #[schema(example = "My name is David and I")] + pub content: String, +} + +impl From for TextMessage { + fn from(value: Message) -> Self { + TextMessage { + role: value.role, + content: value + .content + .into_iter() + .map(|c| match c { + MessageChunk::Text(Text { text }) => text, + MessageChunk::ImageUrl(image) => { + let url = image.image_url.url; + format!("![]({url})") + } + }) + .collect::>() + .join(""), } } } -#[derive(Clone, Deserialize, ToSchema, Serialize, Debug)] -pub(crate) struct Message 
{ - #[schema(example = "user")] - pub role: String, - #[serde(skip_serializing_if = "Option::is_none")] - #[schema(example = "My name is David and I")] - #[serde(default, deserialize_with = "message_content_serde::deserialize")] - pub content: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - #[schema(example = "\"David\"")] - pub name: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub tool_calls: Option>, - #[serde(default, skip_serializing_if = "Option::is_none")] - #[schema(example = "\"get_weather\"")] - pub tool_call_id: Option, +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +pub struct ToolCallMessage { + #[schema(example = "assistant")] + role: String, + tool_calls: Vec, +} + +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[serde(untagged)] +pub(crate) enum OutputMessage { + ChatMessage(TextMessage), + ToolCall(ToolCallMessage), } #[derive(Clone, Debug, Deserialize, ToSchema)] @@ -1127,7 +1165,7 @@ pub(crate) struct ErrorResponse { #[cfg(test)] mod tests { use super::*; - + use serde_json::json; use tokenizers::Tokenizer; pub(crate) async fn get_tokenizer() -> Tokenizer { @@ -1195,4 +1233,66 @@ mod tests { ); assert_eq!(config.eos_token, Some("<|end▁of▁sentence|>".to_string())); } + + #[test] + fn test_chat_simple_string() { + let json = json!( + + { + "model": "", + "messages": [ + {"role": "user", + "content": "What is Deep Learning?" + } + ] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert_eq!( + request.messages[0], + Message { + role: "user".to_string(), + content: vec![MessageChunk::Text(Text { + text: "What is Deep Learning?".to_string() + }),], + name: None + } + ); + } + + #[test] + fn test_chat_request() { + let json = json!( + + { + "model": "", + "messages": [ + {"role": "user", + "content": [ + {"type": "text", "text": "Whats in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + ] + + } + ] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert_eq!( + request.messages[0], + Message{ + role: "user".to_string(), + content: vec![ + MessageChunk::Text(Text { text: "Whats in this image?".to_string() }), + MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }) + ], + name: None + } + ); + } } From 313960a829c6119af3b17bfadb7d498712d09105 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 16 May 2024 17:21:00 +0200 Subject: [PATCH 21/46] Types. (#1909) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). 
- [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- router/src/lib.rs | 76 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index 5ae861dd..3df650e9 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -1236,15 +1236,12 @@ mod tests { #[test] fn test_chat_simple_string() { - let json = json!( - - { + let json = json!({ "model": "", - "messages": [ - {"role": "user", + "messages": [{ + "role": "user", "content": "What is Deep Learning?" - } - ] + }] }); let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); @@ -1262,24 +1259,15 @@ mod tests { #[test] fn test_chat_request() { - let json = json!( - - { + let json = json!({ "model": "", - "messages": [ - {"role": "user", + "messages": [{ + "role": "user", "content": [ {"type": "text", "text": "Whats in this image?"}, - { - "type": "image_url", - "image_url": { - "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" - }, - }, + {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}}, ] - - } - ] + }] }); let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); @@ -1295,4 +1283,50 @@ mod tests { } ); } + + #[test] + fn text_message_convert() { + let message = Message{ + role: "user".to_string(), + content: vec![ + MessageChunk::Text(Text { text: "Whats in this image?".to_string() }), + MessageChunk::ImageUrl(ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }) + ], + name: None + }; + let textmsg: TextMessage = message.into(); + assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)"); + } + #[test] + fn openai_output() { + let message = OutputMessage::ChatMessage(TextMessage { + role: "assistant".to_string(), + content: "This is the answer".to_string(), + }); + let serialized = serde_json::to_string(&message).unwrap(); + assert_eq!( + serialized, + r#"{"role":"assistant","content":"This is the answer"}"# + ); + + let message = OutputMessage::ToolCall(ToolCallMessage { + role: "assistant".to_string(), + tool_calls: vec![ToolCall { + id: "0".to_string(), + r#type: "function".to_string(), + function: FunctionDefinition { + description: None, + name: "myfn".to_string(), + arguments: json!({ + "format": "csv" + }), + }, + }], + }); + let serialized = serde_json::to_string(&message).unwrap(); + assert_eq!( + serialized, + r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"# + ); + } } From 125c8a05c37c0b14746a4e9df197ee22884f51fa Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 16 May 2024 21:40:10 +0200 Subject: [PATCH 22/46] Fixing signals. (#1910) Taking the signal handles later, so during loads, regular signal handling is done, we only need to handle SIGINT and SIGTERM during real loads to get more graceful shutdowns when queries are in flight. Fixes #1842 # What does this PR do? 
Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/text_generation_server/server.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index f52d801c..80076ba6 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -33,9 +33,6 @@ class SignalHandler: self.KEEP_PROCESSING = False -signal_handler = SignalHandler() - - class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): def __init__( self, @@ -216,7 +213,7 @@ def serve( await server.start() logger.info("Server started at {}".format(local_url)) - + signal_handler = SignalHandler() while signal_handler.KEEP_PROCESSING: await asyncio.sleep(0.5) From 398ad027c7f9270d2db609e1bdb2de823c207512 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 17 May 2024 11:35:49 +0200 Subject: [PATCH 23/46] Removing some unused code. (#1915) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
--- router/src/lib.rs | 7 +- .../models/custom_modeling/siglip.py | 129 ------------------ 2 files changed, 4 insertions(+), 132 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index 3df650e9..febbf277 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -589,7 +589,7 @@ pub(crate) struct CompletionCompleteChunk { pub system_fingerprint: String, } -#[derive(Clone, Deserialize, Serialize, ToSchema)] +#[derive(Clone, Serialize, ToSchema)] pub(crate) struct ChatCompletionChunk { pub id: String, pub object: String, @@ -601,7 +601,7 @@ pub(crate) struct ChatCompletionChunk { pub choices: Vec, } -#[derive(Clone, Deserialize, Serialize, ToSchema)] +#[derive(Clone, Serialize, ToSchema)] pub(crate) struct ChatCompletionChoice { pub index: u32, pub delta: ChatCompletionDelta, @@ -616,7 +616,8 @@ pub struct ToolCallDelta { tool_calls: DeltaToolCall, } -#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)] +#[derive(Clone, Debug, Serialize, ToSchema)] +#[serde(untagged)] enum ChatCompletionDelta { Chat(TextMessage), Tool(ToolCallDelta), diff --git a/server/text_generation_server/models/custom_modeling/siglip.py b/server/text_generation_server/models/custom_modeling/siglip.py index f17d6562..5fbc6d29 100644 --- a/server/text_generation_server/models/custom_modeling/siglip.py +++ b/server/text_generation_server/models/custom_modeling/siglip.py @@ -64,45 +64,6 @@ class SiglipVisionEmbeddings(nn.Module): return embeddings -class SiglipTextEmbeddings(nn.Module): - def __init__(self, config: SiglipTextConfig): - super().__init__() - embed_dim = config.hidden_size - - self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding( - config.max_position_embeddings, embed_dim - ) - - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer( - "position_ids", - torch.arange(config.max_position_embeddings).expand((1, -1)), - persistent=False, - ) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - seq_length = ( - input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] - ) - - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] - - if inputs_embeds is None: - inputs_embeds = self.token_embedding(input_ids) - - position_embeddings = self.position_embedding(position_ids) - embeddings = inputs_embeds + position_embeddings - - return embeddings - - class SiglipAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -147,7 +108,6 @@ class SiglipAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" @@ -243,32 +203,18 @@ class SiglipEncoderLayer(nn.Module): self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states (`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. 
- output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ residual = hidden_states hidden_states = self.layer_norm1(hidden_states) hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, - output_attentions=output_attentions, ) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - if output_attentions: - return hidden_states, attn_weights return hidden_states, None @@ -406,58 +352,6 @@ def default_flax_embed_init(tensor): from transformers import PreTrainedModel -class SiglipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SiglipConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, SiglipVisionEmbeddings): - width = ( - self.config.vision_config.hidden_size - if isinstance(self.config, SiglipConfig) - else self.config.hidden_size - ) - nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) - elif isinstance(module, nn.Embedding): - default_flax_embed_init(module.weight) - elif isinstance(module, SiglipAttention): - nn.init.xavier_uniform_(module.q_proj.weight) - nn.init.xavier_uniform_(module.k_proj.weight) - nn.init.xavier_uniform_(module.v_proj.weight) - nn.init.xavier_uniform_(module.out_proj.weight) - nn.init.zeros_(module.q_proj.bias) - nn.init.zeros_(module.k_proj.bias) - nn.init.zeros_(module.v_proj.bias) - nn.init.zeros_(module.out_proj.bias) - elif isinstance(module, SiglipMLP): - nn.init.xavier_uniform_(module.fc1.weight) - nn.init.xavier_uniform_(module.fc2.weight) - nn.init.normal_(module.fc1.bias, std=1e-6) - nn.init.normal_(module.fc2.bias, std=1e-6) - elif isinstance(module, SiglipMultiheadAttentionPoolingHead): - nn.init.xavier_uniform_(module.probe.data) - nn.init.xavier_uniform_(module.attention.in_proj_weight.data) - nn.init.zeros_(module.attention.in_proj_bias.data) - elif isinstance(module, SiglipModel): - logit_scale_init = torch.log(torch.tensor(1.0)) - module.logit_scale.data.fill_(logit_scale_init) - module.logit_bias.data.zero_() - elif isinstance(module, (nn.Linear, nn.Conv2d)): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - class SiglipEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a @@ -483,36 +377,13 @@ class SiglipEncoder(nn.Module): self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[torch.Tensor] = None, ): - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. 
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Causal mask for the text model. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
-        """
         hidden_states = inputs_embeds
         for idx, encoder_layer in enumerate(self.layers):
             hidden_states, _ = encoder_layer(
                 hidden_states,
                 attention_mask,
-                output_attentions=output_attentions,
             )

         return hidden_states

From 166dc0b87d658af53f7fa15b4d6093925c2b51ec Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Fri, 17 May 2024 15:30:47 +0200
Subject: [PATCH 24/46] MI300 compatibility (#1764)

Adds support for AMD Instinct MI300 in TGI. Most changes are:

* Support PyTorch TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) to pick the best-performing GEMM/GEMV kernels for decoding. TunableOp is enabled by default and can be disabled with `PYTORCH_TUNABLEOP_ENABLED=0`.
* Update the ROCm dockerfile to PyTorch 2.3 (actually patched with changes from https://github.com/pytorch/pytorch/pull/124362)
* Support SILU & Linear custom kernels contributed by AMD
* Update vLLM paged attention to https://github.com/fxmarty/rocm-vllm/, branched off a much more recent upstream commit https://github.com/ROCm/vllm/commit/3489ce7936c5de588916ae3047c44c23c0b0c308
* Support the FA2 Triton kernel as recommended by AMD. It is enabled by default in the ROCm image (`ROCM_USE_FLASH_ATTN_V2_TRITON=1`); set it to `0` to use the CK implementation instead.
* Update the dockerfile to ROCm 6.1

By default, TunableOp tuning results are saved in `/data` (e.g. `/data/tunableop_meta-llama-Llama-2-70b-chat-hf_tp1_rank0.csv`) in order to avoid having to rerun the tuning at each `docker run`.
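The warmup added in `flash_causal_lm.py` further down drives this tuning through the `torch.cuda.tunable` API. As a rough, hedged sketch of that read/tune/write flow (the file path and GEMM shapes below are illustrative assumptions, not the exact values TGI uses):

```python
# Hedged sketch of the TunableOp persistence flow, not the exact TGI code.
# Assumes a ROCm build of PyTorch >= 2.3 and PYTORCH_TUNABLEOP_ENABLED=1 in the
# environment; the results path and matrix shapes are illustrative only.
import os
import torch

results_file = "/data/tunableop_example_tp1_rank0.csv"  # hypothetical path

# Reuse previously tuned GEMM selections if a results file already exists.
if os.path.isfile(results_file):
    torch.cuda.tunable.read_file(results_file)
torch.cuda.tunable.tuning_enable(True)

# Run a few representative decode-sized GEMMs so their shapes get tuned.
weight = torch.randn(8192, 8192, device="cuda", dtype=torch.float16)
for seqlen in (1, 2, 4, 8):  # illustrative sequence lengths
    x = torch.randn(seqlen, 8192, device="cuda", dtype=torch.float16)
    _ = x @ weight.t()

# Persist the picked kernels and freeze the selection for serving.
torch.cuda.tunable.write_file(results_file)
torch.cuda.tunable.tuning_enable(False)
```

The file written this way is a plain CSV mapping each tuned GEMM shape to the selected solution, which is what the saved `/data/tunableop_*.csv` files contain.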
Example: ``` Validator,PT_VERSION,2.3.0 Validator,ROCM_VERSION,6.1.0.0-82-5fabb4c Validator,HIPBLASLT_VERSION,0.7.0-1549b021 Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack- Validator,ROCBLAS_VERSION,4.1.0-cefa4a9b-dirty GemmTunableOp_Half_TN,tn_8192_7_28672,Gemm_Rocblas_45475,0.132098 GemmTunableOp_Half_TN,tn_10240_4_8192,Gemm_Rocblas_45546,0.0484431 GemmTunableOp_Half_TN,tn_32000_6_8192,Default,0.149546 GemmTunableOp_Half_TN,tn_32000_3_8192,Gemm_Rocblas_45520,0.147119 GemmTunableOp_Half_TN,tn_8192_3_28672,Gemm_Rocblas_45475,0.132645 GemmTunableOp_Half_TN,tn_10240_3_8192,Gemm_Rocblas_45546,0.0482971 GemmTunableOp_Half_TN,tn_57344_5_8192,Gemm_Rocblas_45520,0.255694 GemmTunableOp_Half_TN,tn_10240_7_8192,Gemm_Rocblas_45517,0.0482522 GemmTunableOp_Half_TN,tn_8192_3_8192,Gemm_Rocblas_45546,0.0444671 GemmTunableOp_Half_TN,tn_8192_5_8192,Gemm_Rocblas_45546,0.0445834 GemmTunableOp_Half_TN,tn_57344_7_8192,Gemm_Rocblas_45520,0.25622 GemmTunableOp_Half_TN,tn_8192_2_28672,Gemm_Rocblas_45475,0.132122 GemmTunableOp_Half_TN,tn_8192_4_8192,Gemm_Rocblas_45517,0.0453191 GemmTunableOp_Half_TN,tn_10240_5_8192,Gemm_Rocblas_45517,0.0482514 GemmTunableOp_Half_TN,tn_8192_5_28672,Gemm_Rocblas_45542,0.133914 GemmTunableOp_Half_TN,tn_8192_2_8192,Gemm_Rocblas_45517,0.0446516 GemmTunableOp_Half_TN,tn_8192_1_28672,Gemm_Hipblaslt_TN_10814,0.131953 GemmTunableOp_Half_TN,tn_10240_2_8192,Gemm_Rocblas_45546,0.0481043 GemmTunableOp_Half_TN,tn_32000_4_8192,Gemm_Rocblas_45520,0.147497 GemmTunableOp_Half_TN,tn_8192_6_28672,Gemm_Rocblas_45529,0.134895 GemmTunableOp_Half_TN,tn_57344_2_8192,Gemm_Rocblas_45520,0.254716 GemmTunableOp_Half_TN,tn_57344_4_8192,Gemm_Rocblas_45520,0.255731 GemmTunableOp_Half_TN,tn_10240_6_8192,Gemm_Rocblas_45517,0.0484816 GemmTunableOp_Half_TN,tn_57344_3_8192,Gemm_Rocblas_45520,0.254701 GemmTunableOp_Half_TN,tn_8192_4_28672,Gemm_Rocblas_45475,0.132159 GemmTunableOp_Half_TN,tn_32000_2_8192,Default,0.147524 GemmTunableOp_Half_TN,tn_32000_5_8192,Default,0.147074 GemmTunableOp_Half_TN,tn_8192_6_8192,Gemm_Rocblas_45546,0.0454045 GemmTunableOp_Half_TN,tn_57344_6_8192,Gemm_Rocblas_45520,0.255582 GemmTunableOp_Half_TN,tn_32000_7_8192,Default,0.146705 GemmTunableOp_Half_TN,tn_8192_7_8192,Gemm_Rocblas_45546,0.0445489 ``` --------- Co-authored-by: Mohit Sharma --- Dockerfile_amd | 73 +- docs/source/_toctree.yml | 10 +- .../basic_tutorials/gated_model_access.md | 2 +- docs/source/installation.md | 8 +- docs/source/installation_amd.md | 38 + docs/source/installation_gaudi.md | 3 + docs/source/installation_inferentia.md | 3 + docs/source/installation_nvidia.md | 18 + docs/source/quicktour.md | 23 +- docs/source/supported_models.md | 14 - server/Makefile-flash-att-v2 | 6 +- server/Makefile-vllm | 6 +- .../exllama_kernels/hip_compat.cuh | 5 +- .../layers/layernorm.py | 4 +- .../text_generation_server/layers/linear.py | 65 +- .../text_generation_server/layers/rotary.py | 4 +- .../custom_modeling/flash_cohere_modeling.py | 4 +- .../custom_modeling/flash_llama_modeling.py | 38 +- .../custom_modeling/flash_mistral_modeling.py | 38 +- .../custom_modeling/idefics_modeling.py | 4 +- .../models/flash_causal_lm.py | 73 +- .../models/flash_gpt2.py | 3 +- .../text_generation_server/models/globals.py | 9 + server/text_generation_server/server.py | 2 + .../utils/flash_attn.py | 62 +- .../utils/flash_attn_triton.py | 816 ++++++++++++++++++ .../utils/paged_attention.py | 136 +-- 27 files changed, 1288 insertions(+), 179 deletions(-) create mode 100644 docs/source/installation_amd.md create mode 100644 docs/source/installation_gaudi.md 
create mode 100644 docs/source/installation_inferentia.md create mode 100644 docs/source/installation_nvidia.md create mode 100644 server/text_generation_server/utils/flash_attn_triton.py diff --git a/Dockerfile_amd b/Dockerfile_amd index 57a7c637..6f8f874b 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -36,7 +36,7 @@ COPY launcher launcher RUN cargo build --release # Text Generation Inference base image for RoCm -FROM rocm/dev-ubuntu-22.04:5.7 as base +FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update as base RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ @@ -50,13 +50,24 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins # Needed to build VLLM & flash. rocthrust-dev \ hipsparse-dev \ - hipblas-dev && \ + hipblas-dev \ + hipblaslt-dev \ + rocblas-dev \ + hiprand-dev \ + rocrand-dev \ + miopen-hip-dev \ + hipfft-dev \ + hipcub-dev \ + hipsolver-dev \ + rccl-dev \ + cmake \ + python3-dev && \ rm -rf /var/lib/apt/lists/* # Keep in sync with `server/pyproject.toml ARG MAMBA_VERSION=23.1.0-1 -ARG PYTORCH_VERSION='2.2.0.dev0' -ARG ROCM_VERSION='5.7' +ARG PYTORCH_VERSION='2.3.0' +ARG ROCM_VERSION='6.0.2' ARG PYTHON_VERSION='3.10.10' # Automatically set by buildx ARG TARGETPLATFORM @@ -75,12 +86,43 @@ RUN chmod +x ~/mambaforge.sh && \ mamba init && \ rm ~/mambaforge.sh -# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6. -RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/ +# Install flash-attention, torch dependencies +RUN pip install numpy einops ninja --no-cache-dir + +RUN conda install intel::mkl-static intel::mkl-include +RUN pip uninstall -y triton && \ + git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \ + cd triton/python && \ + pip install . + +RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir + +ARG _GLIBCXX_USE_CXX11_ABI="1" +ARG CMAKE_PREFIX_PATH="/opt/conda" +ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942" +ARG BUILD_CAFFE2="0" \ + BUILD_CAFFE2_OPS="0" \ + USE_CUDA="0" \ + USE_ROCM="1" \ + BUILD_TEST="0" \ + USE_FBGEMM="0" \ + USE_NNPACK="0" \ + USE_QNNPACK="0" \ + USE_XNNPACK="0" \ + USE_FLASH_ATTENTION="1" \ + USE_MEM_EFF_ATTENTION="0" + +RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install + +# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm +ENV HIP_FORCE_DEV_KERNARG=1 + +# On MI300, performances for flash with Triton FA is very competitive (actually better than CK) +ENV ROCM_USE_FLASH_ATTN_V2_TRITON=1 FROM base AS kernel-builder -# Build vllm kernels +# # Build vllm kernels FROM kernel-builder AS vllm-builder WORKDIR /usr/src @@ -102,21 +144,21 @@ RUN make build-flash-attention-v2-rocm FROM kernel-builder as custom-kernels-builder WORKDIR /usr/src COPY server/custom_kernels/ . -RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build +RUN python setup.py build # Build exllama kernels FROM kernel-builder as exllama-kernels-builder WORKDIR /usr/src COPY server/exllama_kernels/ . -RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build +RUN python setup.py build # Build exllama v2 kernels FROM kernel-builder as exllamav2-kernels-builder WORKDIR /usr/src COPY server/exllamav2_kernels/ . 
-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build +RUN python setup.py build FROM base as base-copy @@ -140,9 +182,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 # Copy build artifacts from exllamav2 kernels builder COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages -# Install flash-attention dependencies -RUN pip install einops --no-cache-dir - # Install server COPY proto proto COPY server server @@ -160,7 +199,8 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher # AWS Sagemaker compatible image -FROM base-copy as sagemaker +FROM base as sagemaker + COPY sagemaker-entrypoint.sh entrypoint.sh RUN chmod +x entrypoint.sh @@ -169,5 +209,8 @@ ENTRYPOINT ["./entrypoint.sh"] # Final image FROM base-copy -ENTRYPOINT ["text-generation-launcher"] +COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh +RUN chmod +x /tgi-entrypoint.sh + +ENTRYPOINT ["/tgi-entrypoint.sh"] CMD ["--json-output"] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index c815b535..a52dd7f3 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -3,8 +3,16 @@ title: Text Generation Inference - local: quicktour title: Quick Tour + - local: installation_nvidia + title: Using TGI with Nvidia GPUs + - local: installation_amd + title: Using TGI with AMD GPUs + - local: installation_gaudi + title: Using TGI with Intel Gaudi + - local: installation_inferentia + title: Using TGI with AWS Inferentia - local: installation - title: Installation + title: Installation from source - local: supported_models title: Supported Models and Hardware - local: messages_api diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 060d177d..970afa0e 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HUGGING_FACE_HUB_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \ --model-id $model ``` diff --git a/docs/source/installation.md b/docs/source/installation.md index 3e62102d..b6c24d55 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,6 +1,10 @@ -# Installation +# Installation from source -This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.** + + +Installing TGI from source is not the recommended usage. We strongly recommend to use TGI through Docker, check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker. 
+ + ## Install CLI diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md new file mode 100644 index 00000000..9c6aa409 --- /dev/null +++ b/docs/source/installation_amd.md @@ -0,0 +1,38 @@ +# Using TGI with AMD GPUs + +TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs. + +On a server powered by AMD GPUs, TGI can be launched with the following command: + +```bash +model=teknium/OpenHermes-2.5-Mistral-7B +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + --device=/dev/kfd --device=/dev/dri --group-add video \ + --ipc=host --shm-size 256g --net host -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:2.0.3-rocm \ + --model-id $model +``` + +The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide. + +## TunableOp + +TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which allows to do an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt. + +Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3. + +TunableOp is enabled by default, the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launcher TGI's docker container. + +## Flash attention implementation + +Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/flash_attn_triton.py). + +By default, as its performances have experimentally been better, Triton implementation is used. It can be disabled (using CK implementation instead) by passing `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container. + +## Unsupported features + +The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future: +* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints. +* Kernel for sliding window attention (Mistral) diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md new file mode 100644 index 00000000..1ddf2b47 --- /dev/null +++ b/docs/source/installation_gaudi.md @@ -0,0 +1,3 @@ +# Using TGI with Intel Gaudi + +Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index). 
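To make the "queried from clients" step above concrete, here is a small, hedged example using plain `requests` against the `generate` endpoint; the host, port and prompt are assumptions and should be adjusted to however the container was launched:

```python
# Hedged client-side sketch; assumes TGI is reachable on 127.0.0.1:8080 as in the
# quicktour examples (adjust the URL to the port actually exposed by Docker).
import requests

payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 20},
}

resp = requests.post("http://127.0.0.1:8080/generate", json=payload, timeout=60)
resp.raise_for_status()
# /generate returns a JSON object whose `generated_text` field holds the completion.
print(resp.json()["generated_text"])
```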
diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md new file mode 100644 index 00000000..0394e6de --- /dev/null +++ b/docs/source/installation_inferentia.md @@ -0,0 +1,3 @@ +# Using TGI with Inferentia + +Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2. diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md new file mode 100644 index 00000000..62e1a3d6 --- /dev/null +++ b/docs/source/installation_nvidia.md @@ -0,0 +1,18 @@ +# Using TGI with Nvidia GPUs + +TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. + +For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. + +TGI can be used on NVIDIA GPUs through its official docker image: + +```bash +model=teknium/OpenHermes-2.5-Mistral-7B +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:2.0.3 \ + --model-id $model +``` + +The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide. diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 70cf575c..6137c6f6 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -2,30 +2,27 @@ The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/). -Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI. Here is an example on how to do that: +## Launching TGI + +Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example on how to do that: ```bash model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:2.0.3 \ + --model-id $model ``` - +### Supported hardware -To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. +TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on. 
- - -TGI also supports ROCm-enabled AMD GPUs (only MI210 and MI250 are tested), details are available in the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html). To launch TGI on ROCm GPUs, please use instead: - -```bash -docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model -``` +## Consuming TGI Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint. - @@ -91,7 +88,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:1.4 --help +docker run ghcr.io/huggingface/text-generation-inference:2.0.3 --help ``` diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index ceb25cfd..d478085e 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -40,17 +40,3 @@ If you wish to serve a supported model that already exists on a local folder, ju ```bash text-generation-launcher --model-id `````` - - -## Supported Hardware - -TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. - -TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention, GPTQ quantization, flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future: -* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints. -* Flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm) -* Kernel for sliding window attention (Mistral) - -TGI is also supported on the following AI hardware accelerators: -- *Habana first-gen Gaudi and Gaudi2:* check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index) -* *AWS Inferentia2:* check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2. 
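The updated table of contents above also surfaces the Messages API, whose output side is what the `TextMessage`/`OutputMessage` changes earlier in this series serialize. A hedged sketch of an OpenAI-style request against the `/v1/chat/completions` route (host, port, prompt and `max_tokens` value are assumptions):

```python
# Hedged sketch of a Messages API call; assumes TGI is reachable on 127.0.0.1:8080.
import requests

payload = {
    "model": "tgi",  # the value is not used for routing by the server
    "messages": [{"role": "user", "content": "What is Deep Learning?"}],
    "max_tokens": 32,
}

resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions", json=payload, timeout=60
)
resp.raise_for_status()
# With the OutputMessage serialization shown earlier, a plain text answer comes back
# as {"role": "assistant", "content": "..."} inside choices[0]["message"].
print(resp.json()["choices"][0]["message"]["content"])
```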
diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2 index 803b3d1f..36ef576a 100644 --- a/server/Makefile-flash-att-v2 +++ b/server/Makefile-flash-att-v2 @@ -1,5 +1,5 @@ flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9 -flash_att_v2_commit_rocm := 8736558c287ff2ef28b24878e42828c595ac3e69 +flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6 flash-attention-v2-cuda: @@ -18,12 +18,12 @@ install-flash-attention-v2-cuda: build-flash-attention-v2-cuda flash-attention-v2-rocm: # Clone flash attention pip install -U packaging ninja --no-cache-dir - git clone https://github.com/fxmarty/flash-attention-rocm flash-attention-v2 + git clone https://github.com/ROCm/flash-attention.git flash-attention-v2 build-flash-attention-v2-rocm: flash-attention-v2-rocm cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) cd flash-attention-v2 && git submodule update --init --recursive - cd flash-attention-v2 && PYTORCH_ROCM_ARCH=gfx90a python setup.py build + cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build install-flash-attention-v2-rocm: build-flash-attention-v2-rocm cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 6f36c679..62fa413f 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -14,11 +14,11 @@ install-vllm-cuda: build-vllm-cuda vllm-rocm: # Clone vllm pip install -U ninja packaging --no-cache-dir - git clone https://github.com/fxmarty/vllm-public.git vllm + git clone https://github.com/fxmarty/rocm-vllm.git vllm build-vllm-rocm: vllm-rocm - cd vllm && git fetch && git checkout ad9b7c4095ef54419a0533d254f2ad84bd2dfcae - cd vllm && python setup.py build + cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479 + cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install install-vllm-rocm: build-vllm-rocm pip uninstall vllm -y || true diff --git a/server/exllama_kernels/exllama_kernels/hip_compat.cuh b/server/exllama_kernels/exllama_kernels/hip_compat.cuh index 5e698b1a..f2a3dcad 100644 --- a/server/exllama_kernels/exllama_kernels/hip_compat.cuh +++ b/server/exllama_kernels/exllama_kernels/hip_compat.cuh @@ -10,8 +10,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) { } __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { - return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)), - static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))}; + return _Float16_2{ + _Float16_2{static_cast<_Float16>(1.0f), + static_cast<_Float16>(1.0f)} / x.data}; } #define hrcp __compat_hrcp diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py index 15d24e80..c4aa6c7d 100644 --- a/server/text_generation_server/layers/layernorm.py +++ b/server/text_generation_server/layers/layernorm.py @@ -72,7 +72,7 @@ if SYSTEM == "cuda": return normed_hidden_states, residual elif SYSTEM == "rocm": - from vllm import layernorm_ops + from vllm._C import ops class FastLayerNorm(nn.LayerNorm): def forward(self, hidden_states, residual=None): @@ -172,7 +172,7 @@ class FastRMSNorm(nn.Module): residual = hidden_states out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( + ops.rms_norm( out, hidden_states, self.weight.data, diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py index 8de6ead0..5bd6aa95 100644 --- 
a/server/text_generation_server/layers/linear.py +++ b/server/text_generation_server/layers/linear.py @@ -2,6 +2,12 @@ import torch from torch.nn import functional as F from text_generation_server.utils.import_utils import SYSTEM +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + class FastLinear(torch.nn.Module): def __init__( @@ -29,9 +35,66 @@ class FastLinear(torch.nn.Module): return F.linear(input, self.weight, self.bias) +class FastLinearROCm(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight) + if bias is not None: + self.bias = torch.nn.Parameter(bias) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, inp: torch.Tensor) -> torch.Tensor: + weight = self.weight + bias = self.bias + + if SYSTEM == "rocm" and inp.numel() // inp.shape[-1] == 1: + batched = False + inp_shape = inp.shape + + if inp.dim() == 3: + inp = inp.view(-1, inp_shape[-1]) + batched = True + + m, k = weight.shape[0], inp_shape[1] + out = torch.empty( + inp_shape[0], weight.shape[0], dtype=inp.dtype, device="cuda" + ) + if (k == 8192 and (m == 1280 or m == 7168)) or (k == 3584 and m == 8192): + _custom_C.LLMM1(weight, inp, out, 8) + elif k <= 8192 and k % 8 == 0 and m % 4 == 0: + _custom_C.LLMM1(weight, inp, out, 4) + else: + out = F.linear(inp, weight) + + if batched: + out.view(*inp_shape[:-1], out.shape[-1]) + + if bias is not None: + out = out + bias + return out + return F.linear(inp, self.weight, self.bias) + + def get_linear(weight, bias, quantize): if quantize is None: - linear = FastLinear(weight, bias) + if SYSTEM == "rocm": + linear = FastLinearROCm(weight, bias) + else: + linear = FastLinear(weight, bias) elif quantize == "eetq": try: from text_generation_server.layers.eetq import EETQLinear diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index 503dd554..198e5d8d 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -8,7 +8,7 @@ if SYSTEM == "cuda": from flash_attn.layers.rotary import RotaryEmbedding import rotary_emb elif SYSTEM == "rocm": - from vllm import pos_encoding_ops + from vllm._C import ops def _create_inv_freq(dim, base, device): @@ -66,7 +66,7 @@ class PositionRotaryEmbedding(nn.Module): head_size = query.shape[-1] # Inplace operation, updating query and key. 
- pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True) + ops.rotary_embedding(query, key, head_size, cos, sin, True) elif SYSTEM == "xpu": ipex.llm.functional.rotary_embedding( query, key, sin, cos, query.size(-1), True diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 8c423eaf..bd8b8016 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -69,7 +69,7 @@ class CohereRotary(PositionRotaryEmbedding): rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": - from vllm import pos_encoding_ops + from vllm._C import ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 @@ -77,7 +77,7 @@ class CohereRotary(PositionRotaryEmbedding): head_size = query.shape[-1] # Inplace operation, updating query and key. - pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, False) + ops.rotary_embedding(query, key, head_size, cos, sin, False) else: raise ValueError( "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 40ccb576..47758d30 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -22,10 +22,12 @@ from typing import List, Optional, Tuple import torch import torch.distributed + from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,6 +40,12 @@ from text_generation_server.layers.layernorm import ( FastRMSNorm, ) +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") + def load_attention(config, prefix, weights): bias = config.attention_bias @@ -182,14 +190,16 @@ class FlashLlamaAttention(torch.nn.Module): class LlamaMLP(nn.Module): def __init__(self, prefix, config, weights): super().__init__() - act = config.hidden_act + self.hidden_act = config.hidden_act self.act = ( - ACT2FN[act] - if "gelu" not in act + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act else lambda x: torch.nn.functional.gelu( x, approximate=( - "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" ), ) ) @@ -221,9 +231,23 @@ class LlamaMLP(nn.Module): ) def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.shape[0] == 1 + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) class FlashLlamaLayer(nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 3e13c26d..21edc79e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -26,6 +26,7 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils import paged_attention, flash_attn from text_generation_server.layers import ( TensorParallelRowLinear, @@ -40,6 +41,13 @@ from text_generation_server.layers.layernorm import ( ) +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") + + class MistralConfig(PretrainedConfig): model_type = "mistral" @@ -251,14 +259,16 @@ class MistralAttention(torch.nn.Module): class MistralMLP(nn.Module): def __init__(self, prefix, config, weights): super().__init__() - act = config.hidden_act + self.hidden_act = config.hidden_act self.act = ( - ACT2FN[act] - if "gelu" not in act + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act else lambda x: torch.nn.functional.gelu( x, approximate=( - "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" ), ) ) @@ -281,9 +291,23 @@ class MistralMLP(nn.Module): ) def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.shape[0] == 1 + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) class MistralLayer(nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index ec3f900b..d0c84308 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -60,7 +60,7 @@ from text_generation_server.utils.import_utils import SYSTEM if SYSTEM == "cuda": import dropout_layer_norm elif SYSTEM == "rocm": - from vllm import layernorm_ops + from vllm._C import ops else: raise RuntimeError(f"Unsupported system {SYSTEM}") @@ -420,7 +420,7 @@ class IdeficsRMSNorm(nn.Module): hidden_states = hidden_states.reshape(-1, shape[-1]) out = torch.empty_like(hidden_states) - layernorm_ops.rms_norm( + ops.rms_norm( out, hidden_states, self.weight.data, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index c029d8f3..333efe33 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -12,6 +12,9 @@ from dataclasses import dataclass from opentelemetry import trace from transformers import PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type, Dict + +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.speculate import get_speculate @@ -28,6 +31,7 @@ from text_generation_server.models.cache_manager import ( ) from text_generation_server.pb import generate_pb2 from text_generation_server.models.globals import MEM_POOL, CUDA_GRAPHS +import text_generation_server.models.globals as tgi_globals from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION @@ -783,6 +787,9 @@ class FlashCausalLM(Model): 
) max_bt = batch.max_blocks max_s = max_bt * get_cache_manager().block_size + + if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False): + torch.cuda.tunable.tuning_enable(False) _, batch, _ = self.generate_token(batch) except torch.cuda.OutOfMemoryError as e: raise RuntimeError( @@ -820,6 +827,49 @@ class FlashCausalLM(Model): self.device, ) + if SYSTEM == "rocm": + if ( + os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None + or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1" + ): + if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0": + torch.cuda.tunable.tuning_enable(True) + + if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None: + tuning_sequences = [ + int(val) + for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") + ] + else: + tuning_sequences = CUDA_GRAPHS + + tunableop_filepath = os.path.join( + HUGGINGFACE_HUB_CACHE, + f"tunableop_{tgi_globals.MODEL_ID.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv", + ) + + logger.info( + f"PyTorch TunableOp (https://github.com/fxmarty/pytorch/tree/2.3-patched/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`." + ) + + if os.path.isfile(tunableop_filepath): + logger.info( + f"The file {tunableop_filepath} already exists and will be reused." + ) + torch.cuda.tunable.read_file(tunableop_filepath) + + os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True) + + for seqlen in tuning_sequences: + logger.info(f"Warming up TunableOp for seqlen={seqlen}") + self.tunableop_warmup(seqlen) + torch.cuda.tunable.write_file(tunableop_filepath) + torch.cuda.tunable.tuning_enable(False) + else: + logger.info( + "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp." + ) + if CUDA_GRAPHS: try: logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}") @@ -834,6 +884,27 @@ class FlashCausalLM(Model): return int(num_blocks * BLOCK_SIZE) + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) + kv_cache = get_cache_manager().kv_cache + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. 
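+        # The dummy prefill below pushes a single sequence of `seqlen` tokens through the model,
+        # so that TunableOp benchmarks and records the GEMM shapes used at that sequence length
+        # while tuning is enabled.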
+ self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=torch.tensor( + [0, seqlen], device=self.device, dtype=torch.int32 + ), + kv_cache=get_cache_manager().kv_cache, + block_tables=None, + input_lengths=None, + slots=slots, + max_s=seqlen, + lm_head_indices=None, + ) + def forward( self, batch: FlashCausalLMBatch ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -1113,8 +1184,6 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 - logger.debug(f"Accepted ids {n_accepted_ids}") - current_stopped = False for j in range(index, index + n_accepted_ids): # Generated token diff --git a/server/text_generation_server/models/flash_gpt2.py b/server/text_generation_server/models/flash_gpt2.py index 5781f55e..0067a806 100644 --- a/server/text_generation_server/models/flash_gpt2.py +++ b/server/text_generation_server/models/flash_gpt2.py @@ -15,11 +15,10 @@ from text_generation_server.utils import ( weight_files, Weights, ) +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) -from text_generation_server.utils.import_utils import SYSTEM - class FlashGPT2(FlashCausalLM): def __init__( diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py index 6f8d1017..e8a11958 100644 --- a/server/text_generation_server/models/globals.py +++ b/server/text_generation_server/models/globals.py @@ -15,3 +15,12 @@ else: cuda_graphs = None CUDA_GRAPHS = cuda_graphs + +# This is overridden at model loading. +global MODEL_ID +MODEL_ID = None + + +def set_model_id(model_id: str): + global MODEL_ID + MODEL_ID = model_id diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 80076ba6..97828ffb 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -19,6 +19,7 @@ from text_generation_server.interceptor import ExceptionInterceptor from text_generation_server.models import Model, get_model from text_generation_server.pb import generate_pb2_grpc, generate_pb2 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor +from text_generation_server.models.globals import set_model_id class SignalHandler: @@ -217,6 +218,7 @@ def serve( while signal_handler.KEEP_PROCESSING: await asyncio.sleep(0.5) + set_model_id(model_id) asyncio.run( serve_inner( model_id, revision, sharded, speculate, dtype, trust_remote_code diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index ae60fa63..9ac5655c 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -2,14 +2,18 @@ import os import torch from loguru import logger +import math from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.flash_attn_triton import triton_attention if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false": raise ImportError("`USE_FLASH_ATTENTION` is false.") -HAS_FLASH_ATTN = True +HAS_FLASH_ATTN = False HAS_FLASH_ATTN_V2_CUDA = False HAS_FLASH_ATTN_V2_ROCM = False +ROCM_USE_FLASH_ATTN_V2_CK = False +ROCM_USE_FLASH_ATTN_V2_TRITON = False if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex @@ -57,10 +61,21 @@ if SYSTEM in {"cuda", "rocm"}: is_sm75 = major == 7 and minor == 5 is_sm8x = major == 8 and minor >= 0 is_sm90 = major == 9 and minor == 0 + is_sm94 = major == 9 and minor == 4 + + if SYSTEM == "rocm": + if ( + 
os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() == "true" + or os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "0") == "1" + ): + ROCM_USE_FLASH_ATTN_V2_TRITON = True + logger.info("ROCm: using Flash Attention 2 Triton implementation.") + else: + ROCM_USE_FLASH_ATTN_V2_CK = True + logger.info( + "ROCm: using Flash Attention 2 Composable Kernel implementation." + ) - HAS_FLASH_ATTN = False - HAS_FLASH_ATTN_V2_CUDA = False - HAS_FLASH_ATTN_V2_ROCM = False try: try: import flash_attn_2_cuda @@ -71,11 +86,16 @@ if SYSTEM in {"cuda", "rocm"}: "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" ) - if not (is_sm8x or is_sm90): + if SYSTEM == "cuda" and not (is_sm8x or is_sm90): raise ImportError( f"GPU with CUDA capability {major} {minor} is not supported for " "Flash Attention V2" ) + elif SYSTEM == "rocm" and not (is_sm8x or is_sm90 or is_sm94): + raise ImportError( + f"AMD GPU with compute capability {major} {minor} is not supported for " + "Flash Attention V2" + ) HAS_FLASH_ATTN_V2_CUDA = SYSTEM == "cuda" HAS_FLASH_ATTN_V2_ROCM = SYSTEM == "rocm" except ImportError as e: @@ -142,7 +162,7 @@ if HAS_FLASH_ATTN_V2_CUDA: None, ) -elif HAS_FLASH_ATTN_V2_ROCM: +elif HAS_FLASH_ATTN_V2_ROCM and ROCM_USE_FLASH_ATTN_V2_CK: def attention( q, @@ -153,6 +173,7 @@ elif HAS_FLASH_ATTN_V2_ROCM: max_s, softmax_scale, window_size_left=-1, + causal=True, ): if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") @@ -174,11 +195,38 @@ elif HAS_FLASH_ATTN_V2_ROCM: 0.0, softmax_scale, False, - True, + causal, False, None, ) +elif HAS_FLASH_ATTN_V2_ROCM and ROCM_USE_FLASH_ATTN_V2_TRITON: + + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + causal=True, + ): + output, _ = triton_attention( + q, + k, + v, + out, + cu_seqlens, + cu_seqlens, + max_s, + max_s, + causal, + softmax_scale, + ) + return output + elif HAS_FLASH_ATTN: def attention( diff --git a/server/text_generation_server/utils/flash_attn_triton.py b/server/text_generation_server/utils/flash_attn_triton.py new file mode 100644 index 00000000..3fe32231 --- /dev/null +++ b/server/text_generation_server/utils/flash_attn_triton.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
+ +Not currently supported: + +1) Non power of two head dims + +""" + +import torch +import triton +import triton.language as tl + +torch_dtype: tl.constexpr = torch.float16 + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + + +@triton.jit +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets( + philox_seed, philox_offset, dropout_p, m, n, stride + ).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) + rng_keep = rng_output > dropout_p + return rng_keep + + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor + + +@triton.jit +def _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + actual_seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + OFFS_M: tl.constexpr, + OFFS_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr, +): + # loop over k, v, and update accumulator + for start_n in range(block_min, block_max, BLOCK_N): + # For padded blocks, we will overrun the tensor size if + # we load all BLOCK_N. For others, the blocks are all within range. + k = load_fn( + K_block_ptr, + PADDED_HEAD, + MASK_STEPS and (n_extra_tokens != 0), + "zero", + ) + if PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + # We start from end of seqlen_k so only the first iteration would need + # to be checked for padding if it is not a multiple of block_n + # TODO: This can be optimized to only be true for the padded block. + if MASK_STEPS: # noqa: SIM102 + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps + # if not is_modulo_mn. last step might get wasted but that is okay. + # check if this masking works for that case. 
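+            # Concretely: only the final BLOCK_N tile can reach past actual_seqlen_k, and those
+            # out-of-range key columns are forced to -inf so they contribute nothing after exp2.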
+ if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None, :] + mask = size_n < boundary_m[:, None] + qk = tl.where(mask, qk, float("-inf")) + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + qk = tl.where(causal_mask, qk, float("-inf")) + # -- compute qk ---- + qk += tl.dot(q, k) + if bias_ptr is not None: + bias = load_fn( + bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero" + ) + # While bias is added after multiplying qk with sm_scale, our + # optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += bias * 1.44269504089 + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + philox_offset = ( + batch_philox_offset + + start_m * BLOCK_M * actual_seqlen_k + + start_n + - BLOCK_N + ) + keep = dropout_mask( + philox_seed, + philox_offset, + dropout_p, + BLOCK_M, + BLOCK_N, + actual_seqlen_k, + ) + if RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty), + ) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + p.to(encoded_softmax_block_ptr.type.element_ty), + ) + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + if not PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, BLOCK_N) + ) + return acc, l_i, m_i + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 64, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": True, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 32, + "BLOCK_N": 32, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + # TODO: This config fails with head_size not pow2 with data mismatches. 
+ # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config( + { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + ], + key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"], +) +@triton.jit +def attn_fwd( + Q, + K, + V, + bias, + sm_scale, + L, + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + stride_bz, + stride_bh, + stride_bm, + stride_bn, + cu_seqlens_q, + cu_seqlens_k, + dropout_p, + philox_seed, + philox_offset_base, + encoded_softmax, + HQ: tl.constexpr, + HK: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, + MAX_SEQLENS_Q: tl.constexpr, + MAX_SEQLENS_K: tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, +): + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. + if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N + ) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. 
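+        # Worked example: with BLOCK_M = BLOCK_N = 64, seqlen_q = 512, seqlen_k = 128 and start_m = 0,
+        # n_blocks_seqlen = cdiv(64 + 128 - 512, 64) is negative, i.e. the first 64 query rows sit
+        # entirely above the bottom-right aligned causal boundary, so this workgroup exits early below.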
+ if n_blocks <= 0: + o_offset = ( + off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + ) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. + # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return + + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + if GROUP_SIZE != 1: + off_h_k = off_h_q // GROUP_SIZE + else: + off_h_k = off_h_q + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL + + # Compute pointers for all the tensors used in this kernel. + q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k + ) + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
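+    # (When RETURN_ENCODED_SOFTMAX is false, the block pointer is simply set to 0 below and the
+    # inner loop never dereferences it.)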
+ if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero") + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. Similarly we do not need padding. + if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, + block_max, + 0, + 0, + 0, + bias_ptr, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, n_full_blocks) + ) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + # epilogue + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full( + (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32 + ) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. + # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + # else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. 
+ tl.store(O_block_ptr, acc, boundary_check=(0, 1)) + + +def check_args( + q, + k, + v, + o, + varlen=True, + max_seqlens=None, + cu_seqlens_q=None, + cu_seqlens_k=None, +): + assert q.dim() == k.dim() and q.dim() == v.dim() + if varlen: + assert q.dim() == 3 + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + assert cu_seqlens_q is not None + assert cu_seqlens_k is not None + assert len(cu_seqlens_q) == len(cu_seqlens_k) + else: + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + # TODO: Fix assert to check head size <=256 once supported + assert head_size <= 128 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 + + +class _attention(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + q, + k, + v, + o, + cu_seqlens_q, + cu_seqlens_k, + max_seqlens_q, + max_seqlens_k, + causal=False, + sm_scale=1.0, + bias=None, + ): + if o is None: + o = torch.empty_like(q, dtype=v.dtype) + + check_args( + q, + k, + v, + o, + varlen=True, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + ) + if True: # varlen + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + batch = len(cu_seqlens_q) - 1 + q_strides = (0, q.stride(1), q.stride(0), q.stride(2)) + k_strides = (0, k.stride(1), k.stride(0), k.stride(2)) + v_strides = (0, v.stride(1), v.stride(0), v.stride(2)) + o_strides = (0, o.stride(1), o.stride(0), o.stride(2)) + else: + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) + + # Get closest power of 2 over or equal to 32. + padded_d_model = 1 << (head_size - 1).bit_length() + padded_d_model = max(padded_d_model, 16) + + grid = lambda META: ( + triton.cdiv(max_seqlens_q, META["BLOCK_M"]), + nheads_q, + batch, + ) + + encoded_softmax = None + + # Seed the RNG so we get reproducible results for testing. 
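+        # Dropout is disabled in this forward-only path (ENABLE_DROPOUT=False below), so the fixed
+        # philox values only matter if dropout is ever re-enabled.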
+ philox_seed = 0x1BF52 + philox_offset = 0x1D4B42 + + if bias is not None: + bias_strides = ( + bias.stride(0), + bias.stride(1), + bias.stride(2), + bias.stride(3), + ) + else: + bias_strides = (0, 0, 0, 0) + + attn_fwd[grid]( + q, + k, + v, + bias, + sm_scale, + None, + o, + *q_strides, + *k_strides, + *v_strides, + *o_strides, + *bias_strides, + cu_seqlens_q, + cu_seqlens_k, + dropout_p=0.0, + philox_seed=philox_seed, + philox_offset_base=philox_offset, + encoded_softmax=encoded_softmax, + HQ=nheads_q, + HK=nheads_k, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=max_seqlens_q, + MAX_SEQLENS_K=max_seqlens_k, + IS_CAUSAL=causal, + VARLEN=True, + BLOCK_DMODEL=padded_d_model, + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False, + ) + + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + + +triton_attention = _attention.apply diff --git a/server/text_generation_server/utils/paged_attention.py b/server/text_generation_server/utils/paged_attention.py index 1b31f7e7..6cc30e6d 100644 --- a/server/text_generation_server/utils/paged_attention.py +++ b/server/text_generation_server/utils/paged_attention.py @@ -5,6 +5,14 @@ _PARTITION_SIZE = 512 if SYSTEM == "xpu": import intel_extension_for_pytorch as ipex +else: + try: + from vllm._C import cache_ops + from vllm._C import ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" + ) def reshape_and_cache( @@ -14,22 +22,14 @@ def reshape_and_cache( value_cache: torch.Tensor, slots: torch.Tensor, ): - if SYSTEM == "cuda": - from vllm._C import cache_ops - - cache_ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 - ) - elif SYSTEM == "rocm": - from vllm import cache_ops - - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots) - elif SYSTEM == "xpu": + if SYSTEM == "xpu": ipex.llm.modules.PagedAttention.reshape_and_cache( key, value, key_cache, value_cache, slots ) else: - raise ValueError("vllm is not supported on your system") + cache_ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0 + ) def attention( @@ -87,43 +87,21 @@ def attention( # to parallelize. use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: - if SYSTEM == "cuda": - from vllm._C import ops - - ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - elif SYSTEM == "rocm": - from vllm import attention_ops - - attention_ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - ) - else: - raise ValueError("vllm is not supported on your system") - + ops.paged_attention_v1( + out, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) else: # Run PagedAttention V2. 
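        # V2 splits the KV sequence into partitions of _PARTITION_SIZE tokens, computes partial
        # softmax statistics (exp_sums, max_logits) per partition, and reduces them afterwards,
        # which parallelizes better for long sequences with few concurrent requests.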
assert _PARTITION_SIZE % block_size == 0 @@ -139,45 +117,21 @@ def attention( ) max_logits = torch.empty_like(exp_sums) - if SYSTEM == "cuda": - from vllm._C import ops - - ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - elif SYSTEM == "rocm": - from vllm import attention_ops - - attention_ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - ) - else: - raise ValueError("vllm is not supported on your system") + ops.paged_attention_v2( + out, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) From 3631347766882f9a3278225ad27d469a2da8a4d9 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 17 May 2024 16:34:44 +0200 Subject: [PATCH 25/46] Add TGI monitoring guide through Grafana and Prometheus (#1908) As per title. It is very useful. --- assets/tgi_grafana.json | 3973 +++++++++++++++++++++ docs/source/_toctree.yml | 4 +- docs/source/basic_tutorials/monitoring.md | 75 + 3 files changed, 4051 insertions(+), 1 deletion(-) create mode 100644 assets/tgi_grafana.json create mode 100644 docs/source/basic_tutorials/monitoring.md diff --git a/assets/tgi_grafana.json b/assets/tgi_grafana.json new file mode 100644 index 00000000..6a23e811 --- /dev/null +++ b/assets/tgi_grafana.json @@ -0,0 +1,3973 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS_EKS API INFERENCE PROD", + "label": "Prometheus EKS API Inference Prod", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.2" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 2, + "id": 551, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "fieldMinMax": false, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 9, + "x": 0, + "y": 0 + }, + "id": 49, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + 
"reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "((histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) + histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))))>0 ", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Time to first token", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 9, + "y": 0 + }, + "id": 44, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m]))) * 1000)>0", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Decode per-token latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 45, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum((rate(tgi_request_generated_tokens_sum{container=\"$service\"}[10m]) / rate(tgi_request_generated_tokens_count{container=\"$service\"}[10m]))>0)", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Throughput (generated tok/s)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 48, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Number of tokens per prompt", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + 
"options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Number of generated tokens per request", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 20, + "panels": [], + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 4, + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(increase(tgi_request_success{container=\"$service\"}[1m]))", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": 
"sum(increase(tgi_request_failure{container=\"$service\"}[1m])) by (err)", + "hide": false, + "legendFormat": "Error: {{err}}", + "range": true, + "refId": "B" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 9, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Mean Time Per Token quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + 
"type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 9, + "x": 15, + "y": 16 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 13, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Mean Time Per Token", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 24 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "count(tgi_request_count{container=\"$service\"})", + "legendFormat": "Replicas", + "range": true, + "refId": "A" + } + ], + "title": "Number of replicas", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + 
"color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 24 + }, + "id": 32, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(tgi_queue_size{container=\"$service\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Queue Size", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 26, + "panels": [], + "title": "Batching", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 30 + }, + "id": 29, + "maxDataPoints": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "avg(tgi_batch_current_max_tokens{container=\"$service\"})", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Max tokens per batch", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": 
"byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 6, + "y": 30 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Speculated Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 5, + "x": 10, + "y": 30 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Prompt Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 15, + "y": 30 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Latency quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 35 + }, + "id": 27, + "maxDataPoints": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "avg(tgi_batch_current_size{container=\"$service\"})", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Batch Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 30, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 39 + }, + "id": 28, + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "sum(increase(tgi_batch_concat{container=\"$service\"}[1m])) by (reason)", + "hide": false, + "legendFormat": "Reason: {{ reason }}", + "range": true, + "refId": "B" + } + ], + "title": 
"Concatenates", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 6, + "y": 39 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Queue quantiles", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 22, + "panels": [], + "title": "Prefill", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Prefill Quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 49 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 14, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + 
"tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Prefill Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 24, + "panels": [], + "title": "Decode", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Decode quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 61 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 15, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Decode Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 72 + }, + "id": 43, + "panels": [], + "title": "Debug", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + 
] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 0, + "y": 73 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Forward quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 6, + "y": 73 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 35, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(increase(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Forward Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 12, + "y": 73 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Token Decode quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + 
"dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 18, + "y": 73 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 40, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Token Decode Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 0, + "y": 84 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": 
"list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Filter Batch quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 6, + "y": 84 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 39, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Filter Batch Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p90" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 12, + "y": 84 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p90", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))", + "hide": false, + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Batch Concat quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#5794F2", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 6, + "x": 18, + "y": 84 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 41, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "options": { + "calculate": false, + "calculation": {}, + "cellGap": 2, + "cellValues": {}, + "color": { + "exponent": 0.5, + "fill": "#5794F2", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": 
"exponential", + "scheme": "Spectral", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 1, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "10.4.2", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)", + "format": "heatmap", + "interval": "", + "legendFormat": "{{ le }}", + "range": true, + "refId": "A" + } + ], + "title": "Batch Concat latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1", + "value": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1" + }, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "definition": "label_values(tgi_request_count, container)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "service", + "options": [], + "query": { + "query": "label_values(tgi_request_count, container)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now-30s" + }, + "timepicker": { + "nowDelay": "30s" + }, + "timezone": "", + "title": "Text Generation Inference", + "uid": "RHSk7EL4kdqsd", + "version": 12, + "weekStart": "" +} diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index a52dd7f3..0fa02bc1 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -28,7 +28,7 @@ - local: basic_tutorials/using_cli title: Using TGI CLI - local: basic_tutorials/launcher - title: All TGI CLI options + title: All TGI CLI options - local: basic_tutorials/non_core_models title: Non-core Model Serving - local: basic_tutorials/safety @@ -37,6 +37,8 @@ title: Using Guidance, JSON, tools - local: basic_tutorials/visual_language_models title: Visual Language Models + - local: basic_tutorials/monitoring + title: Monitoring TGI with Prometheus and Grafana title: Tutorials - sections: - local: conceptual/streaming diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md new file mode 100644 index 00000000..a24cf902 --- /dev/null +++ b/docs/source/basic_tutorials/monitoring.md @@ -0,0 +1,75 @@ +# Monitoring TGI server with Prometheus and Grafana dashboard + +TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Example of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc. + +In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage. 
+
+![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
+
+## Setup on the server machine
+
+First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
+
+In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
+
+On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow the [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing, on a Linux machine:
+
+```
+wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
+tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
+cd prometheus
+```
+
+Prometheus needs to be configured to scrape TGI's port. To do so, in the Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
+```
+    static_configs:
+      - targets: ["0.0.0.0:80"]
+```
+to use the correct IP address and port.
+
+We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure the correct IP address and port are configured.
+
+Once Prometheus is configured, the Prometheus server can be launched on the same machine as TGI:
+```
+./prometheus --config.file="prometheus.yml"
+```
+
+In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward the Prometheus port (9090 by default) to the local computer. To do so, we can for example:
+* Use SSH [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
+* Use Ngrok port tunneling
+
+For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.
+
+For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
+```bash
+ngrok http http://0.0.0.0:9090
+```
+
+As a sanity check, one can make sure that the Prometheus server can be accessed from a local machine at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app).
+
+## Setup on the monitoring machine
+
+Monitoring is typically done on a machine other than the server. We use a Grafana dashboard to monitor the TGI server's usage.
+
+Two options are available:
+* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
+* Self-host a Grafana dashboard.
+
+In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
+
+```bash
+wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
+tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
+cd grafana-11.0.0
+./bin/grafana-server
+```
+
+Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
+
+Once logged in, the Prometheus data source for Grafana needs to be configured under the option `Add your first data source`.
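Before adding the data source, it can be useful to confirm from the monitoring machine that the forwarded Prometheus endpoint is reachable and is already scraping TGI. Below is a minimal sketch using the standard Prometheus HTTP API; the Ngrok URL is the example one from above and the metric is an assumption, so adapt both to your deployment:

```python
# Sketch: from the monitoring machine, check that the forwarded Prometheus
# endpoint answers and already has TGI series, before wiring it into Grafana.
# The Ngrok URL and the metric name are assumptions; adapt them to your setup.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

PROMETHEUS_URL = "https://d661-4-223-164-145.ngrok-free.app"  # URL printed by Ngrok
query = urlencode({"query": "sum(tgi_request_count)"})

with urlopen(f"{PROMETHEUS_URL}/api/v1/query?{query}", timeout=10) as response:
    payload = json.load(response)

# A "success" status with a non-empty result means Prometheus is scraping TGI.
print(payload["status"], payload["data"]["result"])
```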
There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app). + +Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like. + +Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/). + +Load your dashboard configuration, and your TGI dashboard should be ready to go! \ No newline at end of file From 24317977a7e0a842ad079adf2b821a5dd90cb4f2 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 17 May 2024 17:37:23 +0200 Subject: [PATCH 26/46] Update grafana template (#1918) As per title, there was a mistake credit to @Narsil updated https://huggingface.co/docs/text-generation-inference/basic_tutorials/monitoring as well Co-authored-by: Nicolas Patry --- assets/tgi_grafana.json | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/assets/tgi_grafana.json b/assets/tgi_grafana.json index 6a23e811..5f5a74ad 100644 --- a/assets/tgi_grafana.json +++ b/assets/tgi_grafana.json @@ -93,7 +93,7 @@ }, { "color": "red", - "value": 80 + "value": 1000 } ] }, @@ -103,7 +103,7 @@ }, "gridPos": { "h": 7, - "w": 9, + "w": 8, "x": 0, "y": 0 }, @@ -132,10 +132,36 @@ "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" }, "editorMode": "code", - "expr": "((histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) + histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))))>0 ", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) > 0", + "hide": true, "instant": false, + "legendFormat": "__auto", "range": true, - "refId": "A" + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))) * 1000) > 0", + "hide": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "$B + $C", + "hide": false, + "refId": "D", + "type": "math" } ], "title": "Time to first token", From 05600c55a52f590587a9971ce85fd678eb86ccad Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 17 May 2024 18:21:51 +0200 Subject: [PATCH 27/46] Fix TunableOp bug (#1920) cc @Narsil --- docs/source/basic_tutorials/monitoring.md | 2 +- .../models/flash_mistral.py | 22 +++++++++++++++++ server/text_generation_server/models/mamba.py | 24 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md index a24cf902..d6e50cfd 100644 
--- a/docs/source/basic_tutorials/monitoring.md +++ b/docs/source/basic_tutorials/monitoring.md @@ -72,4 +72,4 @@ Once Prometheus data source is configured, we can finally create our dashboard! Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/). -Load your dashboard configuration, and your TGI dashboard should be ready to go! \ No newline at end of file +Load your dashboard configuration, and your TGI dashboard should be ready to go! diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index b83f49a4..30ae95c9 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -391,6 +391,28 @@ class BaseFlashMistral(FlashCausalLM): def batch_type(self) -> Type[FlashMistralBatch]: return FlashMistralBatch + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) + kv_cache = get_cache_manager().kv_cache + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. + self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=torch.tensor( + [0, seqlen], device=self.device, dtype=torch.int32 + ), + kv_cache=get_cache_manager().kv_cache, + block_tables=None, + input_lengths=None, + slots=slots, + max_s=seqlen, + lm_head_indices=None, + prefill_cache_indices=None, + ) + def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py index b28b744f..d9f90590 100644 --- a/server/text_generation_server/models/mamba.py +++ b/server/text_generation_server/models/mamba.py @@ -522,6 +522,30 @@ class Mamba(Model): } self.cuda_graphs[batch_size] = graph_dict + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device) + n_blocks = len(self.model.blocks) + + d_state = self.model.config.d_state + d_conv = self.model.config.d_conv + # Inner takes the expand multiplication + d_inner = self.model.config.d_inner + + # Important seqlen_offset to go through the update mecanism with the state + seqlen_offset = 1 + inference_params = new_inference_params( + n_blocks=n_blocks, + batch_size=seqlen, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + seqlen_offset=seqlen_offset, + device=self.device, + dtype=self.dtype, + ) + + self.model.forward(input_ids=input_ids, inference_params=inference_params) + def forward( self, input_ids: torch.Tensor, inference_params: Any ) -> Tuple[torch.Tensor, torch.Tensor]: From 14ed7c7b4ad3aeb119cd2e4a4e7888612a35d2bc Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 17 May 2024 19:50:52 +0200 Subject: [PATCH 28/46] Fix TGI issues with ROCm (#1921) Not all models were tested in https://github.com/huggingface/text-generation-inference/pull/1764. 
Fixing some more issues (notably starcoder2) here, the full CI will come shortly once we split `build.yml` in two --- .../models/custom_modeling/flash_llama_modeling.py | 4 ++++ .../models/custom_modeling/flash_mistral_modeling.py | 4 ++++ server/text_generation_server/models/flash_causal_lm.py | 5 ++++- server/text_generation_server/models/flash_mistral.py | 5 ++++- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 47758d30..6e23aa2b 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -230,11 +230,15 @@ class LlamaMLP(nn.Module): config.intermediate_size // weights.process_group.size() ) + # TODO: This is a hotfix to be removed & properly refactored. + self.quantize = config.quantize + def forward(self, hidden_states): if ( SYSTEM == "rocm" and self.hidden_act == "silu" and hidden_states.shape[0] == 1 + and not self.quantize ): out = torch.empty( hidden_states.shape[0], diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 21edc79e..ef3777da 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -290,11 +290,15 @@ class MistralMLP(nn.Module): config.intermediate_size // weights.process_group.size() ) + # TODO: This is a hotfix to be removed & properly refactored. + self.quantize = config.quantize + def forward(self, hidden_states): if ( SYSTEM == "rocm" and self.hidden_act == "silu" and hidden_states.shape[0] == 1 + and not self.quantize ): out = torch.empty( hidden_states.shape[0], diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 333efe33..45ddd856 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -890,6 +890,9 @@ class FlashCausalLM(Model): slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) kv_cache = get_cache_manager().kv_cache + # Dummy value, some models (starcoder2) don't accept `None`. + input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. self.model.forward( input_ids=input_ids, @@ -899,7 +902,7 @@ class FlashCausalLM(Model): ), kv_cache=get_cache_manager().kv_cache, block_tables=None, - input_lengths=None, + input_lengths=input_lengths, slots=slots, max_s=seqlen, lm_head_indices=None, diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 30ae95c9..e6125e29 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -397,6 +397,9 @@ class BaseFlashMistral(FlashCausalLM): slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) kv_cache = get_cache_manager().kv_cache + # Dummy value, some models (starcoder2) don't accept `None`. 
+ input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. self.model.forward( input_ids=input_ids, @@ -406,7 +409,7 @@ class BaseFlashMistral(FlashCausalLM): ), kv_cache=get_cache_manager().kv_cache, block_tables=None, - input_lengths=None, + input_lengths=input_lengths, slots=slots, max_s=seqlen, lm_head_indices=None, From ed2539510a933c77d43a1e6ffdf3cf98c42ced58 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Sat, 18 May 2024 13:31:24 +0200 Subject: [PATCH 29/46] Fixing the download strategy for ibm-fms (#1917) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- server/text_generation_server/cli.py | 96 +++++++++++----------- server/text_generation_server/utils/hub.py | 2 - 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index 990c31be..74b87024 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -200,31 +200,27 @@ def download_weights( try: import json - medusa_head = hf_hub_download( - model_id, revision=revision, filename="medusa_lm_head.safetensors" - ) - medusa_config = hf_hub_download( + config = hf_hub_download( model_id, revision=revision, filename="config.json" ) - with open(medusa_config, "r") as f: + with open(config, "r") as f: config = json.load(f) - model_id = config["base_model_name_or_path"] - revision = "main" - try: - utils.weight_files(model_id, revision, extension) - logger.info( - f"Files for parent {model_id} are already present on the host. " - "Skipping download." 
- ) - return - # Local files not found - except ( - utils.LocalEntryNotFoundError, - FileNotFoundError, - utils.EntryNotFoundError, - ): - pass + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id and base_model_id != model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): pass @@ -241,31 +237,6 @@ def download_weights( if not extension == ".safetensors" or not auto_convert: raise e - elif (Path(model_id) / "medusa_lm_head.safetensors").exists(): - # Try to load as a local Medusa model - try: - import json - - medusa_head = Path(model_id) / "medusa_lm_head.safetensors" - medusa_config = Path(model_id) / "config.json" - with open(medusa_config, "r") as f: - config = json.load(f) - - model_id = config["base_model_name_or_path"] - revision = "main" - try: - utils.weight_files(model_id, revision, extension) - logger.info( - f"Files for parent {model_id} are already present on the host. " - "Skipping download." - ) - return - # Local files not found - except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): - pass - except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): - pass - elif (Path(model_id) / "adapter_config.json").exists(): # Try to load as a local PEFT model try: @@ -276,14 +247,43 @@ def download_weights( return except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): pass + elif (Path(model_id) / "config.json").exists(): + # Try to load as a local Medusa model + try: + import json + + config = Path(model_id) / "config.json" + with open(config, "r") as f: + config = json.load(f) + + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass # Try to see if there are local pytorch weights try: # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE - local_pt_files = utils.weight_files(model_id, revision, ".bin") + try: + local_pt_files = utils.weight_files(model_id, revision, ".bin") + except Exception: + local_pt_files = utils.weight_files(model_id, revision, ".pt") # No local pytorch weights - except utils.LocalEntryNotFoundError: + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): if extension == ".safetensors": logger.warning( f"No safetensors weights found for model {model_id} at revision {revision}. 
" diff --git a/server/text_generation_server/utils/hub.py b/server/text_generation_server/utils/hub.py index a81e659d..b56484f6 100644 --- a/server/text_generation_server/utils/hub.py +++ b/server/text_generation_server/utils/hub.py @@ -40,7 +40,6 @@ def _weight_hub_files_from_model_info( and "arguments" not in s.rfilename and "args" not in s.rfilename and "training" not in s.rfilename - and "medusa_lm_head" not in s.rfilename ] @@ -57,7 +56,6 @@ def _weight_files_from_dir(d: Path, extension: str) -> List[str]: and "args" not in f and "adapter" not in f and "training" not in f - and "medusa_lm_head" not in f ] return filenames From f1976851d937c04e0103bef32cf35f6960b2d0cf Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 20 May 2024 02:44:48 +0200 Subject: [PATCH 30/46] ROCm: make CK FA2 default instead of Triton (#1924) As per title. Triton autotune overhead is prohibitive, as it needs to be done for each different prompt length. --- Dockerfile_amd | 5 +++-- docs/source/installation_amd.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index 6f8f874b..92dd0ea8 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -117,8 +117,9 @@ RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install # Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm ENV HIP_FORCE_DEV_KERNARG=1 -# On MI300, performances for flash with Triton FA is very competitive (actually better than CK) -ENV ROCM_USE_FLASH_ATTN_V2_TRITON=1 +# On MI250 and MI300, performances for flash with Triton FA are slightly better than CK. +# However, Triton requires a tunning for each prompt length, which is prohibitive. +ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0 FROM base AS kernel-builder diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 9c6aa409..636d301c 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -29,7 +29,7 @@ TunableOp is enabled by default, the warmup may take 1-2 minutes. In case you wo Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/flash_attn_triton.py). -By default, as its performances have experimentally been better, Triton implementation is used. It can be disabled (using CK implementation instead) by passing `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container. +By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, FA Triton impelmentation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container. ## Unsupported features From 3adbc4cc04a62deeeb407e1a418d81eddb6bf091 Mon Sep 17 00:00:00 2001 From: Junlin Zhou Date: Wed, 22 May 2024 01:12:14 +0800 Subject: [PATCH 31/46] docs: Fix grafana dashboard url (#1925) # What does this PR do? Fixes an incorrect url in monitoring doc. 
## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- docs/source/basic_tutorials/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md index d6e50cfd..509b0aff 100644 --- a/docs/source/basic_tutorials/monitoring.md +++ b/docs/source/basic_tutorials/monitoring.md @@ -68,7 +68,7 @@ Once the Grafana server is launched, the Grafana interface is available at http: Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app). -Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like. +Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like. Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/). From b9469a187840e3ed7ee530302fa66729fd9f49be Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 22 May 2024 16:22:57 +0200 Subject: [PATCH 32/46] Creating doc automatically for supported models. (#1929) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? 
Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- docs/source/supported_models.md | 48 ++++++++++-------- update_doc.py | 88 ++++++++++++++++++++++++++++++--- 2 files changed, 107 insertions(+), 29 deletions(-) diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index d478085e..4b6cf731 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -1,30 +1,36 @@ + # Supported Models and Hardware Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported. ## Supported Models -The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them. - -- [BLOOM](https://huggingface.co/bigscience/bloom) -- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl) -- [Galactica](https://huggingface.co/facebook/galactica-120b) -- [GPT-2](https://huggingface.co/openai-community/gpt2) -- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b) -- [Llama](https://github.com/facebookresearch/llama) -- [OPT](https://huggingface.co/facebook/opt-66b) -- [SantaCoder](https://huggingface.co/bigcode/santacoder) -- [Starcoder](https://huggingface.co/bigcode/starcoder) -- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b) -- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b) -- [MPT](https://huggingface.co/mosaicml/mpt-30b) -- [Llama V2](https://huggingface.co/meta-llama) -- [Code Llama](https://huggingface.co/codellama) +- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal) +- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) +- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) +- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +- [Gemma](https://huggingface.co/google/gemma-7b) +- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) +- [Dbrx](https://huggingface.co/databricks/dbrx-instruct) +- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) - [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) -- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) -- [Phi](https://huggingface.co/microsoft/phi-2) -- [Idefics](HuggingFaceM4/idefics-9b-instruct) (Multimodal) -- [Llava-next](llava-hf/llava-v1.6-mistral-7b-hf) (Multimodal) +- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) +- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder) +- [Phi](https://huggingface.co/microsoft/phi-1_5) +- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) +- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct) +- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) +- [Qwen 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) +- [Opt](https://huggingface.co/facebook/opt-6.7b) +- [T5](https://huggingface.co/google/flan-t5-xxl) +- 
[Galactica](https://huggingface.co/facebook/galactica-120b) +- [SantaCoder](https://huggingface.co/bigcode/santacoder) +- [Bloom](https://huggingface.co/bigscience/bloom-560m) +- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct) +- [Gpt2](https://huggingface.co/openai-community/gpt2) +- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b) +- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal) + If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models: @@ -39,4 +45,4 @@ If you wish to serve a supported model that already exists on a local folder, ju ```bash text-generation-launcher --model-id -`````` +``` diff --git a/update_doc.py b/update_doc.py index 6127418c..5da81c72 100644 --- a/update_doc.py +++ b/update_doc.py @@ -1,13 +1,34 @@ import subprocess import argparse +import ast + +TEMPLATE = """ +# Supported Models and Hardware + +Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported. + +## Supported Models + +SUPPORTED_MODELS + +If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models: + +```python +# for causal LMs/text-generation models +AutoModelForCausalLM.from_pretrained(, device_map="auto")` +# or, for text-to-text generation models +AutoModelForSeq2SeqLM.from_pretrained(, device_map="auto") +``` + +If you wish to serve a supported model that already exists on a local folder, just point to the local folder. 
+ +```bash +text-generation-launcher --model-id +``` +""" -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--check", action="store_true") - - args = parser.parse_args() - +def check_cli(check: bool): output = subprocess.check_output(["text-generation-launcher", "--help"]).decode( "utf-8" ) @@ -41,7 +62,7 @@ def main(): block = [] filename = "docs/source/basic_tutorials/launcher.md" - if args.check: + if check: with open(filename, "r") as f: doc = f.read() if doc != final_doc: @@ -53,12 +74,63 @@ def main(): ).stdout.decode("utf-8") print(diff) raise Exception( - "Doc is not up-to-date, run `python update_doc.py` in order to update it" + "Cli arguments Doc is not up-to-date, run `python update_doc.py` in order to update it" ) else: with open(filename, "w") as f: f.write(final_doc) +def check_supported_models(check: bool): + filename = "server/text_generation_server/models/__init__.py" + with open(filename, "r") as f: + tree = ast.parse(f.read()) + + enum_def = [ + x for x in tree.body if isinstance(x, ast.ClassDef) and x.name == "ModelType" + ][0] + _locals = {} + _globals = {} + exec(f"import enum\n{ast.unparse(enum_def)}", _globals, _locals) + ModelType = _locals["ModelType"] + list_string = "" + for data in ModelType: + list_string += f"- [{data.value['name']}]({data.value['url']})" + if data.value.get("multimodal", None): + list_string += " (Multimodal)" + list_string += "\n" + + final_doc = TEMPLATE.replace("SUPPORTED_MODELS", list_string) + + filename = "docs/source/supported_models.md" + if check: + with open(filename, "r") as f: + doc = f.read() + if doc != final_doc: + tmp = "supported.md" + with open(tmp, "w") as g: + g.write(final_doc) + diff = subprocess.run( + ["diff", tmp, filename], capture_output=True + ).stdout.decode("utf-8") + print(diff) + raise Exception( + "Supported models is not up-to-date, run `python update_doc.py` in order to update it" + ) + else: + with open(filename, "w") as f: + f.write(final_doc) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--check", action="store_true") + + args = parser.parse_args() + + check_cli(args.check) + check_supported_models(args.check) + + if __name__ == "__main__": main() From 57ba035a61fe2031f175ad043a70d0c6c770c024 Mon Sep 17 00:00:00 2001 From: drbh Date: Wed, 22 May 2024 14:46:29 -0400 Subject: [PATCH 33/46] fix: use path inside of speculator config (#1935) This PR access the path on the speculator similar to `MLPSpeculatorHead.load` and `MedusaHeadV1.load` these changes resolves this error locally when loading a `MedusaHeadV2` ``` TypeError: expected str, bytes or os.PathLike object, not dict ``` --- server/text_generation_server/layers/medusa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py index 2e9a010f..7579ccdb 100644 --- a/server/text_generation_server/layers/medusa.py +++ b/server/text_generation_server/layers/medusa.py @@ -111,10 +111,10 @@ class MedusaHeadV2(nn.Module): from safetensors import safe_open import json - speculator = config.speculator + speculator_path = config.speculator["path"] - medusa_config = str(Path(speculator) / "config.json") - filename = str(Path(speculator) / "medusa_lm_head.safetensors") + medusa_config = str(Path(speculator_path) / "config.json") + filename = str(Path(speculator_path) / "medusa_lm_head.safetensors") with open(medusa_config, "r") as f: medusa_config = json.load(f) From 
a758d32c64fa5f3ea473dc2b5f9ed5e685cda3e3 Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 23 May 2024 05:34:18 -0400 Subject: [PATCH 34/46] feat: add train medusa head tutorial (#1934) This PR adds a tutorial to self distill and train medusa heads for a specific model --------- Co-authored-by: Nicolas Patry --- docs/source/_toctree.yml | 2 + docs/source/basic_tutorials/train_medusa.md | 208 ++++++++++++++++++++ docs/source/conceptual/speculation.md | 2 +- 3 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 docs/source/basic_tutorials/train_medusa.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 0fa02bc1..a7351a33 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -39,6 +39,8 @@ title: Visual Language Models - local: basic_tutorials/monitoring title: Monitoring TGI with Prometheus and Grafana + - local: basic_tutorials/train_medusa + title: Train Medusa title: Tutorials - sections: - local: conceptual/streaming diff --git a/docs/source/basic_tutorials/train_medusa.md b/docs/source/basic_tutorials/train_medusa.md new file mode 100644 index 00000000..76cb6bed --- /dev/null +++ b/docs/source/basic_tutorials/train_medusa.md @@ -0,0 +1,208 @@ +# Train Medusa + +This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation.md) for more information on how Medusa works and speculation in general. + +## What are the benefits of training a Medusa model? + +Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training. + +One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain. + +If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent. + +## Self-distillation (Generating data for training) + +There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model. + +Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output. + +We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence. + +## Training + +The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository. + +### Getting Started + +There are two methods for training the model: + +- `torchrun` that is a wrapper around `torch.distributed.launch` +- a forked version of `axlotl` that supports Medusa + +In this tutorial we'll use `torchrun` to train the model as it is the most straightforward way to train the model but similar steps can be followed to train the model using `axlotl` if you prefer. 
+ +### Training with `torchrun` + +```bash +mkdir medusa-training +cd medusa-training + +pyenv install 3.10 +pyenv local 3.10 + +uv venv -p 3.10 +source .venv/bin/activate +``` + +Now lets clone the original `Medusa` repository and install the library. + +```bash +git clone https://github.com/FasterDecoding/Medusa.git +cd Medusa +pip install -e . +``` + +Next we'll need some data to train on, we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub. + +```bash +apt install git-lfs +git lfs install +git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered +``` + +Currently our directory structure looks like this: + +```bash +. +├── assets +├── CITATION.cff +├── create_data.py +├── data_generation +├── deepspeed.json +├── last_run_prepared +├── LICENSE +├── llm_judge +├── medusa +├── medusa_llm.egg-info +├── mistral.json +├── notebooks +├── pyproject.toml +├── README.md +├── ROADMAP.md +├── scripts +├── ShareGPT_Vicuna_unfiltered +│   ├── README.md +│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json +│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json +├── simple_gradio_interface.py +├── tiny-llama.json +└── vicuna_7b_qlora_stage1 +``` + +## Start Training + +Now the lets generate the data and start training the model. This process will take a while since we are generating data from the model. + +First make sure you have an instance of TGI running with the model you want to use for self-distillation. + +```bash +model=HuggingFaceH4/zephyr-7b-beta +volume=/home/ubuntu/.cache/huggingface/hub/ + +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model +``` + +Now we can generate the data using the `create_data.py` script. + +```bash +python create_data.py \ + --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \ + --output-filename zephyr_self_distill.json +``` + +At this point our terminal should look like this: + +
+*(Screenshot of the terminal output from the self-distillation run)*
+ +> Note: In the screen shot above we are only using a the first 500 examples from the dataset to speed up the process, you should have a much larger dataset for training. + +Now we can finally get to the fun part and start training the model! + +Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file. + +> NOTE: If you just self-distilled you may still have the model running, make sure to stop it before starting the training in order to allow all of the resources to be used for training. + +```bash +WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \ + --model_name_or_path HuggingFaceH4/zephyr-7b-beta \ + --data_path zephyr_self_distill.json \ + --bf16 True \ + --output_dir zephyr_out \ + --num_train_epochs 5 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --learning_rate 1e-3 \ + --weight_decay 0.0 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --lazy_preprocess True \ + --medusa_num_heads 3 \ + --medusa_num_layers 1 \ + --deepspeed deepspeed.json +``` + +
+*(Screenshot of the terminal output from the torchrun Medusa training run)*
+ +If successful, you should see the similar output to the one below: + +```bash +wandb: Run history: +wandb: train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███ +wandb: train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███ +wandb: train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁ +wandb: train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁ +wandb: train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁ +wandb: train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇ +wandb: train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁ +wandb: train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇ +wandb: train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁ +wandb: train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇ +wandb: train/total_flos ▁ +wandb: train/train_loss ▁ +wandb: train/train_runtime ▁ +wandb: train/train_samples_per_second ▁ +wandb: train/train_steps_per_second ▁ +wandb: +wandb: Run summary: +wandb: train/epoch 2.0 +wandb: train/global_step 16 +wandb: train/learning_rate 0.0 +wandb: train/loss 14.8906 +wandb: train/medusa0_loss 4.25 +wandb: train/medusa0_top1 0.28809 +wandb: train/medusa1_loss 4.8125 +wandb: train/medusa1_top1 0.22727 +wandb: train/medusa2_loss 5.5 +wandb: train/medusa2_top1 0.17293 +wandb: train/total_flos 0.0 +wandb: train/train_loss 23.98242 +wandb: train/train_runtime 396.9266 +wandb: train/train_samples_per_second 2.519 +wandb: train/train_steps_per_second 0.04 +``` + +Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects. + +```bash +python -m medusa.hf_utils \ + --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \ + --repo drbh/zephyr_medusa_demo +``` + +Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉 diff --git a/docs/source/conceptual/speculation.md b/docs/source/conceptual/speculation.md index 79b1c82e..45618ae3 100644 --- a/docs/source/conceptual/speculation.md +++ b/docs/source/conceptual/speculation.md @@ -27,7 +27,7 @@ You can check a few existing fine-tunes for popular models: - [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa) -In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) +In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md) In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically. From 42693c4021b2c6165a263c2d8eb7cd831d57c7f7 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 23 May 2024 20:11:08 +0800 Subject: [PATCH 35/46] reenable xpu for tgi (#1939) # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? 
Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. Signed-off-by: Wang, Yi A --- Dockerfile_intel | 1 + .../text_generation_server/layers/rotary.py | 2 + .../custom_modeling/idefics_modeling.py | 2 +- .../utils/flash_attn.py | 79 ++++++++++--------- .../utils/import_utils.py | 2 +- 5 files changed, 45 insertions(+), 41 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 5bc39d64..809992e1 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -43,6 +43,7 @@ USER root RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \ dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index 198e5d8d..648d28ab 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -9,6 +9,8 @@ if SYSTEM == "cuda": import rotary_emb elif SYSTEM == "rocm": from vllm._C import ops +elif SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex def _create_inv_freq(dim, base, device): diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index d0c84308..786ef559 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -62,7 +62,7 @@ if SYSTEM == "cuda": elif SYSTEM == "rocm": from vllm._C import ops else: - raise RuntimeError(f"Unsupported system {SYSTEM}") + dropout_layer_norm = None @dataclass diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py index 9ac5655c..4f5cf10b 100644 --- a/server/text_generation_server/utils/flash_attn.py +++ b/server/text_generation_server/utils/flash_attn.py @@ -5,7 +5,9 @@ from loguru import logger import math from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.utils.flash_attn_triton import triton_attention + +if SYSTEM != "xpu": + from text_generation_server.utils.flash_attn_triton import triton_attention if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false": raise ImportError("`USE_FLASH_ATTENTION` is false.") @@ -15,43 +17,6 @@ HAS_FLASH_ATTN_V2_ROCM = False ROCM_USE_FLASH_ATTN_V2_CK = False ROCM_USE_FLASH_ATTN_V2_TRITON = False -if SYSTEM == "xpu": - import intel_extension_for_pytorch as ipex - - def attention( - q, - k, - v, - out, - cu_seqlens, - max_s, - softmax_scale, - window_size_left=-1, - ): - if window_size_left <= 0 and window_size_left != -1: - raise 
ValueError("`window_size_left` must be > 0 or -1") - - if window_size_left != -1: - raise ValueError( - f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." - ) - return ipex.llm.functional.varlen_attention( - q, - k, - v, - out, - cu_seqlens, - cu_seqlens, - max_s, - max_s, - 0.0, - softmax_scale, - False, - True, - False, - None, - ) - if SYSTEM in {"cuda", "rocm"}: if not torch.cuda.is_available(): @@ -124,8 +89,44 @@ if SYSTEM in {"cuda", "rocm"}: logger.warning(f"Unable to use Flash Attention V2: {e}") HAS_FLASH_ATTN = True +if SYSTEM == "xpu": + import intel_extension_for_pytorch as ipex -if HAS_FLASH_ATTN_V2_CUDA: + def attention( + q, + k, + v, + out, + cu_seqlens, + max_s, + softmax_scale, + window_size_left=-1, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") + + if window_size_left != -1: + raise ValueError( + f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})." + ) + return ipex.llm.functional.varlen_attention( + q, + k, + v, + out, + cu_seqlens, + cu_seqlens, + max_s, + max_s, + 0.0, + softmax_scale, + False, + True, + False, + None, + ) + +elif HAS_FLASH_ATTN_V2_CUDA: def attention( q, diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py index f54987eb..40e57646 100644 --- a/server/text_generation_server/utils/import_utils.py +++ b/server/text_generation_server/utils/import_utils.py @@ -17,7 +17,7 @@ def get_cuda_free_memory(device, memory_fraction): return free_memory -def get_xpu_free_memory(device): +def get_xpu_free_memory(device, memory_fraction): total_gpu_memory = torch.xpu.get_device_properties(device).total_memory free_memory = int(total_gpu_memory * 0.5) return free_memory From 7cf21294d11b05588625f487caead8cee11161b6 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 23 May 2024 14:39:38 +0200 Subject: [PATCH 36/46] Fixing some legacy behavior (big swapout of serverless on legacy stuff). (#1937) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
--------- Co-authored-by: Daniël de Kok --- launcher/src/main.rs | 59 ++++++++++++++++--------- server/text_generation_server/server.py | 31 +++++++++++-- 2 files changed, 65 insertions(+), 25 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d6b45c1d..b80e0230 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -23,10 +23,28 @@ use tracing_subscriber::EnvFilter; mod env_runtime; +#[derive(Deserialize)] +struct RawConfig { + max_position_embeddings: Option, + n_positions: Option, + max_seq_len: Option, +} + #[derive(Deserialize)] struct Config { max_position_embeddings: Option, - max_seq_len: Option, +} + +impl From for Config { + fn from(other: RawConfig) -> Self { + let max_position_embeddings = other + .max_position_embeddings + .or(other.max_seq_len) + .or(other.n_positions); + Config { + max_position_embeddings, + } + } } #[derive(Clone, Copy, Debug, ValueEnum)] @@ -1324,33 +1342,30 @@ fn main() -> Result<(), LauncherError> { }; let content = std::fs::read_to_string(filename)?; - let config: Config = serde_json::from_str(&content)?; + let config: RawConfig = serde_json::from_str(&content)?; + let config: Config = config.into(); // Quantization usually means you're even more RAM constrained. let max_default = 4096; - let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) { - (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => { - if max_position_embeddings > max_default { - let max = max_position_embeddings; - if args.max_input_tokens.is_none() - && args.max_total_tokens.is_none() - && args.max_batch_prefill_tokens.is_none() - { - tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1); - } - max_default - } else { - max_position_embeddings + if let Some(max_position_embeddings) = config.max_position_embeddings { + if max_position_embeddings > max_default { + let max = max_position_embeddings; + if args.max_input_tokens.is_none() + && args.max_total_tokens.is_none() + && args.max_batch_prefill_tokens.is_none() + { + tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. 
You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1); } + Ok(max_default) + } else { + Ok(max_position_embeddings) } - _ => { - return Err(Box::new(LauncherError::ArgumentValidation( - "no max defined".to_string(), - ))); - } - }; - Ok(max_position_embeddings) + } else { + Err(Box::new(LauncherError::ArgumentValidation( + "no max defined".to_string(), + ))) + } }; let max_position_embeddings: usize = get_max_position_embeddings().unwrap_or(4096); diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 97828ffb..5184731f 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -21,6 +21,18 @@ from text_generation_server.pb import generate_pb2_grpc, generate_pb2 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor from text_generation_server.models.globals import set_model_id +try: + from text_generation_server.models.pali_gemma import PaliGemmaBatch + from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLMBatch, + ) + from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch + + VLM_BATCH_TYPES = {PaliGemmaBatch, VlmCausalLMBatch, IdeficsCausalLMBatch} +except (ImportError, NotImplementedError): + # These imports can fail on CPU/Non flash. + VLM_BATCH_TYPES = set() + class SignalHandler: KEEP_PROCESSING = True @@ -91,9 +103,22 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): async def Prefill(self, request, context): start = time.time_ns() - batch = self.model.batch_type.from_pb( - request.batch, self.model.tokenizer, self.model.dtype, self.model.device - ) + if ( + self.model.batch_type in VLM_BATCH_TYPES + ): # Hack, i would rather use kwargs in the `from_pb` call + batch = self.model.batch_type.from_pb_processor( + request.batch, + self.model.tokenizer, + self.model.processor, + self.model.model.config, + self.model.dtype, + self.model.device, + ) + else: + batch = self.model.batch_type.from_pb( + request.batch, self.model.tokenizer, self.model.dtype, self.model.device + ) + generations, next_batch, timings = self.model.generate_token([batch]) self.cache.set(next_batch) From 4239e4d32726c2776b96c080c6317ccf34497946 Mon Sep 17 00:00:00 2001 From: Thomas Schillaci Date: Thu, 23 May 2024 15:37:09 +0200 Subject: [PATCH 37/46] Add completion route to client and add stop parameter where it's missing (#1869) # What does this PR do? - Add the stop parameter to the completion route - Add the completion method to the python client - Add the stop parameter to the python client's chat method ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? 
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. @Narsil --------- Co-authored-by: Thomas SCHILLACI Co-authored-by: Thomas Schillaci --- clients/python/text_generation/client.py | 186 +++++++++++++++++++++++ clients/python/text_generation/types.py | 106 ++++++++----- docs/openapi.json | 9 ++ router/src/lib.rs | 5 + router/src/server.rs | 25 ++- 5 files changed, 286 insertions(+), 45 deletions(-) diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py index 98c018d5..12966747 100644 --- a/clients/python/text_generation/client.py +++ b/clients/python/text_generation/client.py @@ -13,6 +13,9 @@ from text_generation.types import ( Request, Parameters, Grammar, + CompletionRequest, + Completion, + CompletionComplete, ChatRequest, ChatCompletionChunk, ChatComplete, @@ -70,6 +73,94 @@ class Client: self.cookies = cookies self.timeout = timeout + def completion( + self, + prompt: str, + frequency_penalty: Optional[float] = None, + max_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + stream: bool = False, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + stop: Optional[List[str]] = None, + ): + """ + Given a prompt, generate a response synchronously + + Args: + prompt (`str`): + Prompt + frequency_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + max_tokens (`int`): + Maximum number of generated tokens + repetition_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + seed (`int`): + Random sampling seed + stream (`bool`): + Stream the response + temperature (`float`): + The value used to module the logits distribution. 
+ top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated + """ + request = CompletionRequest( + model="tgi", + prompt=prompt, + frequency_penalty=frequency_penalty, + max_tokens=max_tokens, + repetition_penalty=repetition_penalty, + seed=seed, + stream=stream, + temperature=temperature, + top_p=top_p, + stop=stop, + ) + if not stream: + resp = requests.post( + f"{self.base_url}/v1/completions", + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + ) + payload = resp.json() + if resp.status_code != 200: + raise parse_error(resp.status_code, payload) + return Completion(**payload) + else: + return self._completion_stream_response(request) + + def _completion_stream_response(self, request): + resp = requests.post( + f"{self.base_url}/v1/completions", + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + stream=True, + ) + # iterate and print stream + for byte_payload in resp.iter_lines(): + if byte_payload == b"\n": + continue + payload = byte_payload.decode("utf-8") + if payload.startswith("data:"): + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + try: + response = CompletionComplete(**json_payload) + yield response + except ValidationError: + raise parse_error(resp.status, json_payload) + def chat( self, messages: List[Message], @@ -88,6 +179,7 @@ class Client: tools: Optional[List[Tool]] = None, tool_prompt: Optional[str] = None, tool_choice: Optional[str] = None, + stop: Optional[List[str]] = None, ): """ Given a list of messages, generate a response asynchronously @@ -130,6 +222,8 @@ class Client: A prompt to be appended before the tools tool_choice (`str`): The tool to use + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated """ request = ChatRequest( @@ -150,6 +244,7 @@ class Client: tools=tools, tool_prompt=tool_prompt, tool_choice=tool_choice, + stop=stop, ) if not stream: resp = requests.post( @@ -461,6 +556,93 @@ class AsyncClient: self.cookies = cookies self.timeout = ClientTimeout(timeout) + async def completion( + self, + prompt: str, + frequency_penalty: Optional[float] = None, + max_tokens: Optional[int] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + stream: bool = False, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + stop: Optional[List[str]] = None, + ) -> Union[Completion, AsyncIterator[CompletionComplete]]: + """ + Given a prompt, generate a response asynchronously + + Args: + prompt (`str`): + Prompt + frequency_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty + Penalize new tokens based on their existing frequency in the text so far, + decreasing the model's likelihood to repeat the same line verbatim. + max_tokens (`int`): + Maximum number of generated tokens + repetition_penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + seed (`int`): + Random sampling seed + stream (`bool`): + Stream the response + temperature (`float`): + The value used to module the logits distribution. 
+ top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated + """ + request = CompletionRequest( + model="tgi", + prompt=prompt, + frequency_penalty=frequency_penalty, + max_tokens=max_tokens, + repetition_penalty=repetition_penalty, + seed=seed, + stream=stream, + temperature=temperature, + top_p=top_p, + stop=stop, + ) + if not stream: + return await self._completion_single_response(request) + else: + return self._completion_stream_response(request) + + async def _completion_single_response(self, request): + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post( + f"{self.base_url}/v1/completions", json=request.dict() + ) as resp: + payload = await resp.json() + if resp.status != 200: + raise parse_error(resp.status, payload) + return Completion(**payload) + + async def _completion_stream_response(self, request): + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post( + f"{self.base_url}/v1/completions", json=request.dict() + ) as resp: + async for byte_payload in resp.content: + if byte_payload == b"\n": + continue + payload = byte_payload.decode("utf-8") + if payload.startswith("data:"): + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + try: + response = CompletionComplete(**json_payload) + yield response + except ValidationError: + raise parse_error(resp.status, json_payload) + async def chat( self, messages: List[Message], @@ -479,6 +661,7 @@ class AsyncClient: tools: Optional[List[Tool]] = None, tool_prompt: Optional[str] = None, tool_choice: Optional[str] = None, + stop: Optional[List[str]] = None, ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]: """ Given a list of messages, generate a response asynchronously @@ -521,6 +704,8 @@ class AsyncClient: A prompt to be appended before the tools tool_choice (`str`): The tool to use + stop (`List[str]`): + Stop generating tokens if a member of `stop` is generated """ request = ChatRequest( @@ -541,6 +726,7 @@ class AsyncClient: tools=tools, tool_prompt=tool_prompt, tool_choice=tool_choice, + stop=stop, ) if not stream: return await self._chat_single_response(request) diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index 5e32bc6f..eb872ee6 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -46,30 +46,6 @@ class Tool(BaseModel): function: dict -class ChatCompletionComplete(BaseModel): - # Index of the chat completion - index: int - # Message associated with the chat completion - message: Message - # Log probabilities for the chat completion - logprobs: Optional[Any] - # Reason for completion - finish_reason: str - # Usage details of the chat completion - usage: Optional[Any] = None - - -class CompletionComplete(BaseModel): - # Index of the chat completion - index: int - # Message associated with the chat completion - text: str - # Log probabilities for the chat completion - logprobs: Optional[Any] - # Reason for completion - finish_reason: str - - class Function(BaseModel): name: Optional[str] arguments: str @@ -95,24 +71,41 @@ class Choice(BaseModel): finish_reason: Optional[str] = None -class ChatCompletionChunk(BaseModel): - id: str - object: str - created: int +class 
CompletionRequest(BaseModel): + # Model identifier model: str - system_fingerprint: str - choices: List[Choice] + # Prompt + prompt: str + # The parameter for repetition penalty. 1.0 means no penalty. + # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + repetition_penalty: Optional[float] = None + # The parameter for frequency penalty. 1.0 means no penalty + # Penalize new tokens based on their existing frequency in the text so far, + # decreasing the model's likelihood to repeat the same line verbatim. + frequency_penalty: Optional[float] = None + # Maximum number of tokens to generate + max_tokens: Optional[int] = None + # Flag to indicate streaming response + stream: bool = False + # Random sampling seed + seed: Optional[int] = None + # Sampling temperature + temperature: Optional[float] = None + # Top-p value for nucleus sampling + top_p: Optional[float] = None + # Stop generating tokens if a member of `stop` is generated + stop: Optional[List[str]] = None -class ChatComplete(BaseModel): - # Chat completion details - id: str - object: str - created: int - model: str - system_fingerprint: str - choices: List[ChatCompletionComplete] - usage: Any +class CompletionComplete(BaseModel): + # Index of the chat completion + index: int + # Message associated with the chat completion + text: str + # Log probabilities for the chat completion + logprobs: Optional[Any] + # Reason for completion + finish_reason: str class Completion(BaseModel): @@ -163,6 +156,41 @@ class ChatRequest(BaseModel): tool_prompt: Optional[str] = None # Choice of tool to be used tool_choice: Optional[str] = None + # Stop generating tokens if a member of `stop` is generated + stop: Optional[List[str]] = None + + +class ChatCompletionComplete(BaseModel): + # Index of the chat completion + index: int + # Message associated with the chat completion + message: Message + # Log probabilities for the chat completion + logprobs: Optional[Any] + # Reason for completion + finish_reason: str + # Usage details of the chat completion + usage: Optional[Any] = None + + +class ChatComplete(BaseModel): + # Chat completion details + id: str + object: str + created: int + model: str + system_fingerprint: str + choices: List[ChatCompletionComplete] + usage: Any + + +class ChatCompletionChunk(BaseModel): + id: str + object: str + created: int + model: str + system_fingerprint: str + choices: List[Choice] class Parameters(BaseModel): diff --git a/docs/openapi.json b/docs/openapi.json index 2a387c2f..79c3b80f 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1121,6 +1121,15 @@ "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.", "example": 0.95, "nullable": true + }, + "stop": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Up to 4 sequences where the API will stop generating further tokens.", + "example": "null", + "nullable": true } } }, diff --git a/router/src/lib.rs b/router/src/lib.rs index febbf277..ba1d9acc 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -402,6 +402,11 @@ pub struct CompletionRequest { #[serde(default)] #[schema(example = "1.0")] pub frequency_penalty: Option, + + /// Up to 4 sequences where the API will stop generating further tokens. 
+ #[serde(default)] + #[schema(nullable = true, example = "null")] + pub stop: Option>, } #[derive(Clone, Deserialize, Serialize, ToSchema, Default)] diff --git a/router/src/server.rs b/router/src/server.rs index 52652b72..1edcc472 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -599,9 +599,22 @@ async fn completions( let span = tracing::Span::current(); metrics::increment_counter!("tgi_request_count"); - let stream = req.stream; - let max_new_tokens = req.max_tokens.or(Some(100)); - let seed = req.seed; + let CompletionRequest { + max_tokens, + seed, + stop, + stream, + temperature, + .. + } = req; + + let max_new_tokens = max_tokens.or(Some(100)); + let stop = stop.unwrap_or_default(); + // enable greedy only when temperature is 0 + let (do_sample, temperature) = match temperature { + Some(temperature) if temperature == 0.0 => (false, None), + other => (true, other), + }; // if suffix is present throw an error if req.suffix.is_some() { @@ -637,16 +650,16 @@ async fn completions( inputs: prompt.to_string(), parameters: GenerateParameters { best_of: None, - temperature: req.temperature, + temperature, repetition_penalty: req.repetition_penalty, frequency_penalty: req.frequency_penalty, top_k: None, top_p: req.top_p, typical_p: None, - do_sample: true, + do_sample, max_new_tokens, return_full_text: None, - stop: Vec::new(), + stop: stop.clone(), truncate: None, watermark: false, details: true, From 075092315e03b5309ebf7ed815398d9f3bb04249 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 23 May 2024 15:40:40 +0200 Subject: [PATCH 38/46] Improving the logging system. (#1938) - Added a debug log for speculated ids (helps seeing in logs quality of a speculator). - Remove newlines from child process logs when re-emitting in non JSON mode. - Made standard level be closer to what's expected (only our binaries level). - Propagate that level correctly to the shard (was forced into INFO). # What does this PR do? Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
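The new `stop` field on the completions route from #1869 above, together with the greedy path now taken when `temperature` is `0`, can be exercised directly against the OpenAI-style endpoint. A hedged sketch follows; the host, port, prompt and stop sequence are placeholders, and only the field names come from the request schema in that patch.

```bash
# Sketch of a /v1/completions request using the new `stop` parameter.
# "temperature": 0 exercises the greedy (do_sample = false) path added in the router.
curl http://127.0.0.1:8080/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "tgi",
          "prompt": "def fibonacci(n):",
          "max_tokens": 64,
          "temperature": 0,
          "stop": ["\n\n"]
        }'
```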
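For the logging changes in #1938 itself, the launcher now reads `LOG_LEVEL` and forwards that level to the Python shards instead of forcing `INFO`, so the new speculation debug message can be surfaced with an invocation along these lines (the model id is a placeholder and the exact shard output depends on your deployment):

```bash
# Sketch: run the launcher with debug logging propagated to the shards.
LOG_LEVEL=debug text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
```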
--- launcher/src/main.rs | 40 ++++++++++++++----- router/src/main.rs | 19 +++++++-- .../models/flash_causal_lm.py | 5 +++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index b80e0230..6dece8be 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -19,7 +19,7 @@ use std::thread::sleep; use std::time::{Duration, Instant}; use std::{fs, io}; use thiserror::Error; -use tracing_subscriber::EnvFilter; +use tracing_subscriber::{filter::LevelFilter, EnvFilter}; mod env_runtime; @@ -472,6 +472,7 @@ fn shard_manager( max_total_tokens: usize, max_batch_size: Option, otlp_endpoint: Option, + log_level: LevelFilter, status_sender: mpsc::Sender, shutdown: Arc, _shutdown_sender: mpsc::Sender<()>, @@ -494,7 +495,7 @@ fn shard_manager( "--uds-path".to_string(), uds_path, "--logger-level".to_string(), - "INFO".to_string(), + log_level.to_string().to_uppercase(), "--json-output".to_string(), ]; @@ -775,13 +776,13 @@ struct PythonLogMessage { impl PythonLogMessage { fn trace(&self) { match self.record.level.name { - PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text), - PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text), - PythonLogLevelEnum::Info => tracing::info!("{}", self.text), - PythonLogLevelEnum::Success => tracing::info!("{}", self.text), - PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text), - PythonLogLevelEnum::Error => tracing::error!("{}", self.text), - PythonLogLevelEnum::Critical => tracing::error!("{}", self.text), + PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()), + PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()), + PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()), + PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()), + PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()), + PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()), + PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()), } } } @@ -1001,6 +1002,7 @@ fn spawn_shards( args: &Args, cuda_graphs: Vec, max_total_tokens: usize, + max_log_level: LevelFilter, shutdown: Arc, shutdown_receiver: &mpsc::Receiver<()>, shutdown_sender: mpsc::Sender<()>, @@ -1058,6 +1060,7 @@ fn spawn_shards( max_total_tokens, max_batch_size, otlp_endpoint, + max_log_level, status_sender, shutdown, shutdown_sender, @@ -1298,8 +1301,22 @@ fn main() -> Result<(), LauncherError> { let args: Args = Args::parse(); // Filter events with LOG_LEVEL - let env_filter = - EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info")); + let varname = "LOG_LEVEL"; + let env_filter = if let Ok(log_level) = std::env::var(varname) { + // Override to avoid simple logs to be spammed with tokio level informations + let log_level = match &log_level[..] 
{ + "warn" => "text_generation_launcher=warn,text_generation_router=warn", + "info" => "text_generation_launcher=info,text_generation_router=info", + "debug" => "text_generation_launcher=debug,text_generation_router=debug", + log_level => log_level, + }; + EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .parse_lossy(log_level) + } else { + EnvFilter::new("info") + }; + let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO); if args.json_output { tracing_subscriber::fmt() @@ -1521,6 +1538,7 @@ fn main() -> Result<(), LauncherError> { &args, cuda_graphs, max_total_tokens, + max_log_level, shutdown.clone(), &shutdown_receiver, shutdown_sender, diff --git a/router/src/main.rs b/router/src/main.rs index ae7666a0..c3b8d047 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -23,7 +23,7 @@ use tokenizers::Tokenizer; use tower_http::cors::AllowOrigin; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; -use tracing_subscriber::{EnvFilter, Layer}; +use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer}; /// App Configuration #[derive(Parser, Debug)] @@ -482,8 +482,21 @@ fn init_logging(otlp_endpoint: Option, json_output: bool) { } // Filter events with LOG_LEVEL - let env_filter = - EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info")); + let varname = "LOG_LEVEL"; + let env_filter = if let Ok(log_level) = std::env::var(varname) { + // Override to avoid simple logs to be spammed with tokio level informations + let log_level = match &log_level[..] { + "warn" => "text_generation_launcher=warn,text_generation_router=warn", + "info" => "text_generation_launcher=info,text_generation_router=info", + "debug" => "text_generation_launcher=debug,text_generation_router=debug", + log_level => log_level, + }; + EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .parse_lossy(log_level) + } else { + EnvFilter::new("info") + }; tracing_subscriber::registry() .with(env_filter) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 45ddd856..86d9b4c8 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -17,6 +17,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.utils.dist import RANK from text_generation_server.utils.speculate import get_speculate from text_generation_server.models.types import ( Batch, @@ -1187,6 +1188,10 @@ class FlashCausalLM(Model): next_token_texts = [] left = 0 + if n_accepted_ids > 1: + if RANK == 0: + logger.debug(f"Speculated ids {n_accepted_ids - 1}") + current_stopped = False for j in range(index, index + n_accepted_ids): # Generated token From 42b0847a80c5e16efb28a3523cbe065afdd3acf5 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 May 2024 12:40:39 +0200 Subject: [PATCH 39/46] Fixing codellama loads by using purely `AutoTokenizer`. (#1947) - The need for the slow tokenizer default stems from back when llama 1 was introduced and all the flags where not supported in `tokenizers`. - Fixes #1891 # What does this PR do? 
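
With the slow-tokenizer fallback removed, loading goes through `AutoTokenizer` alone. A minimal sketch of the simplified path is shown below; the model id is only an illustrative example, and the `padding_side`/`truncation_side` keyword arguments are the same ones the server passes.

```python
# Sketch: AutoTokenizer resolves Llama/CodeLlama checkpoints to their fast,
# `tokenizers`-backed classes, so the old LlamaTokenizer fallback is unnecessary.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "codellama/CodeLlama-7b-hf",  # example model id
    padding_side="left",
    truncation_side="left",
)
print(type(tokenizer).__name__)  # expected to be a *TokenizerFast subclass
```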
Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- .../models/flash_llama.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py index 796fbd47..9a7dfaee 100644 --- a/server/text_generation_server/models/flash_llama.py +++ b/server/text_generation_server/models/flash_llama.py @@ -3,7 +3,6 @@ import torch.distributed from opentelemetry import trace from transformers import AutoConfig, AutoTokenizer, GenerationConfig -from transformers.models.llama import LlamaTokenizer from typing import Optional from text_generation_server.models import FlashCausalLM @@ -41,22 +40,13 @@ class FlashLlama(FlashCausalLM): else: raise NotImplementedError("FlashLlama is only available on GPU") - try: - tokenizer = LlamaTokenizer.from_pretrained( - model_id, - revision=revision, - padding_side="left", - truncation_side="left", - trust_remote_code=trust_remote_code, - ) - except Exception: - tokenizer = AutoTokenizer.from_pretrained( - model_id, - revision=revision, - padding_side="left", - truncation_side="left", - trust_remote_code=trust_remote_code, - ) + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) try: generation_config = GenerationConfig.from_pretrained( model_id, revision=revision, trust_remote_code=trust_remote_code From 179336888ecc97658ccdbaeb2f6ba6d9559ad846 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 May 2024 10:52:28 +0000 Subject: [PATCH 40/46] Modifing the version number. 
--- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- Dockerfile | 4 ++-- server/pyproject.toml | 2 +- server/requirements_cuda.txt | 18 +++++++++--------- server/requirements_rocm.txt | 18 +++++++++--------- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 31afeda9..2e75fe8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.22.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "base64" @@ -3552,7 +3552,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "2.0.2" +version = "2.0.4" dependencies = [ "average", "clap", @@ -3573,7 +3573,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "2.0.2" +version = "2.0.4" dependencies = [ "futures", "grpc-metadata", @@ -3590,7 +3590,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "2.0.2" +version = "2.0.4" dependencies = [ "clap", "ctrlc", @@ -3609,7 +3609,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "2.0.2" +version = "2.0.4" dependencies = [ "async-stream", "axum", diff --git a/Cargo.toml b/Cargo.toml index 34e55652..aafc8435 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ resolver = "2" [workspace.package] -version = "2.0.2" +version = "2.0.4" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/Dockerfile b/Dockerfile index 73a274dc..af3fa4a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -74,5 +74,5 @@ COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/ # Final image FROM base -ENTRYPOINT ["text-generation-launcher"] -CMD ["--json-output"] +#ENTRYPOINT ["text-generation-launcher"] +#CMD ["--json-output"] diff --git a/server/pyproject.toml b/server/pyproject.toml index c91e8849..5b6f720a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "2.0.2" +version = "2.0.4" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "] diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 9035f6bc..88fcc4f3 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13" googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13" grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" 
-huggingface-hub==0.23.0 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" -requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" -setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" +setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" -transformers @ git+https://github.com/huggingface/transformers.git@b8aee2e918d7ba2d5e9e80162ae26b4806873307 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index 9035f6bc..88fcc4f3 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13" filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13" googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13" grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.63.0 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.0 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version 
< "3.13" idna==3.7 ; python_version >= "3.9" and python_version < "3.13" loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" @@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13" py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.5.10 ; python_version >= "3.9" and python_version < "3.13" -requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" -setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13" +setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13" -transformers @ git+https://github.com/huggingface/transformers.git@b8aee2e918d7ba2d5e9e80162ae26b4806873307 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13" win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" From b34edc2ee9edc0ccadf9d2d0907ec9af0ce19644 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Wed, 17 Jul 2024 05:08:52 +0000 Subject: [PATCH 41/46] Upgrade to 2.0.4 Signed-off-by: yuanwu --- Dockerfile | 4 +- server/poetry.lock | 680 +++++++++++++++++++++++---------------------- 2 files changed, 349 insertions(+), 335 deletions(-) diff --git a/Dockerfile b/Dockerfile index af3fa4a5..73a274dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -74,5 +74,5 @@ COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/ # Final image FROM base -#ENTRYPOINT ["text-generation-launcher"] -#CMD ["--json-output"] +ENTRYPOINT ["text-generation-launcher"] +CMD ["--json-output"] diff --git a/server/poetry.lock b/server/poetry.lock index 70e51d64..f59225a7 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -194,13 +194,13 @@ files = [ [[package]] name = "certifi" -version = "2024.6.2" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"}, - {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -474,13 +474,13 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.2.1" +version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] @@ -628,17 +628,17 @@ tqdm = ["tqdm"] [[package]] name = "googleapis-common-protos" -version = "1.63.1" +version = "1.63.2" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"}, - {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"}, + {file = "googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"}, + {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"}, ] [package.dependencies] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] @@ -942,13 +942,13 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.2.1" +version = "8.0.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.2.1-py3-none-any.whl", hash = "sha256:ffef94b0b66046dd8ea2d619b701fe978d9264d38f3998bc4c27ec3b146a87c8"}, - {file = "importlib_metadata-7.2.1.tar.gz", hash = "sha256:509ecb2ab77071db5137c655e24ceb3eee66e7bbc6574165d0d114d9fc4bbe68"}, + {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"}, + {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"}, ] [package.dependencies] @@ -1025,13 +1025,13 @@ files = [ [[package]] name = "jsonschema" -version = "4.22.0" +version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = true python-versions = ">=3.8" files = [ - {file = "jsonschema-4.22.0-py3-none-any.whl", hash = 
"sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"}, - {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"}, + {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, + {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, ] [package.dependencies] @@ -1042,7 +1042,7 @@ rpds-py = ">=0.7.1" [package.extras] format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] [[package]] name = "jsonschema-specifications" @@ -1580,14 +1580,13 @@ files = [ [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.5.40" +version = "12.5.82" description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_aarch64.whl", hash = "sha256:004186d5ea6a57758fd6d57052a123c73a4815adf365eb8dd6a85c9eaa7535ff"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"}, - {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] [[package]] @@ -1757,24 +1756,24 @@ files = [ [[package]] name = "optimum" -version = "1.20.0" +version = "1.21.2" description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." 
optional = false python-versions = ">=3.7.0" files = [ - {file = "optimum-1.20.0-py3-none-any.whl", hash = "sha256:0c0d0746043c95e22cf3586946d7408d353f10c0486f1c7d2d11084a5cfc0ede"}, - {file = "optimum-1.20.0.tar.gz", hash = "sha256:b64c7536fe738db9b56605105efe72006401ad2aa00cb499ae407f2e06f3043b"}, + {file = "optimum-1.21.2-py3-none-any.whl", hash = "sha256:8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8"}, + {file = "optimum-1.21.2.tar.gz", hash = "sha256:037e65d265237809fac69e9003215c60cf6de56e97c62ff7565abab4a94a64ce"}, ] [package.dependencies] coloredlogs = "*" datasets = "*" huggingface-hub = ">=0.8.0" -numpy = "*" +numpy = "<2.0" packaging = "*" sympy = "*" torch = ">=1.11" -transformers = {version = ">=4.26.0,<4.42.0", extras = ["sentencepiece"]} +transformers = {version = ">=4.26.0,<4.43.0", extras = ["sentencepiece"]} [package.extras] amd = ["optimum-amd"] @@ -1787,15 +1786,16 @@ exporters-gpu = ["onnx", "onnxruntime-gpu", "timm"] exporters-tf = ["h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26.0,<4.38.0)"] furiosa = ["optimum-furiosa"] graphcore = ["optimum-graphcore"] -habana = ["optimum-habana", "transformers (>=4.38.0,<4.39.0)"] -intel = ["optimum-intel (>=1.16.0)"] -neural-compressor = ["optimum-intel[neural-compressor] (>=1.16.0)"] +habana = ["optimum-habana", "transformers (>=4.40.0,<4.41.0)"] +intel = ["optimum-intel (>=1.18.0)"] +ipex = ["optimum-intel[ipex] (>=1.18.0)"] +neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"] neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] -nncf = ["optimum-intel[nncf] (>=1.16.0)"] +nncf = ["optimum-intel[nncf] (>=1.18.0)"] onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)"] onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)"] -openvino = ["optimum-intel[openvino] (>=1.16.0)"] +openvino = ["optimum-intel[openvino] (>=1.18.0)"] quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"] tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] @@ -1971,84 +1971,95 @@ test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameter [[package]] name = "pillow" -version = "10.3.0" +version = "10.4.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.8" files = [ - {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"}, - {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"}, - {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"}, - {file = 
"pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"}, - {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"}, - {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"}, - {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"}, - {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"}, - {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"}, - {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"}, - {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"}, - {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"}, - {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"}, - {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"}, - {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"}, - {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"}, - {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"}, - {file = 
"pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"}, - {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"}, - {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"}, - {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"}, - {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"}, - {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"}, - {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"}, - {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"}, - {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"}, - {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"}, - {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"}, - {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"}, - {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"}, - {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"}, - {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = 
"sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"}, - {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"}, - {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"}, - {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"}, + {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"}, + {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"}, + {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, ] [package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] fpx = ["olefile"] mic = ["olefile"] tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] @@ -2157,52 +2168,42 @@ files = [ [[package]] name = "pyarrow" -version = "16.1.0" +version = "17.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, - {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, - {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, - {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, - {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, - {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, - {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, - {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = 
"pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, ] [package.dependencies] numpy = ">=1.16.6" +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyarrow-hotfix" version = "0.6" @@ -2216,109 +2217,119 @@ files = [ [[package]] name = "pydantic" -version = "2.7.4" +version = "2.8.2" description = "Data validation using Python type hints" optional = true python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, - {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, + {file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"}, + {file = "pydantic-2.8.2.tar.gz", hash = 
"sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.4" -typing-extensions = ">=4.6.1" +pydantic-core = "2.20.1" +typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} [package.extras] email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.4" +version = "2.20.1" description = "Core functionality for Pydantic validation and serialization" optional = true python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, - {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, - {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, - {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, - {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, - {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, - {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, - {file = 
"pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, - {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, - {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, - {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, - {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, - {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, - {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, - {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, - {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, - {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, - {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, - {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, - {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, - {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, - {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, - {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, - {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, - {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, - {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, - {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, - {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, - {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, - {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, - {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, - {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, + {file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"}, + {file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"}, + {file = 
"pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"}, + {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"}, + {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"}, + {file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = "sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"}, + {file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"}, + {file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"}, + {file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"}, + {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"}, + {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"}, + {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"}, + {file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"}, + {file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = "sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"}, + {file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"}, + {file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"}, + {file = 
"pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"}, + {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"}, + {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"}, + {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"}, + {file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"}, + {file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"}, + {file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"}, + {file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"}, + {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"}, + {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"}, + {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"}, + {file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"}, + {file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"}, + {file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"}, + {file = 
"pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"}, + {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"}, + {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"}, + {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"}, + {file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"}, + {file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"}, + {file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"}, + {file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"}, + {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"}, + {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"}, + {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"}, + {file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"}, + {file = 
"pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"}, + {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"}, + {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"}, + {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"}, ] [package.dependencies] @@ -2568,110 +2579,110 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rpds-py" -version = "0.18.1" +version = "0.19.0" description = "Python bindings to Rust's persistent data structures (rpds)" optional = true python-versions = ">=3.8" files = [ - {file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"}, - {file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"}, - {file = 
"rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"}, - {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"}, - {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"}, - {file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"}, - {file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"}, - {file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"}, - {file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"}, - {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"}, - {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"}, - {file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = 
"sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"}, - {file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"}, - {file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"}, - {file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"}, - {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"}, - {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"}, - {file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"}, - {file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"}, - {file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"}, - {file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"}, - {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"}, - {file = 
"rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"}, - {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"}, - {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"}, - {file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"}, - {file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"}, - {file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"}, - {file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"}, - {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"}, - {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"}, - {file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"}, - {file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"}, - {file = 
"rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"}, - {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"}, - {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"}, - {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"}, - {file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"}, + {file = "rpds_py-0.19.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:fb37bd599f031f1a6fb9e58ec62864ccf3ad549cf14bac527dbfa97123edcca4"}, + {file = "rpds_py-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3384d278df99ec2c6acf701d067147320b864ef6727405d6470838476e44d9e8"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e54548e0be3ac117595408fd4ca0ac9278fde89829b0b518be92863b17ff67a2"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8eb488ef928cdbc05a27245e52de73c0d7c72a34240ef4d9893fdf65a8c1a955"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5da93debdfe27b2bfc69eefb592e1831d957b9535e0943a0ee8b97996de21b5"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79e205c70afddd41f6ee79a8656aec738492a550247a7af697d5bd1aee14f766"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:959179efb3e4a27610e8d54d667c02a9feaa86bbabaf63efa7faa4dfa780d4f1"}, + {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a6e605bb9edcf010f54f8b6a590dd23a4b40a8cb141255eec2a03db249bc915b"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9133d75dc119a61d1a0ded38fb9ba40a00ef41697cc07adb6ae098c875195a3f"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd36b712d35e757e28bf2f40a71e8f8a2d43c8b026d881aa0c617b450d6865c9"}, + {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354f3a91718489912f2e0fc331c24eaaf6a4565c080e00fbedb6015857c00582"}, + {file = "rpds_py-0.19.0-cp310-none-win32.whl", hash = "sha256:ebcbf356bf5c51afc3290e491d3722b26aaf5b6af3c1c7f6a1b757828a46e336"}, + {file = "rpds_py-0.19.0-cp310-none-win_amd64.whl", hash = "sha256:75a6076289b2df6c8ecb9d13ff79ae0cad1d5fb40af377a5021016d58cd691ec"}, + {file = "rpds_py-0.19.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6d45080095e585f8c5097897313def60caa2046da202cdb17a01f147fb263b81"}, + {file = "rpds_py-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5c9581019c96f865483d031691a5ff1cc455feb4d84fc6920a5ffc48a794d8a"}, + {file = 
"rpds_py-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1540d807364c84516417115c38f0119dfec5ea5c0dd9a25332dea60b1d26fc4d"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e65489222b410f79711dc3d2d5003d2757e30874096b2008d50329ea4d0f88c"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da6f400eeb8c36f72ef6646ea530d6d175a4f77ff2ed8dfd6352842274c1d8b"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37f46bb11858717e0efa7893c0f7055c43b44c103e40e69442db5061cb26ed34"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:071d4adc734de562bd11d43bd134330fb6249769b2f66b9310dab7460f4bf714"}, + {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9625367c8955e4319049113ea4f8fee0c6c1145192d57946c6ffcd8fe8bf48dd"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e19509145275d46bc4d1e16af0b57a12d227c8253655a46bbd5ec317e941279d"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d438e4c020d8c39961deaf58f6913b1bf8832d9b6f62ec35bd93e97807e9cbc"}, + {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90bf55d9d139e5d127193170f38c584ed3c79e16638890d2e36f23aa1630b952"}, + {file = "rpds_py-0.19.0-cp311-none-win32.whl", hash = "sha256:8d6ad132b1bc13d05ffe5b85e7a01a3998bf3a6302ba594b28d61b8c2cf13aaf"}, + {file = "rpds_py-0.19.0-cp311-none-win_amd64.whl", hash = "sha256:7ec72df7354e6b7f6eb2a17fa6901350018c3a9ad78e48d7b2b54d0412539a67"}, + {file = "rpds_py-0.19.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5095a7c838a8647c32aa37c3a460d2c48debff7fc26e1136aee60100a8cd8f68"}, + {file = "rpds_py-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f2f78ef14077e08856e788fa482107aa602636c16c25bdf59c22ea525a785e9"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7cc6cb44f8636fbf4a934ca72f3e786ba3c9f9ba4f4d74611e7da80684e48d2"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf902878b4af334a09de7a45badbff0389e7cf8dc2e4dcf5f07125d0b7c2656d"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:688aa6b8aa724db1596514751ffb767766e02e5c4a87486ab36b8e1ebc1aedac"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57dbc9167d48e355e2569346b5aa4077f29bf86389c924df25c0a8b9124461fb"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4cf5a9497874822341c2ebe0d5850fed392034caadc0bad134ab6822c0925b"}, + {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8a790d235b9d39c70a466200d506bb33a98e2ee374a9b4eec7a8ac64c2c261fa"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d16089dfa58719c98a1c06f2daceba6d8e3fb9b5d7931af4a990a3c486241cb"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bc9128e74fe94650367fe23f37074f121b9f796cabbd2f928f13e9661837296d"}, + {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8f77e661ffd96ff104bebf7d0f3255b02aa5d5b28326f5408d6284c4a8b3248"}, + {file = "rpds_py-0.19.0-cp312-none-win32.whl", hash = 
"sha256:5f83689a38e76969327e9b682be5521d87a0c9e5a2e187d2bc6be4765f0d4600"}, + {file = "rpds_py-0.19.0-cp312-none-win_amd64.whl", hash = "sha256:06925c50f86da0596b9c3c64c3837b2481337b83ef3519e5db2701df695453a4"}, + {file = "rpds_py-0.19.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:52e466bea6f8f3a44b1234570244b1cff45150f59a4acae3fcc5fd700c2993ca"}, + {file = "rpds_py-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e21cc693045fda7f745c790cb687958161ce172ffe3c5719ca1764e752237d16"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b31f059878eb1f5da8b2fd82480cc18bed8dcd7fb8fe68370e2e6285fa86da6"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dd46f309e953927dd018567d6a9e2fb84783963650171f6c5fe7e5c41fd5666"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34a01a4490e170376cd79258b7f755fa13b1a6c3667e872c8e35051ae857a92b"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcf426a8c38eb57f7bf28932e68425ba86def6e756a5b8cb4731d8e62e4e0223"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68eea5df6347d3f1378ce992d86b2af16ad7ff4dcb4a19ccdc23dea901b87fb"}, + {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dab8d921b55a28287733263c0e4c7db11b3ee22aee158a4de09f13c93283c62d"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6fe87efd7f47266dfc42fe76dae89060038f1d9cb911f89ae7e5084148d1cc08"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:535d4b52524a961d220875688159277f0e9eeeda0ac45e766092bfb54437543f"}, + {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8b1a94b8afc154fbe36978a511a1f155f9bd97664e4f1f7a374d72e180ceb0ae"}, + {file = "rpds_py-0.19.0-cp38-none-win32.whl", hash = "sha256:7c98298a15d6b90c8f6e3caa6457f4f022423caa5fa1a1ca7a5e9e512bdb77a4"}, + {file = "rpds_py-0.19.0-cp38-none-win_amd64.whl", hash = "sha256:b0da31853ab6e58a11db3205729133ce0df26e6804e93079dee095be3d681dc1"}, + {file = "rpds_py-0.19.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5039e3cef7b3e7a060de468a4a60a60a1f31786da94c6cb054e7a3c75906111c"}, + {file = "rpds_py-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab1932ca6cb8c7499a4d87cb21ccc0d3326f172cfb6a64021a889b591bb3045c"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2afd2164a1e85226fcb6a1da77a5c8896c18bfe08e82e8ceced5181c42d2179"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1c30841f5040de47a0046c243fc1b44ddc87d1b12435a43b8edff7e7cb1e0d0"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f757f359f30ec7dcebca662a6bd46d1098f8b9fb1fcd661a9e13f2e8ce343ba1"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15e65395a59d2e0e96caf8ee5389ffb4604e980479c32742936ddd7ade914b22"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb0f6eb3a320f24b94d177e62f4074ff438f2ad9d27e75a46221904ef21a7b05"}, + {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b228e693a2559888790936e20f5f88b6e9f8162c681830eda303bad7517b4d5a"}, + {file = 
"rpds_py-0.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2575efaa5d949c9f4e2cdbe7d805d02122c16065bfb8d95c129372d65a291a0b"}, + {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:5c872814b77a4e84afa293a1bee08c14daed1068b2bb1cc312edbf020bbbca2b"}, + {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:850720e1b383df199b8433a20e02b25b72f0fded28bc03c5bd79e2ce7ef050be"}, + {file = "rpds_py-0.19.0-cp39-none-win32.whl", hash = "sha256:ce84a7efa5af9f54c0aa7692c45861c1667080814286cacb9958c07fc50294fb"}, + {file = "rpds_py-0.19.0-cp39-none-win_amd64.whl", hash = "sha256:1c26da90b8d06227d7769f34915913911222d24ce08c0ab2d60b354e2d9c7aff"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:75969cf900d7be665ccb1622a9aba225cf386bbc9c3bcfeeab9f62b5048f4a07"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8445f23f13339da640d1be8e44e5baf4af97e396882ebbf1692aecd67f67c479"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5a7c1062ef8aea3eda149f08120f10795835fc1c8bc6ad948fb9652a113ca55"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:462b0c18fbb48fdbf980914a02ee38c423a25fcc4cf40f66bacc95a2d2d73bc8"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3208f9aea18991ac7f2b39721e947bbd752a1abbe79ad90d9b6a84a74d44409b"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3444fe52b82f122d8a99bf66777aed6b858d392b12f4c317da19f8234db4533"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb4bac7185a9f0168d38c01d7a00addece9822a52870eee26b8d5b61409213"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b130bd4163c93798a6b9bb96be64a7c43e1cec81126ffa7ffaa106e1fc5cef5"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:a707b158b4410aefb6b054715545bbb21aaa5d5d0080217290131c49c2124a6e"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dc9ac4659456bde7c567107556ab065801622396b435a3ff213daef27b495388"}, + {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:81ea573aa46d3b6b3d890cd3c0ad82105985e6058a4baed03cf92518081eec8c"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f148c3f47f7f29a79c38cc5d020edcb5ca780020fab94dbc21f9af95c463581"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0906357f90784a66e89ae3eadc2654f36c580a7d65cf63e6a616e4aec3a81be"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f629ecc2db6a4736b5ba95a8347b0089240d69ad14ac364f557d52ad68cf94b0"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6feacd1d178c30e5bc37184526e56740342fd2aa6371a28367bad7908d454fc"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b6068ee374fdfab63689be0963333aa83b0815ead5d8648389a8ded593378"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d57546bad81e0da13263e4c9ce30e96dcbe720dbff5ada08d2600a3502e526"}, + {file = 
"rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b6683a37338818646af718c9ca2a07f89787551057fae57c4ec0446dc6224b"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e8481b946792415adc07410420d6fc65a352b45d347b78fec45d8f8f0d7496f0"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bec35eb20792ea64c3c57891bc3ca0bedb2884fbac2c8249d9b731447ecde4fa"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:aa5476c3e3a402c37779e95f7b4048db2cb5b0ed0b9d006983965e93f40fe05a"}, + {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:19d02c45f2507b489fd4df7b827940f1420480b3e2e471e952af4d44a1ea8e34"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3e2fd14c5d49ee1da322672375963f19f32b3d5953f0615b175ff7b9d38daed"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:93a91c2640645303e874eada51f4f33351b84b351a689d470f8108d0e0694210"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b9fc03bf76a94065299d4a2ecd8dfbae4ae8e2e8098bbfa6ab6413ca267709"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a4b07cdf3f84310c08c1de2c12ddadbb7a77568bcb16e95489f9c81074322ed"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba0ed0dc6763d8bd6e5de5cf0d746d28e706a10b615ea382ac0ab17bb7388633"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:474bc83233abdcf2124ed3f66230a1c8435896046caa4b0b5ab6013c640803cc"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329c719d31362355a96b435f4653e3b4b061fcc9eba9f91dd40804ca637d914e"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef9101f3f7b59043a34f1dccbb385ca760467590951952d6701df0da9893ca0c"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0121803b0f424ee2109d6e1f27db45b166ebaa4b32ff47d6aa225642636cd834"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8344127403dea42f5970adccf6c5957a71a47f522171fafaf4c6ddb41b61703a"}, + {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:443cec402ddd650bb2b885113e1dcedb22b1175c6be223b14246a714b61cd521"}, + {file = "rpds_py-0.19.0.tar.gz", hash = "sha256:4fdc9afadbeb393b4bbbad75481e0ea78e4469f2e1d713a90811700830b553a9"}, ] [[package]] @@ -2894,18 +2905,18 @@ files = [ [[package]] name = "setuptools" -version = "70.1.0" +version = "70.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"}, - {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"}, + {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"}, + {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", 
"jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -2920,17 +2931,20 @@ files = [ [[package]] name = "sympy" -version = "1.12.1" +version = "1.13.0" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" files = [ - {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"}, - {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"}, + {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"}, + {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"}, ] [package.dependencies] -mpmath = ">=1.1.0,<1.4.0" +mpmath = ">=1.1.0,<1.4" + +[package.extras] +dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] [[package]] name = "tbb" From 588a01455119f81d4d68b5f64053dbc071206624 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Sun, 28 Jul 2024 09:05:49 +0000 Subject: [PATCH 42/46] Enable llava-next Signed-off-by: yuanwu --- router/client/src/client.rs | 267 +++-- router/client/src/sharded_client.rs | 2 + router/src/main.rs | 1 + .../text_generation_server/models/__init__.py | 18 + .../models/causal_lm.py | 1 + .../models/custom_modeling/llava_next.py | 417 ++++---- .../models/flash_causal_lm.py | 117 ++- .../models/vlm_causal_lm.py | 928 +++++++++++++++--- server/text_generation_server/server.py | 7 +- 9 files changed, 1278 insertions(+), 480 deletions(-) diff --git a/router/client/src/client.rs b/router/client/src/client.rs index 3cda2f4b..cff1fac8 100644 --- a/router/client/src/client.rs +++ b/router/client/src/client.rs @@ -110,6 +110,7 @@ impl Client { max_prefill_tokens: u32, max_total_tokens: u32, max_batch_size: Option, + model_id: 
&str ) -> Result> { let warmup_enabled: bool = env::var("WARMUP_ENABLED").ok().map_or(true, |value| value.to_lowercase() == "true"); if !warmup_enabled { @@ -152,25 +153,76 @@ impl Client { let mut batch_counter: u64 = 0; let mut request_counter: u64 = 0; - for shape in shapes.iter() { - let (batch_size, seq_length) = shape; - let mut batches: Vec = vec![ - self.create_warmup_batch( - *shape, - &mut batch_counter, - &mut request_counter, - max_input_length, - max_total_tokens, - seq_bucket_size, - false, - None, - ) - ]; - // if possible, create second batch in order to trigger concatenate operation - if *batch_size < max_decode_batch_size { - batches.push( + if model_id.contains("llava") { + let mut n_tokens = 0; + let mut requests = Vec::new(); + // Create requests + while n_tokens < max_prefill_tokens { + let truncate = cmp::min(max_input_length, max_prefill_tokens - n_tokens); + + let mut inputs = String::new(); + inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)"); + inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize)); + + requests.push(Request { + id: 0, + // We truncate the input on the server side to be sure that it has the correct size + inputs, + truncate, + // Set sampling parameters to also take these ops into account in the max memory + parameters: Some(NextTokenChooserParameters { + temperature: 0.9, + top_k: 10, + top_p: 0.9, + typical_p: 0.9, + do_sample: false, + seed: 0, + repetition_penalty: 1.2, + frequency_penalty: 0.1, + watermark: true, + grammar: String::new(), + grammar_type: GrammarType::None as i32, + }), + stopping_parameters: Some(StoppingCriteriaParameters { + max_new_tokens: max_total_tokens - truncate, + stop_sequences: vec![], + ignore_eos_token: true, + }), + prefill_logprobs: true, + top_n_tokens: 20, + }); + n_tokens += max_input_length; + + // Check max_batch_size + if Some(requests.len()) == max_batch_size { + break; + } + } + + let mut batches = Vec::new(); + batches.push(Batch { + id: 0, + size: requests.len() as u32, + requests, + max_tokens: 0, + }); + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }) + .inject_context(); + let response = self.stub.warmup(request).await?.into_inner(); + Ok(response.max_supported_total_tokens) + } + else { + for shape in shapes.iter() { + let (batch_size, seq_length) = shape; + let mut batches: Vec = vec![ self.create_warmup_batch( - (1, *seq_length), + *shape, &mut batch_counter, &mut request_counter, max_input_length, @@ -179,56 +231,45 @@ impl Client { false, None, ) - ); + ]; + // if possible, create second batch in order to trigger concatenate operation + if *batch_size < max_decode_batch_size { + batches.push( + self.create_warmup_batch( + (1, *seq_length), + &mut batch_counter, + &mut 
request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + false, + None, + ) + ); + } + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }).inject_context(); + let _response = self.stub.warmup(request).await?.into_inner(); } - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); - } + // send batches to warmup all possible decode shapes + if decode_batch_sizes.len() > 1 { + let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size { + decode_bucket_size + } else { + decode_bucket_size.div_ceil(max_prefill_batch_size) + }; + let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket; - // send batches to warmup all possible decode shapes - if decode_batch_sizes.len() > 1 { - let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size { - decode_bucket_size - } else { - decode_bucket_size.div_ceil(max_prefill_batch_size) - }; - let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket; - - let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size); - let mut batches: Vec = vec![ - self.create_warmup_batch( - (requests_send, seq_bucket_size), - &mut batch_counter, - &mut request_counter, - max_input_length, - max_total_tokens, - seq_bucket_size, - false, - Some(max_new_tokens), - ) - ]; - - let get_current_decode_batch_size = |num: u32| -> u32 { - decode_batch_sizes.iter() - .filter(|&&x| x >= num) - .min() - .copied() - .unwrap() - }; - - let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send); - while current_decode_batch_size < max_decode_batch_size { - let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send; - let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size); - batches.push( + let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size); + let mut batches: Vec = vec![ self.create_warmup_batch( - (num_requests, seq_bucket_size), + (requests_send, seq_bucket_size), &mut batch_counter, &mut request_counter, max_input_length, @@ -237,48 +278,74 @@ impl Client { false, Some(max_new_tokens), ) - ); + ]; - requests_send += num_requests; - current_decode_batch_size = get_current_decode_batch_size(requests_send); + let get_current_decode_batch_size = |num: u32| -> u32 { + decode_batch_sizes.iter() + .filter(|&&x| x >= num) + .min() + .copied() + .unwrap() + }; + + let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send); + while current_decode_batch_size < max_decode_batch_size { + let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send; + let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size); + batches.push( + self.create_warmup_batch( + (num_requests, seq_bucket_size), + &mut batch_counter, + &mut request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + false, + Some(max_new_tokens), + ) + ); + + requests_send += num_requests; + current_decode_batch_size = get_current_decode_batch_size(requests_send); + } + + let request = tonic::Request::new(WarmupRequest { + batches, + max_input_length, + max_prefill_tokens, + max_total_tokens, + }).inject_context(); + let _response = 
self.stub.warmup(request).await?.into_inner(); } - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); - } - - // send batches with default params to warm up Greedy search - let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len()); - for batch_size in &prefill_batch_sizes { - greedy_shapes.push((*batch_size, seq_bucket_size.clone())); - } - for greedy_shape in greedy_shapes.iter() { - let batches: Vec = vec![ - self.create_warmup_batch( - *greedy_shape, - &mut batch_counter, - &mut request_counter, + // send batches with default params to warm up Greedy search + let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len()); + for batch_size in &prefill_batch_sizes { + greedy_shapes.push((*batch_size, seq_bucket_size.clone())); + } + for greedy_shape in greedy_shapes.iter() { + let batches: Vec = vec![ + self.create_warmup_batch( + *greedy_shape, + &mut batch_counter, + &mut request_counter, + max_input_length, + max_total_tokens, + seq_bucket_size, + true, + None, + ) + ]; + let request = tonic::Request::new(WarmupRequest { + batches, max_input_length, + max_prefill_tokens, max_total_tokens, - seq_bucket_size, - true, - None, - ) - ]; - let request = tonic::Request::new(WarmupRequest { - batches, - max_input_length, - max_prefill_tokens, - max_total_tokens, - }).inject_context(); - let _response = self.stub.warmup(request).await?.into_inner(); + }).inject_context(); + let _response = self.stub.warmup(request).await?.into_inner(); + } + Ok(None) // No support for maximum total tokens } - Ok(None) // No support for maximum total tokens } #[instrument(skip_all)] diff --git a/router/client/src/sharded_client.rs b/router/client/src/sharded_client.rs index e2c800dd..fdd84035 100644 --- a/router/client/src/sharded_client.rs +++ b/router/client/src/sharded_client.rs @@ -100,6 +100,7 @@ impl ShardedClient { max_prefill_tokens: u32, max_total_tokens: u32, max_batch_size: Option, + model_id: &str, ) -> Result> { let futures: Vec<_> = self .clients @@ -110,6 +111,7 @@ impl ShardedClient { max_prefill_tokens, max_total_tokens, max_batch_size, + model_id )) }) .collect(); diff --git a/router/src/main.rs b/router/src/main.rs index c3b8d047..4f9f0f73 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -349,6 +349,7 @@ async fn main() -> Result<(), RouterError> { max_batch_prefill_tokens, max_total_tokens as u32, max_batch_size, + &model_info.model_id ) .await .map_err(RouterError::Warmup)? 
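For reference, the llava warmup branch added to client.rs above builds its dummy prompt by inlining a small base64-encoded image with the same markdown image syntax that the server-side IMAGES regex later splits out, followed by repeated filler text up to max_input_length. Below is a minimal Python sketch of that prompt format; the generated image and the "_test " filler are illustrative stand-ins, not taken verbatim from the patch.

import base64
import io

from PIL import Image


def build_llava_warmup_prompt(max_input_length: int) -> str:
    # Encode a tiny dummy image as an inline data URI, mirroring the hard-coded
    # base64 payload in the Rust warmup code above.
    image = Image.new("RGB", (20, 20))
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    # Markdown image syntax followed by repeated placeholder tokens, so the warmup
    # request exercises both the vision path and a full-length text sequence.
    return f"![](data:image/png;base64,{encoded})" + "_test " * max_input_length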
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 3d3d3e1e..569b204f 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -16,6 +16,12 @@ from text_generation_server.models.model import Model from text_generation_server.models.causal_lm import CausalLM from text_generation_server.models.bloom import BLOOM from text_generation_server.models.starcoder import StarCoder +from text_generation_server.models.vlm_causal_lm import VlmCausalLM +from text_generation_server.models.custom_modeling.llava_next import ( + LlavaNextForConditionalGeneration, +) + + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi @@ -159,6 +165,18 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) + logger.info(f"model_type = {model_type}") + if model_type == "llava_next": + logger.info(f"################model_type = {model_type}") + return VlmCausalLM( + model_class=LlavaNextForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=None, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: return CausalLM( diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 8ec6aca8..4a48ad46 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -369,6 +369,7 @@ class CausalLMBatch(Batch): input_lengths = [b.input_length for b in batches] max_input_length = max(input_lengths) offsets = [max_input_length - b.input_length for b in batches] + cur_padding = [b.right_padding for b in batches] # For prefill there is a space allocated only for first token # Need to add padding to the max total tokens before first decode diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index de9673aa..4268cc9b 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -21,17 +21,12 @@ import torch.utils.checkpoint from torch import nn from transformers.activations import ACT2FN +from transformers.models.llava_next.modeling_llava_next import ( + unpad_image, +) +from optimum.habana.transformers.models import GaudiLlavaNextForConditionalGeneration from transformers.image_processing_utils import select_best_resolution - -from text_generation_server.models.custom_modeling.vlm import ( - load_text_model, - load_vision_model, -) -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelRowLinear, -) - +from loguru import logger def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): """ @@ -56,100 +51,13 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): return height // patch_size, width // patch_size -def unpad_image(tensor, original_size): - """ - Unpads a PyTorch tensor of a padded and resized image. - - Args: - tensor (`torch.Tensor`): - The image tensor, assumed to be of shape (num_channels, height, width). - original_size (`tuple`): - The original size of the image (height, width). - - Returns: - `torch.Tensor`: The unpadded image tensor. 
- """ - original_height, original_width = original_size - current_height, current_width = tensor.shape[1:] - - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height - - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding : current_height - padding, :] - else: - scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding : current_width - padding] - - return unpadded_tensor - - -# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext -class LlavaNextMultiModalProjector(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - - self.linear_1 = TensorParallelColumnLinear.load( - prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True - ) - self.act = ACT2FN[config.projector_hidden_act] - self.linear_2 = TensorParallelRowLinear.load( - prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True - ) - - def forward(self, image_features): - hidden_states = self.linear_1(image_features) - hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) - return hidden_states - - -class LlavaNextForConditionalGeneration(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - config.vision_config.quantize = config.quantize - vision_config = config.vision_config - # Instead of selecting in hidden_states[-2]. - # Instead compute only the n -2 + 1 layers and don't pool - if config.vision_feature_layer < 0: - vision_config.num_hidden_layers += config.vision_feature_layer + 1 - else: - vision_config.num_hidden_layers = config.vision_feature_layer + 1 - self.vision_tower = load_vision_model( - prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", - config=config.vision_config, - weights=weights, - ) - - self.multi_modal_projector = LlavaNextMultiModalProjector( - prefix="multi_modal_projector", config=config, weights=weights - ) - - self.image_newline = weights.get_tensor("image_newline") - - self.vocab_size = config.text_config.vocab_size - self.config = config - config.text_config.quantize = config.quantize - config.text_config.speculator = config.speculator - self.language_model = load_text_model( - prefix="language_model" if not prefix else f"{prefix}.language_model", - config=config.text_config, - weights=weights, - ) - self.pad_token_id = ( - config.pad_token_id if config.pad_token_id is not None else -1 - ) - +class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): + def _merge_input_ids_with_image_features( self, - input_ids: torch.Tensor, inputs_embeds: torch.Tensor, image_features: torch.Tensor, + input_ids: torch.Tensor, ): """In place merges in vision_embeddings with inputs_embeds.""" mask = input_ids == self.config.image_token_index @@ -164,120 +72,215 @@ class LlavaNextForConditionalGeneration(nn.Module): def forward( self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - cu_seqlen_prefill: Optional[torch.Tensor], - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, - slots: torch.Tensor, - input_lengths: torch.Tensor, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], - lm_head_indices: 
Optional[torch.Tensor] = None, + input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, - # Unused for this model - pixel_attention_mask=None, image_sizes: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, ): - inputs_embeds = self.language_model.embed_tokens(input_ids) - if pixel_values is not None and len(pixel_values) > 0: - # num_special_image_tokens = (input_ids == self.config.image_token_index).sum() - # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid" - # 1. Extract the input embeddings - # 2. Merge text and images - num_images, num_patches, channels, height, width = pixel_values.shape - pixel_values = pixel_values.view( - num_images * num_patches, channels, height, width + if token_idx is not None: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - image_features = self.vision_tower(pixel_values) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) - # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer] - # Already done within the clip model - selected_image_feature = image_features.last_hidden_state + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + ) - if self.config.vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif self.config.vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature + logits = outputs[0] + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return outputs + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + **kwargs, + ): + """ + Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L635 + The only differences are: + - add new args token_idx + - add the process of merging images into inputs_embeds + """ + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, + ) else: - raise RuntimeError( - f"Strategy 
`{self.config.vision_feature_select_strategy}` is not supported/valid." + + position_ids = kwargs.get("position_ids", None) + labels = kwargs.get("labels", None) + if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1: + vision_feature_select_strategy = kwargs.get("vision_feature_select_strategy", None) + vision_feature_layer = kwargs.get("vision_feature_layer", None) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + + # 1. Extract the input embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + # 2. Merge text and images + batch_size, num_patches, num_channels, height, width = pixel_values.shape + reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width) + image_features = self.vision_tower( + reshaped_pixel_values, output_hidden_states=True + ) + + selected_image_feature = image_features.hidden_states[vision_feature_layer] + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + + image_features = self.multi_modal_projector(selected_image_feature) + + # split up image_features for each of the individual images + # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) + # if we assume each image has 5 image features (base image + 4 patches) + split_sizes = [image.shape[0] for image in pixel_values] + image_features = torch.split(image_features, split_sizes, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size + + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + + if height * width != base_image_feature.shape[0]: + raise ValueError("The number of patches is not consistent with the image size.") + + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx].tolist(), + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat( + ( + image_feature, + self.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + image_feature = torch.cat((image_feature, self.image_newline[None]), dim=0) + new_image_features.append(image_feature) + image_features = torch.stack(new_image_features, dim=0) + inputs_embeds = self._merge_input_ids_with_image_features(inputs_embeds, image_features, input_ids) + self.image_offset = image_features.shape[1] - 1 # image_token has occupied 1 token position. 
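# The in-place merge a few lines above is worth spelling out: every position of
# input_ids that holds the image placeholder token is overwritten with one row of
# projected image features, and image_offset records how many extra positions the
# image expanded into beyond the single placeholder it replaces. A minimal sketch
# of that mechanism follows; the token id and tensor shapes are illustrative
# assumptions, not values taken from this patch.
def _merge_image_features_sketch():
    import torch

    hidden_size, image_token_id = 8, 32000            # assumed values
    input_ids = torch.tensor([[1, 32000, 32000, 5]])  # two placeholder positions
    inputs_embeds = torch.zeros(1, 4, hidden_size)    # text embeddings
    image_features = torch.randn(1, 2, hidden_size)   # one image -> two feature rows
    mask = input_ids == image_token_id
    inputs_embeds[mask] = image_features.view(-1, hidden_size)
    # positions 1 and 2 now hold image features; text positions 0 and 3 are untouched
    return inputs_embeds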
+ # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of + # generation with cache + elif past_key_values is not None: + seq_len = input_ids.shape[1] + pad_len = seq_len - token_idx.item() + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + past_length = first_layer_past_key_value.shape[-1] + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = extended_attention_mask + attention_mask[:, -pad_len:] = 0 + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + } ) - image_features = self.multi_modal_projector(selected_image_feature) - - # split up image_features for each of the individual images - # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) - # if we assume each image has 5 image features (base image + 4 patches) - split_sizes = [num_patches] * num_images - image_features = torch.split(image_features, split_sizes, dim=0) - - # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - height = width = ( - self.config.vision_config.image_size - // self.config.vision_config.patch_size - ) - - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - - if height * width != base_image_feature.shape[0]: - raise ValueError( - "The number of patches is not consistent with the image size." 
- ) - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_sizes[image_idx], - self.config.image_grid_pinpoints, - self.config.vision_config.image_size, - ) - image_feature = image_feature.view( - num_patch_height, num_patch_width, height, width, -1 - ) - image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image(image_feature, image_sizes[image_idx]) - image_feature = torch.cat( - ( - image_feature, - self.image_newline[:, None, None].expand( - *image_feature.shape[:-1], 1 - ), - ), - dim=-1, - ) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat( - (base_image_feature, image_feature), dim=0 - ) - else: - image_feature = image_feature[0] - image_feature = torch.cat( - (image_feature, self.image_newline[None]), dim=0 - ) - new_image_features.append(image_feature) - image_features = torch.stack(new_image_features, dim=0) - - inputs_embeds = self._merge_input_ids_with_image_features( - input_ids, inputs_embeds, image_features - ) - - hidden_states = self.language_model.model( - inputs_embeds=inputs_embeds, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - input_lengths=input_lengths, - max_s=max_s, - true_max_s=max_s, - prefill_cache_indices=None, - ) - if lm_head_indices is not None: - hidden_states = hidden_states[lm_head_indices] - logits, speculative_logits = self.language_model.lm_head(hidden_states) - return logits, speculative_logits + return model_inputs \ No newline at end of file diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 86d9b4c8..72ceca6b 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -10,7 +10,12 @@ import numpy as np from loguru import logger from dataclasses import dataclass from opentelemetry import trace -from transformers import PreTrainedTokenizerBase +from transformers import ( + PreTrainedTokenizerBase, + AutoConfig, + AutoTokenizer, + GenerationConfig, +) from typing import Optional, Tuple, List, Type, Dict from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -19,6 +24,11 @@ from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.dist import RANK from text_generation_server.utils.speculate import get_speculate +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) from text_generation_server.models.types import ( Batch, Tokens, @@ -686,20 +696,97 @@ class FlashCausalLMBatch(Batch): class FlashCausalLM(Model): def __init__( self, - model: torch.nn.Module, - tokenizer: PreTrainedTokenizerBase, - num_layers: int, - num_kv_heads: int, - head_size: int, - dtype: torch.dtype, - device: torch.device, - rank: int = 0, - world_size: int = 1, - sliding_window: Optional[int] = None, + model_id: str, + model_class, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + lora_adapter_ids: Optional[list] = [], + tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer, + config_class: PreTrainedTokenizerBase = AutoConfig, + default_dtype=torch.bfloat16, + aliases=None, + # Used for Santacoder override 
of config + num_kv_heads: Optional[int] = None, + # Deepseek V2 uses different QK and V dims. + head_size: Optional[int] = None, + skip_special_tokens: bool = True, ): - self.num_layers = num_layers - self.num_kv_heads = num_kv_heads - self.head_size = head_size + + # Create model + world_size = int(os.getenv("WORLD_SIZE", "1")) + rank = int(os.getenv("RANK", "0")) + dtype = torch.bfloat16 if dtype is None else dtype + device = torch.device("hpu") + + tokenizer = tokenizer_class.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + try: + generation_config = GenerationConfig.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + if isinstance(generation_config.eos_token_id, (list, set)): + # TODO Huge hack + tokenizer._eos_token_ids = set(generation_config.eos_token_id) + except Exception: + pass + + config = config_class.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + config.quantize = quantize + config.speculator = speculator + + + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights(filenames, device, dtype) + + + + prefix = "" + model = model_class(prefix, config, weights) + + # VLM models define the config we care about in their text_config + text_config = getattr(config, "text_config", None) + if text_config is not None: + config = text_config + + + self.num_layers = config.num_hidden_layers + # Validation is done in the model itself + if num_kv_heads is None: + num_kv_heads = getattr(config, "num_key_value_heads", None) + # GPT-2 workaround + if num_kv_heads is None: + num_kv_heads = getattr(config, "n_head", None) + if num_kv_heads is None: + raise ValueError("Cannot get the number of key/value heads") + self.num_kv_heads = num_kv_heads ( + num_kv_heads // self.process_group.size() + if num_kv_heads > 1 + else num_kv_heads + ) + assert self.num_kv_heads > 0 + + if head_size is None: + # Some models use GQA and different sizes for o_proj + # and q_proj, that allows for that. 
+ if hasattr(config, "head_dim"): + self.head_size = config.head_dim + else: + self.head_size = config.hidden_size // config.num_attention_heads + else: + self.head_size = head_size + + self.cuda_graphs = {} + self.kv_cache = [] self.cuda_graphs = {} @@ -711,7 +798,7 @@ class FlashCausalLM(Model): device=device, rank=rank, world_size=world_size, - sliding_window=sliding_window, + sliding_window=None, ) @property diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index f0db89b2..7412092a 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -1,29 +1,87 @@ import re import torch +import os +import time import math from PIL import Image from io import BytesIO import base64 - +import numpy from opentelemetry import trace +from loguru import logger from typing import Optional, Tuple, List, Type, Dict - +import tempfile +import copy +from text_generation_server.models import Model from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import select_best_resolution +from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.pb import generate_pb2 -from text_generation_server.models.flash_mistral import ( - BaseFlashMistral, - FlashMistralBatch, +from text_generation_server.models.causal_lm import ( + CausalLMBatch, + CausalLMRequest, + round_up, + remove_kv_cache_from_output ) -from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch +from transformers.models.llava_next.modeling_llava_next import ( + get_anyres_image_grid_shape, +) + +from transformers import AutoProcessor +import text_generation_server.habana_quantization_env as hq_env +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi from text_generation_server.models.cache_manager import ( get_cache_manager, ) +from text_generation_server.utils import ( + HeterogeneousNextTokenChooser, + StoppingCriteria, + make_tokenizer_optional, + is_tokenizer_transparent, + pad_next_token_chooser_parameters, +) +import habana_frameworks.torch as htorch +from optimum.habana.utils import HabanaProfile +from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES + +from transformers import ( + AutoTokenizer, + AutoModel, + PreTrainedTokenizerBase, + AutoConfig, +) +from optimum.habana.checkpoint_utils import ( + get_repo_root, + model_on_meta, + write_checkpoints_json, +) + +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.utils.debug import dbg_trace tracer = trace.get_tracer(__name__) IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") +BASE_IMAGE_TOKENS = int(os.environ.get('BASE_IMAGE_TOKENS', 2048)) +MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 8192)) +BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 1)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128)) +PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 1)) +CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +LAZY_MODE = int(os.environ.get('PT_HPU_LAZY_MODE', 1)) +PREFILL_GRAPH_NUM = int(os.environ.get('PREFILL_GRAPH_NUM', 16)) +os.environ['MAX_TOTAL_TOKENS'] = str(MAX_TOTAL_TOKENS) +os.environ['BATCH_BUCKET_SIZE'] = str(BATCH_BUCKET_SIZE) 
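# The bucketing constants above drive the padding scheme used throughout this model:
# prompts are left-padded up to a sequence-length bucket and small batches are padded
# with dummy requests up to a batch bucket, so the HPU sees a small, fixed set of
# shapes and can reuse compiled graphs. A minimal sketch of that rounding follows;
# the helper mirrors the round_up imported from causal_lm above, and the example
# bucket sizes are illustrative, not defaults from this patch.
def _round_up_sketch(value: int, multiple: int) -> int:
    # Smallest multiple of `multiple` that is >= value.
    return (value + multiple - 1) // multiple * multiple

# _round_up_sketch(300, 128) == 384  -> a 300-token prompt lands in the 384 bucket
# _round_up_sketch(3, 4)     == 4    -> a 3-request prefill batch is padded to 4 requests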
+os.environ['PAD_SEQUENCE_TO_MULTIPLE_OF'] = str(PAD_SEQUENCE_TO_MULTIPLE_OF) +os.environ['PREFILL_BATCH_BUCKET_SIZE'] = str(PREFILL_BATCH_BUCKET_SIZE) +os.environ['LAZY_MODE'] = str(LAZY_MODE) def split(string) -> List[Dict[str, str]]: parts = [] @@ -41,30 +99,6 @@ def split(string) -> List[Dict[str, str]]: return parts - -def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): - """ - Calculate the shape of the image patch grid after the preprocessing for images of any resolution. - - Args: - image_size (`tuple`): - The size of the input image in the format (width, height). - grid_pinpoints (`List`): - A list containing possible resolutions. Each item in the list should be a tuple or list - of the form `(height, width)`. - patch_size (`int`): - The size of each image patch. - - Returns: - tuple: The shape of the image patch grid in the format (width, height). - """ - if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") - - height, width = select_best_resolution(image_size, grid_pinpoints) - return height // patch_size, width // patch_size - - def image_text_replacement(image_input, config, image_id) -> str: if config.model_type == "idefics2": # TODO technically depends on image splitting which is not implemented. @@ -77,9 +111,7 @@ def image_text_replacement(image_input, config, image_id) -> str: elif config.model_type == "llava_next": height, width = image_input["image_sizes"][image_id] num_features = get_number_of_features(height, width, config) - from loguru import logger - logger.info(f"Found {num_features} in image of resolution {height}x{width}") return "" * num_features elif config.model_type == "paligemma": @@ -125,6 +157,7 @@ def get_number_of_features(height: int, width: int, config) -> int: image_grid_pinpoints, image_size, ) + unpadded_features, newline_features = get_unpadded_features( height, width, npatches, num_patch_height, num_patch_width ) @@ -140,27 +173,100 @@ def load_data_uri(image_uri: str) -> Image.Image: return image -class VlmCausalLMBatch(FlashMistralBatch): +class VlmCausalLMBatch(CausalLMBatch): pixel_values: Optional[List[torch.Tensor]] pixel_attention_mask: Optional[List[torch.Tensor]] image_sizes: Optional[List[Tuple[int, int]]] @classmethod - @tracer.start_as_current_span("concatenate") - def concatenate(cls, batches): - batch = super(VlmCausalLMBatch, cls).concatenate(batches) - batch.pixel_values = None - batch.pixel_attention_mask = None - batch.image_sizes = None - return batch + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + ) -> "VlmCausalLMBatch": + + dbg_trace('FROM_PB', f'num_reqs:{len(pb.requests)}') + requests = [CausalLMRequest.from_pb(idx, req, tokenizer) for idx, req in enumerate(pb.requests)] - @tracer.start_as_current_span("filter") - def filter(self, request_ids: List[int]): - batch = super().filter(request_ids) - batch.pixel_values = None - batch.pixel_attention_mask = None - batch.image_sizes = None - return batch + max_input_length = max(r.data.truncate for r in requests) + max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests) + # TODO: Add support for sparse batches + top_n_tokens = [r.top_n_tokens for r in pb.requests] + top_n_tokens_tensor = torch.tensor(top_n_tokens, device=device, dtype=torch.int64) + + # TODO: by tokenizing all inputs at once we loose information on actual input lengths + # this means that we cannot 
shift inputs to the left after a long input sequence + # was filtered out + new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) + parameters = [r.parameters for r in pb.requests] + # append the dummy parameters for dummy request + parameters = pad_next_token_chooser_parameters(parameters, new_bs) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + pb=parameters, + dtype=dtype, + device=device, + tokenizer=tokenizer, + quantization_enabled=hq_env.is_quantization_enabled, + ) + tokenized_inputs = batch_tokenized_inputs + input_len = tokenized_inputs["input_ids"].shape[1] + + bucket_size = max_input_length + left_padding = max_input_length - input_len + if input_len < max_input_length and PAD_SEQUENCE_TO_MULTIPLE_OF != 0: + assert PAD_SEQUENCE_TO_MULTIPLE_OF <= max_input_length, "PAD_SEQUENCE_TO_MULTIPLE_OF cannot be higher than max_input_length" + rounded_seq_len = round_up(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF) + if rounded_seq_len <= max_input_length: + bucket_size = rounded_seq_len - 1 + else: + bucket_size = max_input_length - 1 + left_padding = bucket_size - input_len + + input_ids = tokenized_inputs["input_ids"] + attention_mask = tokenized_inputs["attention_mask"] + # Allocate space for first token + if left_padding > 0: + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + all_input_ids = torch.nn.functional.pad( + input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id + ).T.split(1, dim=1) + + # New input length after left padding + input_len = bucket_size + for r in requests: + r.input_length = input_len + r.prefix_offset = input_len - 5 + r.read_offset = input_len + r.all_input_ids = all_input_ids[r.idx] + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + + htorch.core.mark_step() + + return cls( + batch_id=pb.id, + requests=requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_len, + ) @classmethod def batch_tokenized_inputs(cls, requests, tokenizer, processor, config): @@ -192,16 +298,26 @@ class VlmCausalLMBatch(FlashMistralBatch): image_inputs.append(image_input) else: raise RuntimeError(f"Invalid chunk type {chunk['type']}") - batch_inputs.append(full_text) max_truncation = max(max_truncation, r.truncate) + new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) + missing_inputs = new_bs - len(requests) + dummy_images = [] + dummy_inputs = [] + if len(batch_inputs) > 0 and len(image_inputs) > 0: + dummy_inputs = [batch_inputs[0]] * missing_inputs + dummy_images = [image_inputs[0]] * missing_inputs + + image_inputs += dummy_images batch_tokenized_inputs = tokenizer( - batch_inputs, + batch_inputs + dummy_inputs, truncation=True, max_length=max_truncation, - add_special_tokens=not config.model_type == "paligemma", - )["input_ids"] + return_tensors="pt", + padding="longest", + return_token_type_ids=False, + ) if image_inputs: image_input = image_inputs[0] new_image_inputs = { @@ -255,126 +371,626 @@ class VlmCausalLMBatch(FlashMistralBatch): return batch -class VlmCausalLM(BaseFlashMistral): +class VlmCausalLM(Model): + def __init__( + self, + 
model_class, + model_id: str, + *, + processor_class=AutoProcessor, + processor_kwargs=None, + batch_class=VlmCausalLMBatch, + revision, + dtype, + trust_remote_code: bool, + **kwargs, + ): + adapt_transformers_to_gaudi() + if processor_kwargs is None: + processor_kwargs = {} + self.processor = processor_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + **processor_kwargs, + ) + self.batch_class = batch_class + self.prev_bs = 0 + + # Create tokenizer + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + # Create model + world_size = int(os.getenv("WORLD_SIZE", "1")) + rank = int(os.getenv("RANK", "0")) + dtype = torch.bfloat16 if dtype is None else dtype + device = torch.device("hpu") + + if hq_env.is_quantization_enabled: + htorch.core.hpu_set_env() + + if world_size > 1: + model = self.get_deepspeed_model( + model_class, model_id, dtype, revision + ) + model = self.prepare_model_for_quantization(model) + else: + get_repo_root(model_id) + + # Check support for rope scaling + model_kwargs = {} + config = AutoConfig.from_pretrained( + model_id + ) + if hasattr(config, "rope_scaling"): + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + model = model_class.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + trust_remote_code=trust_remote_code, + **model_kwargs + ) + model = self.prepare_model_for_quantization(model) + model = model.eval().to(device) + + self.enable_hpu_graph = os.getenv("ENABLE_HPU_GRAPH", "true").lower() == "true" and LAZY_MODE == 1 + self.limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "false").lower() == "true" + model = remove_kv_cache_from_output(model) + if self.enable_hpu_graph: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + model = wrap_in_hpu_graph(model, disable_tensor_cache=True) + else: + if LAZY_MODE == 0: + # It is said that "keep_input_mutations" is safe for inference to be done + dbg_trace( + "TORCH COMPILE", f'Torch compiling of model') + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + + model = self.setup_quantization(model) + + if model.config.model_type not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: + raise ValueError(f"Model type {model.config.model_type} is not supported!") + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None: + tokenizer.pad_token_id = model.config.eos_token_id + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + kwargs = { + "use_cache": True, + "return_dict": True, + } + + if model.config.model_type in ["llama", "mistral"]: + kwargs["attn_softmax_bf16"] = True + kwargs["trim_logits"] = True + + if os.getenv("USE_FLASH_ATTENTION", "false").lower() == "true": + kwargs["use_flash_attention"] = True + if os.getenv("FLASH_ATTENTION_RECOMPUTE", "false").lower() == "true": + kwargs["flash_attention_recompute"] = True + + self.speculate = get_speculate() + super(VlmCausalLM, self).__init__( + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + kwargs=kwargs, + ) + + + @property def batch_type(self) -> Type[VlmCausalLMBatch]: - return VlmCausalLMBatch + return self.batch_class + + def 
max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) + + def get_deepspeed_model( + self, + model_class, + model_id: str, + dtype: torch.dtype, + revision: Optional[str] = None + ) -> torch.nn.Module: + import deepspeed + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + + world_size, rank, local_rank = initialize_distributed_hpu() + model_kwargs = { + "revision": revision + } + + # Initialize process(es) for DeepSpeed + deepspeed.init_distributed(dist_backend="hccl") + logger.info( + "DeepSpeed is enabled. world_size {} rank {} local_rank {}".format(world_size, rank, local_rank) + ) + config = AutoConfig.from_pretrained(model_id, **model_kwargs) + load_to_meta = model_on_meta(config) + + # Check support for rope scaling + if hasattr(config, "rope_scaling"): + config.rope_scaling = self.get_rope_scaling() + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=dtype, device="meta"): + model = model_class.from_config(config, torch_dtype=dtype) + else: + get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK")) + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=dtype, device="cpu"): + model = model_class.from_pretrained(model_id, torch_dtype=dtype, **model_kwargs) + model = model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} + ds_inference_kwargs["enable_cuda_graph"] = False + + if load_to_meta: + # model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + write_checkpoints_json(model_id, local_rank, checkpoints_json) + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + model = deepspeed.init_inference(model, **ds_inference_kwargs) + + return model.module + + def get_rope_scaling(self) -> Optional[Dict]: + rope_scaling = os.getenv("ROPE_SCALING", None) + if rope_scaling is None: + return None + + rope_factor = float(os.getenv("ROPE_FACTOR", 1.0)) + return { + 'type': rope_scaling, 'factor': float(rope_factor) + } + + def setup_quantization(self, model): + if hq_env.is_quantization_enabled: + htorch.core.quantization._mark_params_as_const(model) + htorch.core.quantization._check_params_as_const(model) + htorch.core.hpu_initialize(model) + return model + + def prepare_model_for_quantization(self, model): + if hq_env.is_quantization_enabled: + if model.config.model_type == "llama": + self.patch_scoped_linear_all_reduce(model) + import habana_quantization_toolkit + habana_quantization_toolkit.prep_model(model) + return model + + def finish_quantization_measurements(self, model): + if hq_env.is_quantization_enabled: + import habana_quantization_toolkit + habana_quantization_toolkit.finish_measurements(self.model) + return model + + def patch_scoped_linear_all_reduce(self, model): + from deepspeed.module_inject.layers import LinearAllreduce + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + self.patch_scoped_linear_all_reduce(module) + + def decode(self, generated_ids: List[int]) -> str: + return self.tokenizer.decode(generated_ids, skip_special_tokens=True, 
clean_up_tokenization_spaces=False) + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: + if is_tokenizer_transparent(self.tokenizer): + new_text = self.tokenizer.decode(all_input_ids[read_offset:], skip_special_tokens=False) + return new_text, read_offset, len(all_input_ids) + else: + return super().decode_token(all_input_ids, prefix_offset, read_offset) def forward( - self, batch: VlmCausalLMBatch - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self, + input_ids, + attention_mask, + position_ids, + token_idx, + past_key_values: Optional[List[Tuple]] = None, + pixel_values: Optional[List[torch.Tensor]] = None, + image_sizes: Optional[List[Tuple[int, int]]] = None, + bypass_hpu_graph: Optional[bool] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: # Model Forward - if batch.speculative_ids is not None: - input_ids = batch.input_ids - position_ids = batch.position_ids - cu_seqlen_prefill = batch.cu_seqlen_prefill - kv_cache = get_cache_manager().kv_cache - block_tables = batch.block_tables_tensor - slots = batch.slots[batch.slot_indices] - input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen - lm_head_indices = batch.prefill_head_indices + kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "token_idx": token_idx, + "pixel_values": pixel_values, + "image_sizes": image_sizes + } - speculative_ids = batch.speculative_ids + hpu_kwargs = {} + # Optimum Habana got "lazy_mode" key-val only supported for llama type of models + if self.model.config.model_type == "llama" : + hpu_kwargs["lazy_mode"] = LAZY_MODE == 1 - B, speculative_length = speculative_ids.shape - new_length = speculative_length + 1 - new_input_ids = torch.cat( - [input_ids.unsqueeze(-1), speculative_ids], dim=1 - ).reshape(-1) - arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0) - arange_int = arange.to(dtype=torch.int32) - new_position_ids = ( - position_ids.unsqueeze(-1).expand(B, new_length) + arange - ).view(-1) - slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) - input_lengths = ( - input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int - ).view(-1) + if self.has_position_ids: + kwargs["position_ids"] = position_ids - # Add Copy the block tables for all members - block_tables = ( - block_tables.unsqueeze(1) - .expand(B, new_length, -1) - .reshape(B * new_length, -1) - .contiguous() - ) - max_s = max_s + speculative_length + if bypass_hpu_graph != None: + hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph - input_ids = new_input_ids - position_ids = new_position_ids + kwargs.update(self.kwargs) + model_inputs = self.model.prepare_inputs_for_generation(**kwargs) + if past_key_values is not None: + return self.model.forward(**model_inputs, **hpu_kwargs) else: - input_ids = batch.input_ids - position_ids = batch.position_ids - cu_seqlen_prefill = batch.cu_seqlen_prefill - kv_cache = get_cache_manager().kv_cache - block_tables = batch.block_tables_tensor - slots = batch.slots[batch.slot_indices] - input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen - lm_head_indices = batch.prefill_head_indices + outputs = self.model.forward(**model_inputs, **hpu_kwargs) + return outputs.logits, outputs.past_key_values - if cu_seqlen_prefill is None and self.max_past() is not None: - # In decode, not prefill, we're actually overwriting the KV-cache - # in a circular buffer mode. 
- # This makes sure the max_s for the decode pass is correct. - max_s = min(self.max_past(), max_s) + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batches: List[VlmCausalLMBatch] + ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + # Results + generations: List[Generation] = [] + prev_batches = [] + requests_to_generate = [] + # In order to pipeline any actions on CPU we perform the operation in 3 main stages: + # Stage 1. Collect next token ids of any previously started generations + for batch_id, batch in enumerate(batches): + if batch.logits is not None: + logits = batch.logits + past = batch.past + prefill = batch.past_key_values is None + if prefill: + # no right padding for prefill + token_idx_scalar = batch.attention_mask.shape[-1] - 1 + token_idx = torch.tensor(token_idx_scalar).to(self.device) + else: + token_idx_scalar = batch.attention_mask.shape[-1] - batch.right_padding + token_idx = torch.tensor(token_idx_scalar).to(self.device) - bs = input_ids.shape[0] - # Try to find an associated cuda graph - bs = input_ids.shape[0] - sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) - if sorted_padded_bs: - # Get associated cuda graph - cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] + # Select next token + input_length = batch.input_length + if logits.shape[-2] > 1: + next_token_ids, next_token_logprobs, logprobs, _, _ = batch.next_token_chooser( + batch.input_ids, logits[:, input_length - 1: input_length, :].squeeze(-2), self.speculate + ) + else: + next_token_ids, next_token_logprobs, logprobs, _, _ = batch.next_token_chooser( + batch.input_ids, logits.squeeze(-2), self.speculate + ) + # Speculation is not active for causal + accepted_ids = torch.ones_like(batch.input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + logprobs, + accepted_ids, + ) + + prev_batches.append({ + 'next_token_ids': next_token_ids, + 'next_token_logprobs': next_token_logprobs, + }) + + for req_idx, req in enumerate(batch.requests): + requests_to_generate.append({ + 'req': req, + 'prev_req_idx': req.idx, + 'batch_id': batch_id, + 'seed': batch.next_token_chooser.seeds[req_idx], + 'do_sample': batch.next_token_chooser.do_sample[req_idx], + 'top_n_tokens': batch.top_n_tokens[req_idx], + 'top_token_ids': batch_top_token_ids[req_idx], + 'top_token_logprobs': batch_top_token_logprobs[req_idx], + 'grammar_state': batch.next_token_chooser.fsm_grammar_states[req.idx], + }) + + htorch.core.mark_step() + + # Add new token into input_ids + batch.input_ids.index_copy_(1, token_idx, next_token_ids.unsqueeze(1)) + + # Update attention_mask as we added a new token to input_ids + batch.attention_mask.index_fill_(1, token_idx, 1) + + # Adjust lengths + batch.input_length += 1 + + # Update position_ids + if prefill: + batch.position_ids = torch.index_select(batch.position_ids, 1, token_idx - 1) + 1 + else: + batch.position_ids += 1 + # Update past key values + if prefill: + batch.past_key_values = past + + htorch.core.mark_step() + + # Stage 2. 
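
The decode path above never grows input_ids: sequences stay padded to a fixed bucket and each new token is written in place at token_idx, which keeps tensor shapes static for HPU graphs. A toy, CPU-only sketch of that in-place update (all values are made up):

import torch

pad_id = 0
# Two sequences, right-padded to a bucket of 8.
input_ids = torch.tensor([
    [5, 6, 7, pad_id, pad_id, pad_id, pad_id, pad_id],
    [9, 3, 2, pad_id, pad_id, pad_id, pad_id, pad_id],
])
attention_mask = (input_ids != pad_id).long()
right_padding = 5
token_idx = torch.tensor([input_ids.shape[-1] - right_padding])  # -> [3]

# Pretend the model just produced these next tokens for the batch.
next_token_ids = torch.tensor([11, 12])

# Write each new token into its padded slot and mark it as attended.
input_ids.index_copy_(1, token_idx, next_token_ids.unsqueeze(1))
attention_mask.index_fill_(1, token_idx, 1)
print(input_ids[0].tolist())  # [5, 6, 7, 11, 0, 0, 0, 0]
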
Prepare new batch for speculative scheduling + if len(batches) > 1: + batch = self.batch_type.concatenate(batches, self.tokenizer.pad_token_id) else: - cuda_graph = None - if cu_seqlen_prefill is not None or cuda_graph is None: - logits, speculative_logits = self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - input_lengths=input_lengths, - max_s=max_s, - prefill_cache_indices=batch.prefill_cache_indices, - lm_head_indices=lm_head_indices, - pixel_values=batch.pixel_values, - pixel_attention_mask=batch.pixel_attention_mask, - image_sizes=batch.image_sizes, + batch = batches[0] + + prefill = batch.past_key_values is None + + # Check if we need to do any bookkeeping first + if not prefill: + batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id) + + scenario = 'PREFILL' if prefill else 'GENERATE' + if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs: + self.model.clear_cache() + self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) + dbg_trace( + scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}') + #assert batch.right_padding > 0, 'No more room for next token!' + + # Execute batch + if prefill: + # no right padding for prefill + token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + batch.logits, batch.past = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + batch.pixel_values, + batch.image_sizes, + bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None, + ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): + # Don't schedule next forward if max_new_tokens for all requests equals 1 + # - we've already generated the first and only needed token in the prefill phase + pass + else: + token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device) + batch.logits = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + bypass_hpu_graph=prefill and self.limit_hpu_graph if self.enable_hpu_graph else None, ) - if batch.prefill_cache_indices is not None: - batch.prefill_cache_indices = None - if batch.pixel_values is not None: - batch.pixel_values = None - if batch.pixel_attention_mask is not None: - batch.pixel_attention_mask = None - if batch.image_sizes is not None: - batch.image_sizes = None - return logits, speculative_logits - # Copy inputs to the static inputs of the cuda graph - # Static inputs are potentially padded - cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids - cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - cuda_graph["slots"].fill_(-1) - cuda_graph["slots"][: slots.shape[0]] = slots - cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + htorch.core.mark_step() - # Replay the graph - cuda_graph["graph"].replay() + start_decode = time.time_ns() - # Slice output to the correct shape - speculative_logits = ( - cuda_graph["speculative_logits"][:bs] - if cuda_graph["speculative_logits"] is not None - else None + # Stage 3. 
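
Two of the checks above are easy to miss: the HPU graph cache is cleared whenever the padded (bucketed) batch size changes, and no further decode forward is scheduled when every request asked for a single new token, because prefill already produced it. A simplified sketch of both decisions, with a plain power-of-two bucket standing in for the warmed-up bucket sizes used in the real code:

from typing import List

def needs_graph_reset(prev_bucket: int, batch_size: int, bucket_size: int = 8) -> bool:
    # HPU graphs are recorded per padded batch size; when the padded batch
    # size changes, the cached graphs have to be dropped and re-captured.
    bucket = -(-batch_size // bucket_size) * bucket_size  # ceil to the bucket
    return bucket != prev_bucket

def should_schedule_decode(max_new_tokens_per_request: List[int]) -> bool:
    # After prefill produced the first token, another forward is only needed
    # if at least one request wants more than one new token.
    return any(n > 1 for n in max_new_tokens_per_request)

print(needs_graph_reset(prev_bucket=8, batch_size=9))  # True (9 pads to 16)
print(should_schedule_decode([1, 1, 1]))               # False
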
Finish and return previous generations + stopped = len(requests_to_generate) > 0 + for prev_batch in prev_batches: + prev_batch['next_token_logprobs'] = prev_batch['next_token_logprobs'].tolist() + prev_batch['next_token_ids_cpu'] = prev_batch['next_token_ids'].cpu() + htorch.core.mark_step() + + for req_data in requests_to_generate: + req = req_data['req'] + i = req_data['prev_req_idx'] + prev_batch_id = req_data['batch_id'] + assert len(prev_batches) > prev_batch_id + next_token_ids_cpu = prev_batches[prev_batch_id]['next_token_ids_cpu'] + next_token_logprobs = prev_batches[prev_batch_id]['next_token_logprobs'] + + request = req.data + input_length = req.input_length + prefix_offset = req.prefix_offset + read_offset = req.read_offset + do_sample = req_data['do_sample'] + seed = req_data['seed'] + stopping_criteria = req.stopping_criteria + all_input_ids = req.all_input_ids + next_token_id = next_token_ids_cpu[i] + next_token_logprob = next_token_logprobs[i] + top_n_tokens = req_data['top_n_tokens'] + top_token_ids = req_data['top_token_ids'] + top_token_logprobs = req_data['top_token_logprobs'] + grammar_state = req_data['grammar_state'] + + # Append next token to all tokens + all_input_ids[input_length] = next_token_id + new_input_length = input_length + 1 + + # Generated token + if is_tokenizer_transparent(self.tokenizer) and len(stopping_criteria.stop_sequence_criterias) == 0: + next_token_text = '' + else: + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[0:new_input_length, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + if is_tokenizer_transparent(self.tokenizer): + output_text = None + else: + output_text = self.decode( + all_input_ids[new_input_length - stopping_criteria.current_tokens: new_input_length, 0] + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + next_token_logprobs + prefill_token_ids = all_input_ids[0: new_input_length - 1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id], + [next_token_logprob], + [next_token_text], + [next_token_id in self.all_special_ids], 
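
The `i % self.world_size == self.rank` guard above is what shards the finished generations: every rank processes the same batch, but each request's Generation is emitted by exactly one rank and the sharded client on the router side reassembles them. A toy illustration of that ownership rule (request names are made up):

world_size = 2
requests = ["req0", "req1", "req2", "req3", "req4"]
for rank in range(world_size):
    owned = [r for i, r in enumerate(requests) if i % world_size == rank]
    print(rank, owned)
# 0 ['req0', 'req2', 'req4']
# 1 ['req1', 'req3']
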
+ ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single_with_past_state( + req.idx, next_token_id, grammar_state + ) + ) + + req.all_input_ids = all_input_ids + req.input_length = new_input_length + req.prefix_offset = prefix_offset + req.read_offset = read_offset + + htorch.core.mark_step() + # self.step = self.step + 1 + # if self.hb_profiler is not None: + # if self.step > self.profiling_wait_steps + self.profiling_warmup_steps + self.profiling_steps: + # self.hb_profiler.stop() + # else: + # self.hb_profiler.step() + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch if not stopped else None, (forward_ns, decode_ns) + + def batch_from_pb(self, batch): + return VlmCausalLMBatch.from_pb_processor( + batch, + self.tokenizer, + self.processor, + self.model.config, + self.dtype, + self.device ) - logits = cuda_graph["logits"][:bs] - return logits, speculative_logits + + def generate_warmup_batch(self, request, seq_len, batch_size): + batch = copy.deepcopy(request.batches[0]) + for req in batch.requests: + req.truncate = seq_len + + for i in range(len(batch.requests) - batch_size): + batch.requests.pop() + + return self.batch_from_pb(batch) + + def warmup(self, request) -> None: + batches = [self.batch_from_pb(batch) for batch in request.batches] + + try: + # prefill + _, prefill_batch, _ = self.generate_token([batches[0]]) + except torch.cuda.OutOfMemoryError as e: + raise RuntimeError( + f"Not enough memory to handle {len(batches[0].input_ids)} prefill tokens. " + f"You need to decrease `--max-batch-prefill-tokens`" + ) from e + + global BASE_IMAGE_TOKENS, PAD_SEQUENCE_TO_MULTIPLE_OF, PREFILL_BATCH_BUCKET_SIZE, PREFILL_GRAPH_NUM + max_input_length = batches[0].input_ids.shape[1] + max_batch_size = batches[0].input_ids.shape[0] + seq_num = (max_input_length - BASE_IMAGE_TOKENS) / PAD_SEQUENCE_TO_MULTIPLE_OF + batch_num = max_batch_size / PREFILL_BATCH_BUCKET_SIZE + while batch_num > PREFILL_GRAPH_NUM : + PREFILL_BATCH_BUCKET_SIZE = PREFILL_BATCH_BUCKET_SIZE * 2 + os.environ['PREFILL_BATCH_BUCKET_SIZE'] = str(PREFILL_BATCH_BUCKET_SIZE) + batch_num = max_batch_size / PREFILL_BATCH_BUCKET_SIZE + + while seq_num * batch_num >= PREFILL_GRAPH_NUM : + PAD_SEQUENCE_TO_MULTIPLE_OF = PAD_SEQUENCE_TO_MULTIPLE_OF * 2 + os.environ['PAD_SEQUENCE_TO_MULTIPLE_OF'] = str(PAD_SEQUENCE_TO_MULTIPLE_OF) + seq_num = (max_input_length - BASE_IMAGE_TOKENS) / PAD_SEQUENCE_TO_MULTIPLE_OF + + seq_lens_list = numpy.arange(BASE_IMAGE_TOKENS + PAD_SEQUENCE_TO_MULTIPLE_OF, max_input_length + 1, PAD_SEQUENCE_TO_MULTIPLE_OF).tolist() + batch_sizes_list = numpy.arange(PREFILL_BATCH_BUCKET_SIZE, max_batch_size + 1, PREFILL_BATCH_BUCKET_SIZE).tolist() + for seq_len in seq_lens_list : + for batch_size in batch_sizes_list : + batch = self.generate_warmup_batch(request, seq_len, batch_size) + _, prefill_batch, _ = self.generate_token([batch]) + _, decode_batch, _ = self.generate_token([prefill_batch]) diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 5184731f..0b5e9e03 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -96,8 +96,11 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): batch, self.model.tokenizer, self.model.dtype, self.model.device ) - batches = [batch_from_pb(batch) for batch in request.batches] - self.model.warmup(batches) + 
if self.model.batch_type in VLM_BATCH_TYPES : + self.model.warmup(request) + else: + batches = [batch_from_pb(batch) for batch in request.batches] + self.model.warmup(batches) return generate_pb2.WarmupResponse() From db0b6567e1d2121e0f19f566c22ecf7c6ce4dccd Mon Sep 17 00:00:00 2001 From: yuanwu Date: Mon, 29 Jul 2024 22:02:42 +0000 Subject: [PATCH 43/46] Remove log Signed-off-by: yuanwu --- server/text_generation_server/models/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 569b204f..30bd8f6c 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -165,9 +165,8 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) - logger.info(f"model_type = {model_type}") + if model_type == "llava_next": - logger.info(f"################model_type = {model_type}") return VlmCausalLM( model_class=LlavaNextForConditionalGeneration, model_id=model_id, From 3f0f0e08250303a0ae854942fabe93847d15dee8 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 30 Jul 2024 03:53:46 +0000 Subject: [PATCH 44/46] Add the habana profiler Signed-off-by: yuanwu --- .../models/vlm_causal_lm.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 7412092a..51c4b340 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -494,6 +494,25 @@ class VlmCausalLM(Model): kwargs=kwargs, ) + # Create profiler + ranks_to_profile = [int(val) for val in os.getenv("PROF_RANKS", "0").split(',')] + record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true" + output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile") + self.profiling_warmup_steps = int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in ranks_to_profile else 0 + self.profiling_steps = int(os.getenv("PROF_STEP", "0")) if rank in ranks_to_profile else 0 + self.profiling_wait_steps = int(os.getenv("PROF_WAITSTEP", "0")) + if self.profiling_steps > 0: + self.hb_profiler = HabanaProfile( + wait=self.profiling_wait_steps, + warmup=self.profiling_warmup_steps, + active=self.profiling_steps, + output_dir=output_dir, + record_shapes=record_shapes + ) + self.hb_profiler.start() + else: + self.hb_profiler = None + self.step = 0 @property @@ -929,12 +948,12 @@ class VlmCausalLM(Model): req.read_offset = read_offset htorch.core.mark_step() - # self.step = self.step + 1 - # if self.hb_profiler is not None: - # if self.step > self.profiling_wait_steps + self.profiling_warmup_steps + self.profiling_steps: - # self.hb_profiler.stop() - # else: - # self.hb_profiler.step() + self.step = self.step + 1 + if self.hb_profiler is not None: + if self.step > self.profiling_wait_steps + self.profiling_warmup_steps + self.profiling_steps: + self.hb_profiler.stop() + else: + self.hb_profiler.step() forward_ns = start_decode - start decode_ns = time.time_ns() - start_decode From 05c13c89defeb5c67114f1ad567643f71b29d4d1 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 30 Jul 2024 10:05:38 +0000 Subject: [PATCH 45/46] Remove useless modification Signed-off-by: yuanwu --- .../models/flash_causal_lm.py | 117 +++--------------- 1 file changed, 15 insertions(+), 102 deletions(-) diff --git a/server/text_generation_server/models/flash_causal_lm.py 
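
The profiler added above is configured entirely through environment variables (PROF_WAITSTEP / PROF_WARMUPSTEP / PROF_STEP), and generate_token stops it once the active window has elapsed. A rough, standalone sketch of the resulting step schedule, using example values instead of the real HabanaProfile object:

# Example window: 1 wait step, 2 warmup steps, 3 recorded steps.
wait, warmup, active = 1, 2, 3

for step in range(1, wait + warmup + active + 2):
    if step > wait + warmup + active:
        phase = "stopped"      # this is where hb_profiler.stop() is called
    elif step > wait + warmup:
        phase = "recording"
    elif step > wait:
        phase = "warmup"
    else:
        phase = "waiting"
    print(step, phase)
# steps 1: waiting, 2-3: warmup, 4-6: recording, 7: stopped
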
b/server/text_generation_server/models/flash_causal_lm.py index 72ceca6b..86d9b4c8 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -10,12 +10,7 @@ import numpy as np from loguru import logger from dataclasses import dataclass from opentelemetry import trace -from transformers import ( - PreTrainedTokenizerBase, - AutoConfig, - AutoTokenizer, - GenerationConfig, -) +from transformers import PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type, Dict from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -24,11 +19,6 @@ from text_generation_server.models import Model from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.dist import RANK from text_generation_server.utils.speculate import get_speculate -from text_generation_server.utils import ( - initialize_torch_distributed, - weight_files, - Weights, -) from text_generation_server.models.types import ( Batch, Tokens, @@ -696,97 +686,20 @@ class FlashCausalLMBatch(Batch): class FlashCausalLM(Model): def __init__( self, - model_id: str, - model_class, - revision: Optional[str] = None, - quantize: Optional[str] = None, - speculator: Optional[str] = None, - dtype: Optional[torch.dtype] = None, - trust_remote_code: bool = False, - lora_adapter_ids: Optional[list] = [], - tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer, - config_class: PreTrainedTokenizerBase = AutoConfig, - default_dtype=torch.bfloat16, - aliases=None, - # Used for Santacoder override of config - num_kv_heads: Optional[int] = None, - # Deepseek V2 uses different QK and V dims. - head_size: Optional[int] = None, - skip_special_tokens: bool = True, + model: torch.nn.Module, + tokenizer: PreTrainedTokenizerBase, + num_layers: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + rank: int = 0, + world_size: int = 1, + sliding_window: Optional[int] = None, ): - - # Create model - world_size = int(os.getenv("WORLD_SIZE", "1")) - rank = int(os.getenv("RANK", "0")) - dtype = torch.bfloat16 if dtype is None else dtype - device = torch.device("hpu") - - tokenizer = tokenizer_class.from_pretrained( - model_id, - revision=revision, - padding_side="left", - truncation_side="left", - trust_remote_code=trust_remote_code, - ) - try: - generation_config = GenerationConfig.from_pretrained( - model_id, revision=revision, trust_remote_code=trust_remote_code - ) - if isinstance(generation_config.eos_token_id, (list, set)): - # TODO Huge hack - tokenizer._eos_token_ids = set(generation_config.eos_token_id) - except Exception: - pass - - config = config_class.from_pretrained( - model_id, revision=revision, trust_remote_code=trust_remote_code - ) - config.quantize = quantize - config.speculator = speculator - - - filenames = weight_files(model_id, revision=revision, extension=".safetensors") - weights = Weights(filenames, device, dtype) - - - - prefix = "" - model = model_class(prefix, config, weights) - - # VLM models define the config we care about in their text_config - text_config = getattr(config, "text_config", None) - if text_config is not None: - config = text_config - - - self.num_layers = config.num_hidden_layers - # Validation is done in the model itself - if num_kv_heads is None: - num_kv_heads = getattr(config, "num_key_value_heads", None) - # GPT-2 workaround - if num_kv_heads is None: - num_kv_heads = getattr(config, "n_head", None) - if num_kv_heads is None: - raise ValueError("Cannot 
get the number of key/value heads") - self.num_kv_heads = num_kv_heads ( - num_kv_heads // self.process_group.size() - if num_kv_heads > 1 - else num_kv_heads - ) - assert self.num_kv_heads > 0 - - if head_size is None: - # Some models use GQA and different sizes for o_proj - # and q_proj, that allows for that. - if hasattr(config, "head_dim"): - self.head_size = config.head_dim - else: - self.head_size = config.hidden_size // config.num_attention_heads - else: - self.head_size = head_size - - self.cuda_graphs = {} - self.kv_cache = [] + self.num_layers = num_layers + self.num_kv_heads = num_kv_heads + self.head_size = head_size self.cuda_graphs = {} @@ -798,7 +711,7 @@ class FlashCausalLM(Model): device=device, rank=rank, world_size=world_size, - sliding_window=None, + sliding_window=sliding_window, ) @property From d34ffc4fe9f468436d6f81b6334454e24af31ba5 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Fri, 2 Aug 2024 04:36:59 +0000 Subject: [PATCH 46/46] Refile the hpu warmup Signed-off-by: yuanwu --- .../models/custom_modeling/llava_next.py | 1 - .../models/vlm_causal_lm.py | 329 ++++++++++++++---- server/text_generation_server/server.py | 7 +- 3 files changed, 269 insertions(+), 68 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py index 4268cc9b..319a6d28 100644 --- a/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/server/text_generation_server/models/custom_modeling/llava_next.py @@ -26,7 +26,6 @@ from transformers.models.llava_next.modeling_llava_next import ( ) from optimum.habana.transformers.models import GaudiLlavaNextForConditionalGeneration from transformers.image_processing_utils import select_best_resolution -from loguru import logger def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): """ diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 51c4b340..5c6c90c6 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -10,6 +10,7 @@ import numpy from opentelemetry import trace from loguru import logger from typing import Optional, Tuple, List, Type, Dict +import itertools import tempfile import copy from text_generation_server.models import Model @@ -20,9 +21,10 @@ from text_generation_server.pb import generate_pb2 from text_generation_server.models.causal_lm import ( CausalLMBatch, CausalLMRequest, - round_up, - remove_kv_cache_from_output + remove_kv_cache_from_output, + biggest_single_chunk, ) + from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, ) @@ -43,6 +45,7 @@ from text_generation_server.utils import ( import habana_frameworks.torch as htorch from optimum.habana.utils import HabanaProfile from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES +from optimum.habana.utils import get_hpu_memory_stats from transformers import ( AutoTokenizer, @@ -70,18 +73,20 @@ tracer = trace.get_tracer(__name__) IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") BASE_IMAGE_TOKENS = int(os.environ.get('BASE_IMAGE_TOKENS', 2048)) MAX_TOTAL_TOKENS = int(os.environ.get('MAX_TOTAL_TOKENS', 8192)) -BATCH_BUCKET_SIZE = int(os.environ.get('BATCH_BUCKET_SIZE', 1)) -PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 128)) -PREFILL_BATCH_BUCKET_SIZE = 
int(os.environ.get('PREFILL_BATCH_BUCKET_SIZE', 1)) +MAX_BATCH_TOTAL_TOKENS = int(os.environ.get('MAX_BATCH_TOTAL_TOKENS', 131072)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get('PAD_SEQUENCE_TO_MULTIPLE_OF', 256)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get('PT_HPU_LAZY_MODE', 1)) -PREFILL_GRAPH_NUM = int(os.environ.get('PREFILL_GRAPH_NUM', 16)) -os.environ['MAX_TOTAL_TOKENS'] = str(MAX_TOTAL_TOKENS) -os.environ['BATCH_BUCKET_SIZE'] = str(BATCH_BUCKET_SIZE) -os.environ['PAD_SEQUENCE_TO_MULTIPLE_OF'] = str(PAD_SEQUENCE_TO_MULTIPLE_OF) -os.environ['PREFILL_BATCH_BUCKET_SIZE'] = str(PREFILL_BATCH_BUCKET_SIZE) -os.environ['LAZY_MODE'] = str(LAZY_MODE) +PREFILL_WARMUP_BATCH_SIZE_LIST = [] +PREFILL_WARMUP_SEQLEN_LIST = [] +DECODE_WARMUP_BATCH_SIZE_LIST = [] +def round_up(warmup_list:list, num) : + i = 0 + for i in warmup_list: + if num <= i : + break + return i def split(string) -> List[Dict[str, str]]: parts = [] @@ -186,6 +191,7 @@ class VlmCausalLMBatch(CausalLMBatch): batch_tokenized_inputs, dtype: torch.dtype, device: torch.device, + is_warmup: bool = False, ) -> "VlmCausalLMBatch": dbg_trace('FROM_PB', f'num_reqs:{len(pb.requests)}') @@ -200,7 +206,7 @@ class VlmCausalLMBatch(CausalLMBatch): # TODO: by tokenizing all inputs at once we loose information on actual input lengths # this means that we cannot shift inputs to the left after a long input sequence # was filtered out - new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) parameters = [r.parameters for r in pb.requests] # append the dummy parameters for dummy request parameters = pad_next_token_chooser_parameters(parameters, new_bs) @@ -217,14 +223,14 @@ class VlmCausalLMBatch(CausalLMBatch): bucket_size = max_input_length left_padding = max_input_length - input_len - if input_len < max_input_length and PAD_SEQUENCE_TO_MULTIPLE_OF != 0: - assert PAD_SEQUENCE_TO_MULTIPLE_OF <= max_input_length, "PAD_SEQUENCE_TO_MULTIPLE_OF cannot be higher than max_input_length" - rounded_seq_len = round_up(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF) - if rounded_seq_len <= max_input_length: - bucket_size = rounded_seq_len - 1 - else: - bucket_size = max_input_length - 1 - left_padding = bucket_size - input_len + if is_warmup is False: + if input_len < max_input_length : + rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) + if rounded_seq_len <= max_input_length: + bucket_size = rounded_seq_len - 1 + else: + bucket_size = max_input_length - 1 + left_padding = bucket_size - input_len input_ids = tokenized_inputs["input_ids"] attention_mask = tokenized_inputs["attention_mask"] @@ -269,7 +275,7 @@ class VlmCausalLMBatch(CausalLMBatch): ) @classmethod - def batch_tokenized_inputs(cls, requests, tokenizer, processor, config): + def batch_tokenized_inputs(cls, requests, tokenizer, processor, config, is_warmup): batch_inputs = [] image_inputs = [] max_truncation = 0 @@ -301,17 +307,19 @@ class VlmCausalLMBatch(CausalLMBatch): batch_inputs.append(full_text) max_truncation = max(max_truncation, r.truncate) - new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) - missing_inputs = new_bs - len(requests) - dummy_images = [] - dummy_inputs = [] - if len(batch_inputs) > 0 and len(image_inputs) > 0: - dummy_inputs = [batch_inputs[0]] * missing_inputs - dummy_images = [image_inputs[0]] * missing_inputs + if is_warmup is False: + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) + missing_inputs = 
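
The new module-level round_up no longer rounds to a fixed multiple; it looks up the first bucket in a warmup list that can hold the value, and falls back to the last bucket when nothing fits. A short usage example with illustrative bucket lists (the real lists are built during warmup):

PREFILL_WARMUP_SEQLEN_LIST = [2304, 2816, 3840]   # illustrative
DECODE_WARMUP_BATCH_SIZE_LIST = [1, 2, 4, 8]      # illustrative

def round_up(warmup_list, num):
    # Same lookup as above: first warmed-up bucket >= num, last bucket otherwise.
    result = 0
    for result in warmup_list:
        if num <= result:
            break
    return result

print(round_up(PREFILL_WARMUP_SEQLEN_LIST, 2400))   # 2816
print(round_up(DECODE_WARMUP_BATCH_SIZE_LIST, 3))   # 4
print(round_up(DECODE_WARMUP_BATCH_SIZE_LIST, 16))  # 8 (capped at last bucket)
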
new_bs - len(requests) + dummy_images = [] + dummy_inputs = [] + if len(batch_inputs) > 0 and len(image_inputs) > 0: + dummy_inputs = [batch_inputs[0]] * missing_inputs + dummy_images = [image_inputs[0]] * missing_inputs + image_inputs += dummy_images + batch_inputs += dummy_inputs - image_inputs += dummy_images batch_tokenized_inputs = tokenizer( - batch_inputs + dummy_inputs, + batch_inputs, truncation=True, max_length=max_truncation, return_tensors="pt", @@ -347,9 +355,10 @@ class VlmCausalLMBatch(CausalLMBatch): config, dtype: torch.dtype, device: torch.device, + is_warmup: bool = False, ) -> "VlmCausalLMBatch": batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( - pb.requests, tokenizer, processor, config + pb.requests, tokenizer, processor, config, is_warmup ) batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) if image_inputs is not None: @@ -370,6 +379,114 @@ class VlmCausalLMBatch(CausalLMBatch): batch.image_sizes = None return batch + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate(cls, batches: List["CausalLMBatch"], pad_token_id: int = 0, is_warmup:bool = False) -> "CausalLMBatch": + return cls.recombine(batches, pad_token_id, is_warmup) + + + + @classmethod + def recombine(cls, batches: List["VlmCausalLMBatch"], pad_token_id: int, is_warmup: bool =False) -> "VlmCausalLMBatch": + if not all(b.past_key_values is not None for b in batches): + raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + + total_requests = sum(len(b) for b in batches) + new_bs = total_requests + if is_warmup is False : + new_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, total_requests) + batch_id = batches[0].batch_id + device = batches[0].input_ids.device + + input_lengths = [b.input_length for b in batches] + max_input_length = max(input_lengths) + offsets = [max_input_length - b.input_length for b in batches] + + cur_padding = [b.right_padding for b in batches] + # For prefill there is a space allocated only for first token + # Need to add padding to the max total tokens before first decode + + moves_needed = [total_requests - len(b) if b.batch_size == new_bs else total_requests for b in batches] + dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] + reshape = (batches[dst_batch_idx].batch_size < new_bs) + + # TODO: Add support for changing max seq len, i.e. 
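
batch_tokenized_inputs above pads a real batch up to the next warmed-up batch size by appending copies of the first request's text and image, so prefill always runs at a shape that was captured during warmup. A small standalone sketch of that padding rule (bucket sizes and request names are illustrative):

def pad_to_warmed_up_bucket(items, bucket_sizes):
    # Pick the first warmed-up batch size that fits, then pad with copies of
    # the first entry; no-op when the batch already matches a bucket.
    target = next((b for b in sorted(bucket_sizes) if b >= len(items)), len(items))
    missing = target - len(items)
    return items + ([items[0]] * missing if items else [])

print(pad_to_warmed_up_bucket(["req0", "req1", "req2"], [1, 2, 4, 8]))
# ['req0', 'req1', 'req2', 'req0']
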
due to output length bucketing + # FIXME: max_seq_len for non optimized code + if len(batches) > 1: + scenario = 'CONCAT' + elif reshape: + scenario = 'RESHAPE' + elif cur_padding[dst_batch_idx] <= 0: + scenario = 'SHIFT' + offsets = [biggest_single_chunk(b.max_input_length - max_input_length) for b in batches] + max_input_length = max_input_length + offsets[dst_batch_idx] + else: + # Nothing to do + return batches[0] + + dbg_trace( + scenario, f'bs:{[b.batch_size for b in batches]}->{new_bs}' + f' reqs:{[len(b) for b in batches]}' + f' offsets:{offsets}' + f' input_lengths:{input_lengths}' + f' cur_padding:{cur_padding}' + f' dst_batch:{dst_batch_idx}') + + grouped_requests = [[req for req in batch.requests] for batch in batches] + flat_requests = list(itertools.chain(*grouped_requests)) + + for i in range(len(batches)): + target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size + batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) + batches[i].realign(target_bs, offsets[i], pad_token_id) + batches[i].split_kv_cache_if_needed(i == dst_batch_idx) + batches[dst_batch_idx].expand_bs(new_bs) + batches[dst_batch_idx].move_data([batches[i] for i in range(len(batches)) if i != dst_batch_idx]) + + top_n_tokens = [r.data.top_n_tokens for r in flat_requests] + top_n_tokens_tensor = torch.tensor(top_n_tokens, device=device, dtype=torch.int64) + + parameters = [r.data.parameters for r in flat_requests] + # append the dummy parameters for dummy requests + batch_size = batches[dst_batch_idx].batch_size + parameters = pad_next_token_chooser_parameters(parameters, batch_size) + + # update past grammar states + fsm_grammar_states = [0] * batch_size + for batch in batches: + for i, req in enumerate(batch.requests): + fsm_grammar_states[req.idx] = batch.next_token_chooser.fsm_grammar_states[i] + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + parameters, + batches[dst_batch_idx].next_token_chooser.dtype, + batches[dst_batch_idx].next_token_chooser.device, + batches[dst_batch_idx].next_token_chooser.tokenizer, + fsm_grammar_states, + quantization_enabled=hq_env.is_quantization_enabled, + ) + + input_ids = batches[dst_batch_idx].input_ids + attention_mask = batches[dst_batch_idx].attention_mask + position_ids = batches[dst_batch_idx].position_ids + past_key_values = batches[dst_batch_idx].past_key_values + input_length = max_input_length + + htorch.core.mark_step() + + return cls( + batch_id=batch_id, + requests=flat_requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_length, + ) class VlmCausalLM(Model): def __init__( @@ -672,7 +789,7 @@ class VlmCausalLM(Model): @tracer.start_as_current_span("generate_token") def generate_token( - self, batches: List[VlmCausalLMBatch] + self, batches: List[VlmCausalLMBatch], is_warmup: bool = False ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: start = time.time_ns() # Results @@ -755,7 +872,7 @@ class VlmCausalLM(Model): # Stage 2. 
Prepare new batch for speculative scheduling if len(batches) > 1: - batch = self.batch_type.concatenate(batches, self.tokenizer.pad_token_id) + batch = self.batch_type.concatenate(batches, self.tokenizer.pad_token_id, is_warmup) else: batch = batches[0] @@ -763,12 +880,12 @@ class VlmCausalLM(Model): # Check if we need to do any bookkeeping first if not prefill: - batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id) + batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id, is_warmup) scenario = 'PREFILL' if prefill else 'GENERATE' - if self.enable_hpu_graph and self.limit_hpu_graph and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs: + if self.enable_hpu_graph and self.limit_hpu_graph and round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) != self.prev_bs: self.model.clear_cache() - self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) + self.prev_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) dbg_trace( scenario, f'bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}') #assert batch.right_padding > 0, 'No more room for next token!' @@ -959,17 +1076,18 @@ class VlmCausalLM(Model): decode_ns = time.time_ns() - start_decode return generations, batch if not stopped else None, (forward_ns, decode_ns) - def batch_from_pb(self, batch): + def batch_from_pb(self, batch, is_warmup): return VlmCausalLMBatch.from_pb_processor( batch, self.tokenizer, self.processor, self.model.config, self.dtype, - self.device + self.device, + is_warmup ) - def generate_warmup_batch(self, request, seq_len, batch_size): + def generate_warmup_batch(self, request, seq_len, batch_size, is_warmup): batch = copy.deepcopy(request.batches[0]) for req in batch.requests: req.truncate = seq_len @@ -977,39 +1095,122 @@ class VlmCausalLM(Model): for i in range(len(batch.requests) - batch_size): batch.requests.pop() - return self.batch_from_pb(batch) + return self.batch_from_pb(batch, is_warmup) def warmup(self, request) -> None: - batches = [self.batch_from_pb(batch) for batch in request.batches] + is_warmup = True + batches = [self.batch_from_pb(batch, is_warmup) for batch in request.batches] try: - # prefill - _, prefill_batch, _ = self.generate_token([batches[0]]) - except torch.cuda.OutOfMemoryError as e: + # max prefill batch size warmup + _, prefill_batch, _ = self.generate_token([batches[0]], is_warmup) + except: raise RuntimeError( f"Not enough memory to handle {len(batches[0].input_ids)} prefill tokens. 
" f"You need to decrease `--max-batch-prefill-tokens`" - ) from e + ) - global BASE_IMAGE_TOKENS, PAD_SEQUENCE_TO_MULTIPLE_OF, PREFILL_BATCH_BUCKET_SIZE, PREFILL_GRAPH_NUM + self.model.clear_inputs() + global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST max_input_length = batches[0].input_ids.shape[1] - max_batch_size = batches[0].input_ids.shape[0] - seq_num = (max_input_length - BASE_IMAGE_TOKENS) / PAD_SEQUENCE_TO_MULTIPLE_OF - batch_num = max_batch_size / PREFILL_BATCH_BUCKET_SIZE - while batch_num > PREFILL_GRAPH_NUM : - PREFILL_BATCH_BUCKET_SIZE = PREFILL_BATCH_BUCKET_SIZE * 2 - os.environ['PREFILL_BATCH_BUCKET_SIZE'] = str(PREFILL_BATCH_BUCKET_SIZE) - batch_num = max_batch_size / PREFILL_BATCH_BUCKET_SIZE + max_prefill_batch_size = batches[0].input_ids.shape[0] + PREFILL_WARMUP_BATCH_SIZE_LIST = [] + batch_size = 1 + while batch_size <= max_prefill_batch_size: + PREFILL_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + if PREFILL_WARMUP_BATCH_SIZE_LIST[-1] < max_prefill_batch_size : + PREFILL_WARMUP_BATCH_SIZE_LIST.append(max_prefill_batch_size) - while seq_num * batch_num >= PREFILL_GRAPH_NUM : - PAD_SEQUENCE_TO_MULTIPLE_OF = PAD_SEQUENCE_TO_MULTIPLE_OF * 2 - os.environ['PAD_SEQUENCE_TO_MULTIPLE_OF'] = str(PAD_SEQUENCE_TO_MULTIPLE_OF) - seq_num = (max_input_length - BASE_IMAGE_TOKENS) / PAD_SEQUENCE_TO_MULTIPLE_OF + seq_len = BASE_IMAGE_TOKENS + PREFILL_WARMUP_SEQLEN_LIST = [] + i = 0 + while seq_len <= max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(seq_len) + seq_len += PAD_SEQUENCE_TO_MULTIPLE_OF*(2**i) + i += 1 + if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(max_input_length) - seq_lens_list = numpy.arange(BASE_IMAGE_TOKENS + PAD_SEQUENCE_TO_MULTIPLE_OF, max_input_length + 1, PAD_SEQUENCE_TO_MULTIPLE_OF).tolist() - batch_sizes_list = numpy.arange(PREFILL_BATCH_BUCKET_SIZE, max_batch_size + 1, PREFILL_BATCH_BUCKET_SIZE).tolist() - for seq_len in seq_lens_list : - for batch_size in batch_sizes_list : - batch = self.generate_warmup_batch(request, seq_len, batch_size) - _, prefill_batch, _ = self.generate_token([batch]) - _, decode_batch, _ = self.generate_token([prefill_batch]) + #Prefill and decode warmup + DECODE_WARMUP_BATCH_SIZE_LIST = [] + prefill_batch = None + decode_batch = None + try: + for batch_size in PREFILL_WARMUP_BATCH_SIZE_LIST : + for seq_len in PREFILL_WARMUP_SEQLEN_LIST : + batch = self.generate_warmup_batch(request, seq_len, batch_size, is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + _, decode_batch, _ = self.generate_token([prefill_batch], is_warmup) + + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + + except: + raise RuntimeError( + f"Not enough memory to handle following prefill and decode warmup." 
+ f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing prefill and decode warmup successfully.\n" + f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}\n" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats} " + ) + + self.model.clear_inputs() + max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + batch_size = max_prefill_batch_size * 2 + # Decode warmup with bigger batch_size + try: + if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size and batch_size <= max_decode_batch_size: + batches = [] + for i in range(int(batch_size/max_prefill_batch_size)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], DECODE_WARMUP_BATCH_SIZE_LIST[-1], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + while batch_size <= max_decode_batch_size: + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + batches.clear() + + for i in range(int(batch_size/max_prefill_batch_size)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], DECODE_WARMUP_BATCH_SIZE_LIST[-1], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + + batches.clear() + if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size: + max_decode_batch_size = math.floor( max_decode_batch_size / 2) * 2 + batch_size = max_decode_batch_size + for i in range(int(max_decode_batch_size / 2)) : + batch = self.generate_warmup_batch(request, PREFILL_WARMUP_SEQLEN_LIST[0], 2, is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) + max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS + MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens + except : + raise RuntimeError( + f"Not enough memory to handle batch_size({batch_size}) decode warmup." 
+ f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"max_decode_batch_size is {max_decode_batch_size}" + f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing decode warmup successfully.\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats}" + ) + + self.model.clear_inputs() + return MAX_BATCH_TOTAL_TOKENS \ No newline at end of file diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 0b5e9e03..4cb7fb24 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -97,12 +97,13 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): ) if self.model.batch_type in VLM_BATCH_TYPES : - self.model.warmup(request) + max_supported_total_tokens = self.model.warmup(request) + return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) else: batches = [batch_from_pb(batch) for batch in request.batches] self.model.warmup(batches) + return generate_pb2.WarmupResponse() - return generate_pb2.WarmupResponse() async def Prefill(self, request, context): start = time.time_ns() @@ -171,7 +172,7 @@ def serve( uds_path: Path, ): # Remove default handler - logger.remove() + #logger.remove() logger.add( sys.stdout, format="{message}",