From 7d8e5fb2845fb818f9244a70c4bb3aeaf190b65d Mon Sep 17 00:00:00 2001
From: Omar Sanseviero
Date: Thu, 31 Aug 2023 20:00:12 +0200
Subject: [PATCH 01/11] Update version in docs (#957)

---
 README.md                | 5 +++--
 docs/openapi.json        | 2 +-
 docs/source/quicktour.md | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 739e656b..fd2afd0d 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ to power Hugging Chat, the Inference API and Inference Endpoint.
 - [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
 - [MPT](https://huggingface.co/mosaicml/mpt-30b)
 - [Llama V2](https://huggingface.co/meta-llama)
+- [Code Llama](https://huggingface.co/codellama)
 
 Other architectures are supported on a best effort basis using:
 
@@ -86,7 +87,7 @@ The easiest way of getting started is using the official Docker container:
 model=tiiuae/falcon-7b-instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
 ```
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
 
@@ -153,7 +154,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
 ```
 
 ### A note on Shared Memory (shm)
diff --git a/docs/openapi.json b/docs/openapi.json
index 5974c58d..23c4f198 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.0.2"
+    "version": "1.0.3"
   },
   "paths": {
     "/": {
diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md
index c085943c..b91e77cb 100644
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -8,7 +8,7 @@ Let's say you want to deploy [Falcon-7B Instruct](https://huggingface.co/tiiuae/
 model=tiiuae/falcon-7b-instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.3 --model-id $model
 ```
 
@@ -85,7 +85,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:1.0.2 --help
+docker run ghcr.io/huggingface/text-generation-inference:1.0.3 --help
 ```

From 8a5f5649429f8df0ae5b86e485879c3d21d255f6 Mon Sep 17 00:00:00 2001
From: Vincent Brouwers
Date: Thu, 31 Aug 2023 21:15:14 +0200
Subject: [PATCH 02/11] Fix Falcon weight mapping for H2O.ai checkpoints (#953)

# What does this PR do?

During the safetensor conversion, duplicate weights are removed. However, which of the duplicates gets removed differs per checkpoint. In some, like `h2oai/h2ogpt-oig-oasst1-falcon-40b`, the weight `transformer.word_embeddings.weight` gets removed. In others, `lm_head.weight` gets removed.

Long story long, we need to support both.

Originally, f018143 mapped `lm_head` to `word_embeddings`. Then ac736fd switched this around. This commit merges them and allows for both.

## Before submitting
- [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

@Narsil, you wrote both commits I referenced in this PR. I think you'll understand this change :)
---
 server/text_generation_server/models/flash_rw.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py
index 2fc7c53d..195b3883 100644
--- a/server/text_generation_server/models/flash_rw.py
+++ b/server/text_generation_server/models/flash_rw.py
@@ -54,7 +54,10 @@ class FlashRWSharded(FlashCausalLM):
             device,
             dtype,
             process_group=self.process_group,
-            aliases={"lm_head.weight": ["transformer.word_embeddings.weight"]},
+            aliases={
+                "lm_head.weight": ["transformer.word_embeddings.weight"],
+                "transformer.word_embeddings.weight": ["lm_head.weight"],
+            },
         )
 
         config.quantize = quantize

From 4f5d93ecd0c5ba5de0ead54d8934f163550d27d2 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 1 Sep 2023 00:22:03 +0200
Subject: [PATCH 03/11] Fixing top_k tokens when k ends up < 0 (#966)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 server/text_generation_server/utils/tokens.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py
index 69177d56..7b003f1d 100644
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@@ -363,9 +363,10 @@ def batch_top_tokens(
     # Find the new "fuzzy" top n values
     top_n_indices = (logprobs >= nth_highest).nonzero()
     _, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True)
-
+
+    k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max()
     # Take a new topk for these new max n values
-    top_k = torch.topk(logprobs, k=top_n_ishes.max(), dim=1, sorted=True)
+    top_k = torch.topk(logprobs, k=k, dim=1, sorted=True)
     top_n_ishes = top_n_ishes.tolist()
     top_indices = top_k.indices.tolist()
 

From 2bc287bfcd54bfb75833ab73d170afaf552053ba Mon Sep 17 00:00:00 2001
From: Victor SANH
Date: Fri, 1 Sep 2023 12:44:34 -0400
Subject: [PATCH 04/11] small fix on idefics (#954)

transposing the fixes from https://github.com/huggingface/transformers/pull/25787
---
 .../models/custom_modeling/idefics_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/idefics_config.py b/server/text_generation_server/models/custom_modeling/idefics_config.py
index 34925087..0bdb2e3d 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_config.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_config.py
@@ -51,7 +51,7 @@ class IdeficsVisionConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         image_num_channels (`int`, *optional*, defaults to `3`):
             Number of image channels.
-        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
         layer_norm_eps (`float`, *optional*, defaults to 1e-5):
@@ -80,7 +80,7 @@
         num_hidden_layers=32,
         num_attention_heads=16,
         num_channels=3,
-        hidden_act="quick_gelu",
+        hidden_act="gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,

From 033230ae667101d2d8d8bcd4952442fa348ef951 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 4 Sep 2023 15:00:19 +0200
Subject: [PATCH 05/11] Backport https://github.com/vllm-project/vllm/pull/936 (#977)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 server/Makefile-vllm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index af750733..96bfc108 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,4 +1,4 @@
-vllm_commit := d284b831c17f42a8ea63369a06138325f73c4cf9
+vllm_commit := e86af624d059969b0fb07b075b1d338bf10c3365
 
 vllm:
 	# Clone vllm
@@ -10,4 +10,4 @@ build-vllm: vllm
 
 install-vllm: build-vllm
 	pip uninstall vllm -y || true
-	cd vllm && python setup.py install
\ No newline at end of file
+	cd vllm && python setup.py install

From c8bbbd812900ea52bab9b950f846ed2d975d9e78 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra
Date: Wed, 6 Sep 2023 05:12:08 -0700
Subject: [PATCH 06/11] chore(client): Support Pydantic 2 (#900)

This should allow users to use either Pydantic 2 or Pydantic 1. I couldn't run all tests locally because I reran them too often and got rate limited, but I believe this is sufficient.
---
 clients/python/pyproject.toml           |  2 +-
 clients/python/text_generation/types.py | 30 ++++++++++++-------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index a52bdd81..915ac7aa 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -12,7 +12,7 @@ repository = "https://github.com/huggingface/text-generation-inference"
 
 [tool.poetry.dependencies]
 python = "^3.7"
-pydantic = "^1.10"
+pydantic = "> 1.10, < 3"
 aiohttp = "^3.8"
 huggingface-hub = ">= 0.12, < 1.0"
 
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 38f75253..20083b19 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -18,21 +18,21 @@ class Parameters(BaseModel):
     # Stop generating tokens if a member of `stop_sequences` is generated
     stop: List[str] = []
     # Random sampling seed
-    seed: Optional[int]
+    seed: Optional[int] = None
     # The value used to module the logits distribution.
-    temperature: Optional[float]
+    temperature: Optional[float] = None
     # The number of highest probability vocabulary tokens to keep for top-k-filtering.
-    top_k: Optional[int]
+    top_k: Optional[int] = None
     # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
     # higher are kept for generation.
-    top_p: Optional[float]
+    top_p: Optional[float] = None
     # truncate inputs tokens to the given size
-    truncate: Optional[int]
+    truncate: Optional[int] = None
     # Typical Decoding mass
     # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
-    typical_p: Optional[float]
+    typical_p: Optional[float] = None
     # Generate best_of sequences and return the one if the highest token logprobs
-    best_of: Optional[int]
+    best_of: Optional[int] = None
     # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
     watermark: bool = False
     # Get generation details
@@ -114,7 +114,7 @@ class Request(BaseModel):
     # Prompt
     inputs: str
     # Generation parameters
-    parameters: Optional[Parameters]
+    parameters: Optional[Parameters] = None
     # Whether to stream output tokens
     stream: bool = False
 
@@ -145,7 +145,7 @@ class InputToken(BaseModel):
     text: str
     # Logprob
     # Optional since the logprob of the first token cannot be computed
-    logprob: Optional[float]
+    logprob: Optional[float] = None
 
 
 # Generated tokens
@@ -180,7 +180,7 @@ class BestOfSequence(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
     # Decoder input tokens, empty if decoder_input_details is False
     prefill: List[InputToken]
     # Generated tokens
@@ -196,7 +196,7 @@ class Details(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
     # Decoder input tokens, empty if decoder_input_details is False
     prefill: List[InputToken]
     # Generated tokens
@@ -204,7 +204,7 @@ class Details(BaseModel):
     # Most likely tokens
     top_tokens: Optional[List[List[Token]]]
     # Additional sequences when using the `best_of` parameter
-    best_of_sequences: Optional[List[BestOfSequence]]
+    best_of_sequences: Optional[List[BestOfSequence]] = None
 
 
 # `generate` return value
@@ -222,7 +222,7 @@ class StreamDetails(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
 
 
 # `generate_stream` return value
@@ -233,10 +233,10 @@ class StreamResponse(BaseModel):
     top_tokens: Optional[List[Token]]
     # Complete generated text
     # Only available when the generation is finished
-    generated_text: Optional[str]
+    generated_text: Optional[str] = None
     # Generation details
     # Only available when the generation is finished
-    details: Optional[StreamDetails]
+    details: Optional[StreamDetails] = None
 
 
 # Inference API currently deployed model

From 3ed4c0f33fee281fbdc276e208574e22821818d9 Mon Sep 17 00:00:00 2001
From: Julien Bouquillon
Date: Wed, 6 Sep 2023 14:57:59 +0200
Subject: [PATCH 07/11] docs: typo in streaming.js (#971)

Looks like an error
---
 docs/source/conceptual/streaming.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/conceptual/streaming.md b/docs/source/conceptual/streaming.md
index c20d76e0..054fde30 100644
--- a/docs/source/conceptual/streaming.md
+++ b/docs/source/conceptual/streaming.md
@@ -121,9 +121,9 @@ If you're using the free Inference API, you can use `HfInference`. If you're usi
 We can create a `HfInferenceEndpoint` providing our endpoint URL and credential.
 
 ```js
-import { HfInference } from '@huggingface/inference'
+import { HfInferenceEndpoint } from '@huggingface/inference'
 
-const hf = new HfInference('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')
+const hf = new HfInferenceEndpoint('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')
 
 // prompt
 const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips'

From 211e7b7e3503c7e388c2cadb10132d53b1deb8b0 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 6 Sep 2023 15:01:00 +0200
Subject: [PATCH 08/11] Disabling exllama on old compute. (#986)

# What does this PR do?

Disabling exllama on old compute. Exllama + T4 don't play nice together; this will disable it right away to avoid issues at runtime.

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 server/text_generation_server/utils/layers.py  | 17 ++++++++++++-----
 server/text_generation_server/utils/weights.py |  4 ++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 745c1d2e..6be54048 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -18,13 +18,20 @@ from accelerate import init_empty_weights
 
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
 
-HAS_EXLLAMA = True
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8
 if os.getenv("DISABLE_EXLLAMA") == "True":
     HAS_EXLLAMA = False
-try:
-    from text_generation_server.utils.gptq.exllama import Ex4bitLinear
-except ImportError:
-    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        from text_generation_server.utils.gptq.exllama import Ex4bitLinear
+        HAS_EXLLAMA = True
+    except ImportError:
+        pass
 
 from typing import Optional
 
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index ef662ce1..261456bd 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -170,10 +170,10 @@ class Weights:
                 "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
             )
 
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.utils.layers import HAS_EXLLAMA, CAN_EXLLAMA
 
         if use_exllama:
-            if not HAS_EXLLAMA:
+            if not HAS_EXLLAMA and CAN_EXLLAMA:
                 logger.warning(
                     "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
                 )

From 059bb5cf832221e3480eace71485c3b85dc8876d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E4=BD=B3=E6=AC=A3?=
Date: Wed, 6 Sep 2023 21:20:32 +0800
Subject: [PATCH 09/11] chore: sync text-generation version from 0.3.0 to 0.6.0 with pyproject.toml (#950)

# What does this PR do?

sync the version for text-generation.
---
 clients/python/text_generation/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py
index 46109833..5ab10fdb 100644
--- a/clients/python/text_generation/__init__.py
+++ b/clients/python/text_generation/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.3.0"
+__version__ = "0.6.0"
 
 from text_generation.client import Client, AsyncClient
 from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient

From f260eb72f911bc30ae5d26020b6e14a774e5a168 Mon Sep 17 00:00:00 2001
From: Merve Noyan
Date: Wed, 6 Sep 2023 16:36:49 +0300
Subject: [PATCH 10/11] docs: Flash Attention Conceptual Guide (#892)

PR for conceptual guide on flash attention. I will add more info unless I'm told otherwise.

---------

Co-authored-by: Nicolas Patry
Co-authored-by: Omar Sanseviero
---
 docs/source/_toctree.yml                  |  2 ++
 docs/source/conceptual/flash_attention.md | 12 ++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 docs/source/conceptual/flash_attention.md

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 5ba470bd..6a8baaf6 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -21,4 +21,6 @@
 - sections:
   - local: conceptual/streaming
     title: Streaming
+  - local: conceptual/flash_attention
+    title: Flash Attention
   title: Conceptual Guides
diff --git a/docs/source/conceptual/flash_attention.md b/docs/source/conceptual/flash_attention.md
new file mode 100644
index 00000000..1f3a6293
--- /dev/null
+++ b/docs/source/conceptual/flash_attention.md
@@ -0,0 +1,12 @@
+# Flash Attention
+
+Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.
+
+Standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in memory, but slow in processing, meanwhile SRAM is smaller in memory, but faster in operations. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high. It loads keys, queries, and values from HBM to GPU on-chip SRAM, performs a single step of the attention mechanism, writes it back to HBM, and repeats this for every single attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.
+
+![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)
+
+It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models), for models with flash prefix.
+
+You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).
+

From a9fdfb24643c64a17cdb90560988f131d1dc0863 Mon Sep 17 00:00:00 2001
From: Omar Sanseviero
Date: Wed, 6 Sep 2023 18:42:42 +0200
Subject: [PATCH 11/11] docs: Remove redundant content from stream guide (#884)

Co-authored-by: OlivierDehaene
---
 docs/source/conceptual/streaming.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/source/conceptual/streaming.md b/docs/source/conceptual/streaming.md
index 054fde30..b7e75c5f 100644
--- a/docs/source/conceptual/streaming.md
+++ b/docs/source/conceptual/streaming.md
@@ -143,6 +143,4 @@ SSEs are different than:
 * Polling: where the client keeps calling the server to get data. This means that the server might return empty responses and cause overhead.
 * Webhooks: where there is a bi-directional connection. The server can send information to the client, but the client can also send data to the server after the first request. Webhooks are more complex to operate as they don’t only use HTTP.
 
-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns a HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g. it could display a busy error to the user or it could retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing to handle backpressure.
-
-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
+If there are too many requests at the same time, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
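
The client-side handling that the paragraph above asks for can be sketched in a few lines. The example below is illustrative only: it assumes the `text_generation` Python client that ships in `clients/python` of this repository, whose `errors.OverloadedError` is raised when TGI sheds load at `--max_concurrent_requests`; the exponential-backoff policy and its parameters are arbitrary choices, not part of TGI.

```python
# Minimal sketch of client-side backpressure handling, assuming the
# `text_generation` client from clients/python in this repository.
# The retry/backoff policy here is illustrative only.
import time

from text_generation import Client
from text_generation.errors import OverloadedError

client = Client("http://127.0.0.1:8080")


def generate_with_retry(prompt: str, retries: int = 5, base_delay: float = 0.5) -> str:
    for attempt in range(retries):
        try:
            return client.generate(prompt, max_new_tokens=64).generated_text
        except OverloadedError:
            # The server hit --max_concurrent_requests; back off and retry.
            time.sleep(base_delay * 2**attempt)
    raise RuntimeError("Server remained overloaded after all retries")


print(generate_with_retry("What can you do in Nuremberg, Germany?"))
```

In practice a caller would also cap the total wait time and surface a busy indicator to the user rather than blocking indefinitely.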