Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-09 19:34:53 +00:00)

fix typos

commit 0cd6ff7a3d (parent 7253be349a)
@@ -36,7 +36,7 @@ class Exl2WeightsLoader(WeightsLoader):

     def get_weights(self, weights: "Weights", prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         try:
             q_weight = weights.get_tensor(f"{prefix}.q_weight")
@@ -598,7 +598,7 @@ def get_loaders(

 def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
     # Skip last lm_head linear
-    # Need isintance Falcon is inheriting Linear.
+    # Need isinstance Falcon is inheriting Linear.
     if isinstance(module, layers) and "lm_head" not in name:
         return {name: module}
     res = {}
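For readers unfamiliar with this helper: the hunk is cut off before the recursive part, but it follows the standard GPTQ-style pattern of walking the module tree and collecting every quantizable layer. Below is a sketch of that pattern under the assumption that the repository's version uses the usual recursion over `named_children` (details may differ):

```python
import torch.nn as nn


def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
    # Skip the final lm_head projection.
    # isinstance is needed because Falcon's linear layers inherit from nn.Linear.
    if isinstance(module, layers) and "lm_head" not in name:
        return {name: module}
    res = {}
    # Recurse into children, building dotted names such as "model.layers.0.self_attn.q_proj".
    for child_name, child in module.named_children():
        child_prefix = f"{name}.{child_name}" if name else child_name
        res.update(find_layers(child, layers=layers, name=child_prefix))
    return res
```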
@@ -221,7 +221,7 @@ class SparseMoELayer(nn.Module):

             log_once(
                 logger.info,
-                "Using MoE layer wih fused gemm",
+                "Using MoE layer with fused gemm",
             )

             self.moe = cls(
@@ -282,7 +282,7 @@ class IdeficsProcessor(ProcessorMixin):

         """

-        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = (
                 self.tokenizer_was_trained_with_end_of_utterance_token
@@ -1518,7 +1518,7 @@ class FlashCausalLM(Model):
             )
             self.bucketing_ctx.num_hpu_blocks = num_blocks
             if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
-                logger.info("skip warmup hpu graph, not recommmended")
+                logger.info("skip warmup hpu graph, not recommended")
                 del _batch, batch
                 return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens

@@ -22,7 +22,7 @@ class WeightsLoader(ABC):
     @abstractmethod
     def get_weights(self, weights: "Weights", prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         ...

@@ -50,7 +50,7 @@ class WeightsLoader(ABC):
     def get_weights_col(self, weights: "Weights", prefix: str):
         """
         Get weights at the given prefix and apply column-splitting for tensor
-        paralllism.
+        parallelism.
         """
         return weights.get_multi_weights_col([prefix], 0)

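For context on the interface these docstring fixes touch: `get_weights` loads a tensor whole, while `get_weights_col` splits it column-wise across tensor-parallel ranks. A minimal sketch of a concrete loader, assuming only the `Weights.get_tensor` and `Weights.get_multi_weights_col` calls visible in the hunks above (the `UnquantizedWeightsLoader` name and the `.weight` suffix are illustrative, not taken from the repository):

```python
class UnquantizedWeightsLoader:
    """Illustrative loader for plain, unquantized checkpoints."""

    def get_weights(self, weights, prefix: str):
        # Whole tensor, no tensor-parallel split.
        return weights.get_tensor(f"{prefix}.weight")

    def get_weights_col(self, weights, prefix: str):
        # Column split along dim 0 of the stored [out_features, in_features]
        # matrix, so each rank keeps a slice of the output features.
        return weights.get_multi_weights_col([prefix], 0)
```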
backends/neuron/tests/fixtures/model.py (vendored)
@@ -118,7 +118,7 @@ def neuron_model_config(request):

     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
+    The hub model artifacts are never cleaned up and persist across sessions.
     They must be cleaned up manually when the optimum-neuron version changes.

     """
@@ -93,7 +93,7 @@ To run FP8 Inference:
 1. Measure statistics using [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8)
 2. Run the model in TGI with QUANT_CONFIG setting - e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.

-The following commmand example for FP8 inference is based on the assumption that measurement is done via the first step above.
+The following command example for FP8 inference is based on the assumption that measurement is done via the first step above.

 Example for Llama3.1-70B on 8 cards with FP8 precision:

@@ -155,7 +155,7 @@ curl -N 127.0.0.1:8080/generate \
     -H 'Content-Type: application/json'
 ```

-> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calcualted as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.
+> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.

 ### How to Benchmark Performance

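A quick check of the `prefill_batch_size` formula quoted in the note above; the 16384 and 2048 figures come from the note itself, while the `max-input-tokens` value of 4096 is only an assumed example:

```python
BASE_IMAGE_TOKENS = 2048          # default image token budget from the note
max_batch_prefill_tokens = 16384  # value quoted for Llava-v1.6-Mistral-7B
max_input_tokens = 4096           # assumed example, must be >= BASE_IMAGE_TOKENS

prefill_batch_size = max_batch_prefill_tokens // max_input_tokens
print(prefill_batch_size)  # 4
```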
@@ -5,13 +5,13 @@
 Performance leap: TGI processes 3x more tokens, 13x faster than vLLM on long prompts. Zero config !

 ### 3x more tokens.
-By reducing our memory footprint, we’re able to ingest many more tokens and more dynamically than before. A single L4 (24GB) can handle 30k tokens on llama 3.1-8B, while vLLM gets barely 10k. A lot of work went into reducing the footprint of the runtime and its effect are best seen on smaller constrained environments.
+By reducing our memory footprint, we’re able to ingest many more tokens and more dynamically than before. A single L4 (24GB) can handle 30k tokens on llama 3.1-8B, while vLLM gets barely 10k. A lot of work went into reducing the footprint of the runtime and its effects are best seen on smaller constrained environments.

 ### 13x faster
 On long prompts (200k+ tokens) conversation replies take 27.5s in vLLM, while it takes only 2s in TGI. How so ? We keep the initial conversation around, so when a new reply comes in, we can answer almost instantly. The overhead of the lookup is ~5us. Thanks @Daniël de Kok for the beast data structure.

 ### Zero config
-That’s it. Remove all the flags your are using and you’re likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give best performance. In production, we don’t have any flags anymore in our deployments. We kept all existing flags around, they may come in handy in niche scenarios.
+That’s it. Remove all the flags you are using and you’re likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give best performance. In production, we don’t have any flags anymore in our deployments. We kept all existing flags around, they may come in handy in niche scenarios.

@@ -37,7 +37,7 @@ For more details on benchmarking in general we recommend the documentation of k6
 We selected a handful of scenarios to simplify the picture, they seem to accurately reflect a larger trend.

 1. **Small scenario**: This scenario consists of the first 200 requests from the orca datasets being prompted to the model. The 200 requests total 8k tokens together and are representative of conversation starters. Prefix caching has very limited impact in that scenario and we feel it's a relatively balanced benchmark for simple use cases.
-2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens which are essentially asking for summaries of large chunks for text. In practical scenarios this is really useful when you are feeding large chunks of code, large chunks of business data or documents repeatedly and ask simple questions about them (summarization, classification, or where to find some data). This scenario is the one closest to what a lot of professional use cases seem to be doing by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most for our recent changes since we are enable ever larger prompts and ever faster caching.
+2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens which are essentially asking for summaries of large chunks for text. In practical scenarios this is really useful when you are feeding large chunks of code, large chunks of business data or documents repeatedly and ask simple questions about them (summarization, classification, or where to find some data). This scenario is the one closest to what a lot of professional use cases seem to be doing by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most for our recent changes since we are enabling even larger prompts and even faster caching.

 ### Hardware

@@ -119,7 +119,7 @@ Our performance gains can be attributed to several key factors:
 While we've made significant progress, there are still opportunities for improvement:

 1. **Special models**: All LLMs come with the aforementioned improvements. Some specific set of features might not (some quantizations, speculation or VLMs for instance are harder to optimize for with the same level of detail).
-2. **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisionned like shared KV-cache (like redis or memcached) solutions or innovative storage approaches. It is an area of ongoing research of ours.
+2. **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisioned like shared KV-cache (like redis or memcached) solutions or innovative storage approaches. It is an area of ongoing research of ours.
 3. **Multimodal models**: We are also investigating quite a lot other kind of models, like audio-to-audio, image/video generation, and other hybrids, where we see a lot of potential of applying the same principles we've applied in TGI to maximize performance.

 By sharing our benchmarking methodology, results, and technical insights, we aim to contribute to the ongoing development of more efficient and effective LLMs.
@@ -2,7 +2,7 @@

 ## What is LoRA?

-LoRA is a technique that allows for efficent fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.
+LoRA is a technique that allows for efficient fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.

 LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.

@@ -18,13 +18,13 @@ Technically, LoRA can be used to fine-tune a large language model on a small dat

 ## Optimizing Inference with LoRA

-LoRA's can be used during inference by mutliplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with mulitple LoRA models.
+LoRA's can be used during inference by multiplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with multiple LoRA models.

 ## Serving multiple LoRA adapters with TGI

 Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.

-In practice its often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.
+In practice it's often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.

 Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.

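To make the multi-adapter flow described in this hunk concrete, here is a rough sketch of selecting an adapter per request once several have been loaded at startup. The endpoint, adapter name, and exact parameter shape are assumptions and should be checked against the TGI version in use:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "Summarize: LoRA adds small trainable matrices on top of frozen weights.",
        "parameters": {
            "max_new_tokens": 64,
            # Pick one of the adapters loaded at startup; the id below is a placeholder.
            "adapter_id": "my-org/my-lora-adapter",
        },
    },
    timeout=60,
)
print(response.json())
```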
@@ -138,7 +138,7 @@ Options:
 ## MAX_TOP_N_TOKENS
 ```shell
       --max-top-n-tokens <MAX_TOP_N_TOKENS>
-          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking
+          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking

           [env: MAX_TOP_N_TOKENS=]
           [default: 5]

@@ -238,7 +238,7 @@ def neuron_model_config(request):

     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
+    The hub model artifacts are never cleaned up and persist across sessions.
     They must be cleaned up manually when the optimum-neuron version changes.

     """
File diff suppressed because one or more lines are too long
@@ -663,7 +663,7 @@ struct Args {
     max_stop_sequences: usize,

     /// This is the maximum allowed value for clients to set `top_n_tokens`.
-    /// `top_n_tokens` is used to return information about the the `n` most likely
+    /// `top_n_tokens` is used to return information about the `n` most likely
     /// tokens at each generation step, instead of just the sampled token. This
     /// information can be used for downstream tasks like for classification or
     /// ranking.
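For illustration, the `top_n_tokens` parameter documented in the two hunks above is set per request and capped by the server's `--max-top-n-tokens`. A hedged sketch of such a request follows; the endpoint and field layout are assumptions to verify against your TGI version:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {
            "max_new_tokens": 1,
            # Ask for generation details so per-step token information is returned.
            "details": True,
            # Must not exceed the server-side --max-top-n-tokens (default 5 above).
            "top_n_tokens": 3,
        },
    },
    timeout=60,
)
# Each generation step should report the sampled token plus the 3 most likely alternatives.
print(response.json())
```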
@@ -152,7 +152,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
     auto query_scaled = query_view * inv_norm_factor;
     auto attention_scores = at::bmm(query_scaled, key_view);

-    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
+    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype`
     at::Tensor attention_probs;
     if (true) {
         // TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
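The comment corrected in this hunk names a fused `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype` step. As a point of reference only, an eager PyTorch equivalent of that sequence (a sketch, not the kernel's actual code path) looks roughly like this:

```python
import torch


def masked_softmax(attention_scores: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    initial_dtype = attention_scores.dtype
    # Optionally upcast fp16/bf16 scores to fp32 for a numerically stable softmax.
    scores = attention_scores.to(torch.float32)
    # Positions where the boolean mask is True are excluded from attention.
    scores = scores.masked_fill(attention_mask, torch.finfo(torch.float32).min)
    probs = torch.softmax(scores, dim=-1)
    # Cast back to the initial dtype before the value matmul.
    return probs.to(initial_dtype)
```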
@@ -182,7 +182,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
     */

     /*
-     * We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
+     * We should split [batch_size_times_num_heads_block, q_length] in separate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
      * with multiple threads as we need to `sync_threads` to run exponential sum.
      * We maximise the usage of threads within a single block
      */
@@ -150,7 +150,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa

     auto attention_scores = alibi.baddbmm(query_layer, key_layer, beta, inv_norm_factor);

-    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
+    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype`
     at::Tensor attention_probs;
     if (true) {
         const auto kv_length = key_layer.size(2);
@@ -182,7 +182,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
     */

     /*
-     * We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
+     * We should split [batch_size_times_num_heads_block, q_length] in separate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
      * with multiple threads as we need to `sync_threads` to run exponential sum.
      * We maximise the usage of threads within a single block
      */
@@ -34,7 +34,7 @@ class WNA16Int24Loader(WeightsLoader):

     def get_weights(self, weights: Weights, prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         weight_packed = weights.get_tensor(f"{prefix}.weight_packed")
         meta = weights.get_tensor(f"{prefix}.meta")
@@ -36,7 +36,7 @@ class Exl2WeightsLoader(WeightsLoader):

     def get_weights(self, weights: "Weights", prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         try:
             q_weight = weights.get_tensor(f"{prefix}.q_weight")
@@ -598,7 +598,7 @@ def get_loaders(

 def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
     # Skip last lm_head linear
-    # Need isintance Falcon is inheriting Linear.
+    # Need isinstance Falcon is inheriting Linear.
     if isinstance(module, layers) and "lm_head" not in name:
         return {name: module}
     res = {}
@@ -26,7 +26,7 @@ class MarlinWeightsLoader(WeightsLoader):

     def get_weights(self, weights: "Weights", prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
         if is_marlin_24:
@@ -244,7 +244,7 @@ class SparseMoELayer(nn.Module):

             log_once(
                 logger.info,
-                "Using MoE layer wih fused gemm",
+                "Using MoE layer with fused gemm",
             )

             self.moe = cls(
@@ -295,7 +295,7 @@ class Gemma3Config(PretrainedConfig):
         else:
             vision_config = SiglipVisionConfig()
             logger.info(
-                "vision_config is None or incompatible with Gemma3VisionConfig intialization. Gemma3 will be limited "
+                "vision_config is None or incompatible with Gemma3VisionConfig initialization. Gemma3 will be limited "
                 "to text tasks."
             )

@@ -282,7 +282,7 @@ class IdeficsProcessor(ProcessorMixin):

         """

-        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = (
                 self.tokenizer_was_trained_with_end_of_utterance_token
@@ -24,7 +24,7 @@ class WeightsLoader(ABC):
     @abstractmethod
     def get_weights(self, weights: "Weights", prefix: str):
         """
-        Get weights at the given prefix and apply without tensor paralllism.
+        Get weights at the given prefix and apply without tensor parallelism.
         """
         ...

@@ -52,7 +52,7 @@ class WeightsLoader(ABC):
     def get_weights_col(self, weights: "Weights", prefix: str):
         """
         Get weights at the given prefix and apply column-splitting for tensor
-        paralllism.
+        parallelism.
         """
         return weights.get_multi_weights_col([prefix], 0)
