/// Payload validation logic
use crate::config::Config;
use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
use crate::{
    GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
    TokenizerTrait,
};
use crate::{PyTokenizer, Tokenizer};
use base64::{engine::general_purpose::STANDARD, Engine};
use image::{ImageFormat, ImageReader};
use jsonschema::{Draft, JSONSchema};
use rand::{thread_rng, Rng};
use serde_json::Value;
use std::io::Cursor;
use std::iter;
use std::sync::Arc;
use thiserror::Error;
use tokio::sync::mpsc;
use tokio::sync::oneshot;
use tracing::{instrument, Span};
use {once_cell::sync::Lazy, regex::Regex};

/// Validation
#[derive(Debug, Clone)]
pub struct Validation {
    /// Validation parameters
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_length: usize,
    max_total_tokens: usize,
    disable_grammar_support: bool,
    /// Channel to communicate with the background tokenization task
    sender: mpsc::UnboundedSender<TokenizerRequest>,
}

impl Validation {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        workers: usize,
        tokenizer: Tokenizer,
        config: Option<Config>,
        preprocessor_config: Option<HubPreprocessorConfig>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_length: usize,
        max_total_tokens: usize,
        disable_grammar_support: bool,
    ) -> Self {
        let workers = if let Tokenizer::Python { .. } = &tokenizer {
            1
        } else {
            workers
        };
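        // NOTE: sketch of the rationale (an assumption, not stated in this file): the
        // Python tokenizer runs behind pyo3 and the GIL, so spreading it over several
        // blocking workers would add no parallelism; a single worker is used instead.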
        // If we have a fast tokenizer
        let sender = {
            // Create round robin channel
            let (validation_sender, validation_round_robin_receiver) = mpsc::unbounded_channel();
            let mut senders = Vec::with_capacity(workers);

            // Create workers
            for _ in 0..workers {
                let tokenizer_clone = tokenizer.clone();
                let config_clone = config.clone();
                let preprocessor_config_clone = preprocessor_config.clone();
                let (tokenizer_sender, tokenizer_receiver) = mpsc::unbounded_channel();
                senders.push(tokenizer_sender);

                // Spawn worker
                tokio::task::spawn_blocking(move || {
                    tokenizer_worker(
                        tokenizer_clone,
                        config_clone,
                        preprocessor_config_clone,
                        tokenizer_receiver,
                    )
                });
            }

            // Create tokenization round robin task
            tokio::spawn(round_robin_task(validation_round_robin_receiver, senders));

            validation_sender
        };

        Self {
            max_best_of,
            sender,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
        }
    }

    #[instrument(skip(self, inputs))]
    pub async fn tokenize(
        &self,
        inputs: String,
        add_special_tokens: bool,
        truncate: Option<usize>,
    ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
        // If we have a fast tokenizer
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send request to the background validation task
        // Unwrap is safe here
        let _ = &self
            .sender
            .send((
                (inputs, add_special_tokens, truncate),
                response_sender,
                Span::current(),
            ))
            .unwrap();

        // Await on response channel
        // Unwrap is safe here
        let encoding = response_receiver.await.unwrap()?;
        Ok(encoding)
    }
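
    // Hypothetical usage sketch (not part of this file's API surface): assuming a
    // `validation: Validation` built elsewhere, a caller such as the `/tokenize`
    // route can recover token ids and byte offsets from the returned
    // `tokenizers::Encoding`:
    //
    //     let (encoding, _chunks) = validation
    //         .tokenize("Hello world".to_string(), true, None)
    //         .await?;
    //     for (id, (start, end)) in encoding
    //         .get_ids()
    //         .iter()
    //         .zip(encoding.get_offsets().iter())
    //     {
    //         println!("token {id} covers input[{start}..{end}]");
    //     }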

    #[allow(clippy::type_complexity)]
    #[instrument(skip(self, inputs))]
    async fn validate_input(
        &self,
        inputs: String,
        add_special_tokens: bool,
        truncate: Option<usize>,
        max_new_tokens: Option<u32>,
    ) -> Result<(Vec<Chunk>, Option<Vec<u32>>, usize, u32), ValidationError> {
        // If we have a fast tokenizer
        let (encoding, inputs) = self
            .tokenize(inputs.clone(), add_special_tokens, truncate)
            .await?;
        // Create response channel
        let input_length = if let Some(truncate) = truncate {
            std::cmp::min(encoding.len(), truncate)
        } else {
            encoding.len()
        };

        // Get total tokens
        let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
            max_new_tokens
        } else {
            self.max_total_tokens.saturating_sub(input_length) as u32
        };
        let total_tokens = input_length + max_new_tokens as usize;

        // Validate MaxTotalTokens
        if total_tokens > self.max_total_tokens {
            return Err(ValidationError::MaxTotalTokens(
                self.max_total_tokens,
                input_length,
                max_new_tokens,
            ));
        }

        // Validate InputLength
        if input_length > self.max_input_length {
            return Err(ValidationError::InputLength(
                self.max_input_length,
                input_length,
            ));
        }

        let ids = encoding.get_ids();
        let input_ids = ids[ids.len().saturating_sub(input_length)..].to_owned();

        metrics::histogram!("tgi_request_input_length").record(input_length as f64);
        Ok((inputs, Some(input_ids), input_length, max_new_tokens))
    }
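
    // Worked example of the token budget check above (illustrative numbers): with
    // `max_total_tokens = 2048` and a prompt that tokenizes to 1800 tokens, an unset
    // `max_new_tokens` defaults to 2048 - 1800 = 248, so
    // `total_tokens = 1800 + 248 = 2048` and the request is accepted; explicitly
    // asking for `max_new_tokens = 512` would give 1800 + 512 = 2312 > 2048 and fail
    // with `ValidationError::MaxTotalTokens`.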

    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
    pub(crate) async fn validate(
        &self,
        request: GenerateRequest,
    ) -> Result<ValidGenerateRequest, ValidationError> {
        let GenerateParameters {
            best_of,
            temperature,
            repetition_penalty,
            frequency_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            max_new_tokens,
            stop: stop_sequences,
            truncate,
            seed,
            watermark,
            decoder_input_details,
            top_n_tokens,
            grammar,
            adapter_id,
            ..
        } = request.parameters;

        // sampling must be true when best_of > 1
        let best_of = best_of.unwrap_or(1);
        let sampling = do_sample
            || temperature.is_some()
            || top_k.is_some()
            || top_p.is_some()
            || typical_p.is_some();

        if best_of > 1 && !sampling {
            return Err(BestOfSampling);
        }

        let temperature = temperature.unwrap_or(1.0);
        if temperature <= 0.0 {
            return Err(ValidationError::Temperature);
        }

        let repetition_penalty = repetition_penalty.unwrap_or(1.0);
        if repetition_penalty <= 0.0 {
            return Err(ValidationError::RepetitionPenalty);
        }

        let frequency_penalty = frequency_penalty.unwrap_or(0.0);
        if !(-2.0..=2.0).contains(&frequency_penalty) {
            return Err(ValidationError::FrequencyPenalty);
        }

        // Different because the proto default value is not a valid value
        // for the user
        let top_p = top_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TopP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let typical_p = typical_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TypicalP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let top_k: u32 = top_k
            .map(|value| {
                if value <= 0 {
                    return Err(ValidationError::TopK);
                }
                Ok(value as u32)
            })
            .unwrap_or(Ok(0))?;

        if max_new_tokens == Some(0) {
            return Err(ValidationError::NegativeMaxNewTokens);
        }

        if stop_sequences.len() > self.max_stop_sequences {
            return Err(ValidationError::StopSequence(
                self.max_stop_sequences,
                stop_sequences.len(),
            ));
        }

        // If seed is None, assign a random one
        let seed = match seed {
            None => thread_rng().gen(),
            Some(seed) => {
                if best_of > 1 {
                    return Err(BestOfSeed);
                }
                seed
            }
        };

        let top_n_tokens = top_n_tokens
            .map(|value| {
                if value > self.max_top_n_tokens {
                    return Err(ValidationError::TopNTokens(self.max_top_n_tokens, value));
                }
                Ok(value)
            })
            .unwrap_or(Ok(0))?;

        // Check if inputs is empty
        if request.inputs.is_empty() {
            return Err(EmptyInput);
        }

        // Check if truncate is strictly positive and less than max_input_length
        let truncate = truncate
            .map(|value| {
                if value == 0 || value > self.max_input_length {
                    return Err(ValidationError::Truncate(self.max_input_length, value));
                }
                Ok(Some(value))
            })
            .unwrap_or(Ok(None))?;

        // Validate inputs
        let (inputs, input_ids, input_length, max_new_tokens) = self
            .validate_input(
                request.inputs,
                request.add_special_tokens,
                truncate,
                max_new_tokens,
            )
            .await?;

        // TODO: we should build the FSM here and pass the compiled FSM instead of the grammar
        // NOTE: this is currently difficult because we need the tokenizer in Python to build
        // the FSM and we'd have to load a copy of the tokenizer into our Pyo3 instance which
        // may be slow and memory intensive. Best case is to have a Rust implementation of the FSM
        // compiler and use that to build the FSM here.

        // Validate grammar and unpack the grammar and type for the proto message
        let grammar = match grammar {
            Some(grammar) => {
                // Ensure that grammar is not set if it's not supported
                if self.disable_grammar_support {
                    return Err(ValidationError::Grammar);
                }
                let valid_grammar = match grammar {
                    GrammarType::Json(json) => {
                        let json = match json {
                            // if value is a string, we need to parse it again to make sure it's
                            // a valid json
                            Value::String(s) => serde_json::from_str(&s)
                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string())),
                            Value::Object(_) => Ok(json),
                            _ => Err(ValidationError::Grammar),
                        }?;

                        // Check if the json is a valid JSONSchema
                        JSONSchema::options()
                            .with_draft(Draft::Draft202012)
                            .compile(&json)
                            .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?;

                        // The schema can be valid but lack properties.
                        // We need properties for the grammar to be successfully parsed in Python.
                        // Therefore, we must check and throw an error if properties are missing.
                        json.get("properties")
                            .ok_or(ValidationError::InvalidGrammar(
                                "Grammar must have a 'properties' field".to_string(),
                            ))?;

                        // Serialize json to string
                        ValidGrammar::Json(
                            serde_json::to_string(&json)
                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?,
                        )
                    }
                    GrammarType::Regex(regex) => ValidGrammar::Regex(regex),
                };
                Some(valid_grammar)
            }
            None => None,
        };
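
        // Illustrative example (values are made up, not mandated by the API): a JSON
        // grammar accepted by the checks above must deserialize to an object, compile
        // as a Draft 2020-12 JSON Schema, and carry a top-level "properties" field,
        // e.g. `{"type": "object", "properties": {"name": {"type": "string"}}}`.
        // A bare `{"type": "string"}` schema compiles but is rejected with
        // `InvalidGrammar` because it has no "properties".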

        let parameters = ValidParameters {
            temperature,
            repetition_penalty,
            frequency_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            seed,
            watermark,
            grammar,
        };
        let stopping_parameters = ValidStoppingParameters {
            max_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
        };

        metrics::histogram!("tgi_request_max_new_tokens").record(max_new_tokens as f64);

        Ok(ValidGenerateRequest {
            inputs,
            input_ids: input_ids.map(Arc::new),
            add_special_tokens: request.add_special_tokens,
            decoder_input_details,
            input_length: input_length as u32,
            truncate: truncate.unwrap_or(self.max_input_length) as u32,
            parameters,
            stopping_parameters,
            top_n_tokens,
            adapter_id,
        })
    }

    /// Validate the best_of parameter
    #[instrument(skip_all)]
    pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> {
        if self.max_best_of == 1 && best_of != 1 {
            return Err(ValidationError::BestOfDisabled);
        }

        if best_of > self.max_best_of {
            return Err(ValidationError::BestOf(self.max_best_of, best_of));
        }

        Ok(best_of)
    }
}
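
// Hypothetical construction sketch (all limits are illustrative, and `tokenizer`,
// `config` and `preprocessor_config` are assumed to be loaded elsewhere):
//
//     let validation = Validation::new(
//         2,     // tokenization workers
//         tokenizer,
//         config,
//         preprocessor_config,
//         2,     // max_best_of
//         4,     // max_stop_sequences
//         5,     // max_top_n_tokens
//         1024,  // max_input_length
//         2048,  // max_total_tokens
//         false, // disable_grammar_support
//     );
//     let valid_request = validation.validate(generate_request).await?;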

/// Round robin tokenization task
async fn round_robin_task(
    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
    senders: Vec<mpsc::UnboundedSender<TokenizerRequest>>,
) {
    loop {
        for sender in &senders {
            match receiver.recv().await {
                None => return,
                Some(request) => sender.send(request).unwrap(),
            };
        }
    }
}
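
// Each `TokenizerRequest` bundles the `(inputs, add_special_tokens, truncate)` triple
// with a oneshot response sender and the caller's tracing `Span` (see the
// destructuring in `tokenizer_worker` below). Requests are dealt to workers in strict
// rotation, so a slow tokenization only holds up the requests already queued on that
// particular worker.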

/// Start tokenization workers
fn tokenizer_worker(
    tokenizer: Tokenizer,
    config: Option<Config>,
    preprocessor_config: Option<HubPreprocessorConfig>,
    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
) {
    match tokenizer {
        Tokenizer::Python {
            tokenizer_name,
            revision,
        } => {
            pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
                let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?;
                // Loop over requests
                while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                    receiver.blocking_recv()
                {
                    parent_span.in_scope(|| {
                        response_tx
                            .send(prepare_input(
                                inputs,
                                truncate,
                                add_special_tokens,
                                &tokenizer,
                                config.as_ref(),
                                preprocessor_config.as_ref(),
                            ))
                            .unwrap_or(())
                    })
                }
                Ok(())
            })
            .expect("Failure in python tokenizer worker");
        }
        Tokenizer::Rust(tokenizer) => {
            while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                receiver.blocking_recv()
            {
                parent_span.in_scope(|| {
                    response_tx
                        .send(prepare_input(
                            inputs,
                            truncate,
                            add_special_tokens,
                            &tokenizer,
                            config.as_ref(),
                            preprocessor_config.as_ref(),
                        ))
                        .unwrap_or(())
                })
            }
        }
    }
}

fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
    match mimetype {
        "image/png" => Some(ImageFormat::Png),
        "image/jpeg" => Some(ImageFormat::Jpeg),
        "image/jpg" => Some(ImageFormat::Jpeg),
        "image/gif" => Some(ImageFormat::Gif),
        "image/webp" => Some(ImageFormat::WebP),
        "image/tiff" => Some(ImageFormat::Tiff),
        // "image/pnm" => Some(ImageFormat::Pnm),
        // "image/tga" => Some(ImageFormat::Tga),
        // "image/dds" => Some(ImageFormat::Dds),
        // "image/bmp" => Some(ImageFormat::Bmp),
        // "image/ico" => Some(ImageFormat::Ico),
        // "image/x-exr" => Some(ImageFormat::OpenExr),
        _ => None,
    }
}

fn format_to_mimetype(format: ImageFormat) -> String {
    match format {
        ImageFormat::Png => "image/png",
        ImageFormat::Jpeg => "image/jpeg",
        ImageFormat::Gif => "image/gif",
        ImageFormat::WebP => "image/webp",
        ImageFormat::Tiff => "image/tiff",
        _ => "application/octet-stream",
    }
    .to_string()
}
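
// For instance, `format_to_mimetype(ImageFormat::Png)` yields "image/png", which
// `format_from_mimetype` maps back to `Some(ImageFormat::Png)`; formats outside the
// table fall back to "application/octet-stream" and `None` respectively.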

fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
    if input.starts_with("![](http://") || input.starts_with("![](https://") {
        let url = &input["![](".len()..input.len() - 1];
        let data = reqwest::blocking::get(url)?.bytes()?;

        let format = image::guess_format(&data)?;
        // TODO Remove this clone
        let img = ImageReader::with_format(Cursor::new(data.clone()), format).decode()?;
        let height: usize = img.height().try_into()?;
        let width: usize = img.width().try_into()?;
        let mimetype = format_to_mimetype(format);
        Ok((data.to_vec(), mimetype, height, width))
|
Adding Llava-Next (Llava 1.6) with full support. (#1709)
# What does this PR do?
- Changed all models to extract `embed_tokens` in order to enable llava
to separately call the embeddings and the core model layers.
- Added VlmCausalLM to inherit from FlashMistral in order to be
maximally supported. The only added logics sits on top and parses images
into pixel values, preallocates input_ids space for the image
embeddings, and passes them for the model.
- Added Clip for the vision tower.
- Didn't add flash for the vision tower since there's no padding anyway.
- Added heuristic (potentially incomplete) to calculate number of
features *before* calculating the clip patches (allows for easier logic
reuse of the LLM under the hood).
Still needs to be done:
- [x] Implement the image parsing in the controller side, to avoid
downloading n times per TP shard and also refusing requests too large
early and avoid issues where the truncation actually truncates the
image.
- [ ] Make sure it works with quantization properly.
- [x] Make sure it works with TP>1
<!--
Congratulations! You've made it this far! You're not quite done yet
though.
Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.
Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.
Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @
@OlivierDehaene OR @Narsil
-->
2024-04-09 19:32:00 +00:00
|
|
|
} else if input.starts_with(" {
|
|
|
|
// Remove 
|
|
|
|
let content = &input["..input.len() - 1];
|
|
|
|
let tokens: Vec<_> = content.split(';').collect();
|
|
|
|
if tokens.len() != 2 {
|
|
|
|
return Err(ValidationError::InvalidImageContent(content.to_string()));
|
|
|
|
}
|
|
|
|
let mimetype = tokens[0];
|
|
|
|
let content = tokens[1];
|
|
|
|
|
|
|
|
if !content.starts_with("base64,") {
|
|
|
|
return Err(ValidationError::InvalidImageContent(content.to_string()));
|
|
|
|
}
|
|
|
|
|
|
|
|
let data = STANDARD.decode(content["base64,".len()..].as_bytes())?;
|
|
|
|
let img = if let Some(format) = format_from_mimetype(mimetype) {
|
2024-06-03 07:27:22 +00:00
|
|
|
ImageReader::with_format(Cursor::new(&data), format).decode()?
|
Adding Llava-Next (Llava 1.6) with full support. (#1709)
# What does this PR do?
- Changed all models to extract `embed_tokens` in order to enable llava
to separately call the embeddings and the core model layers.
- Added VlmCausalLM to inherit from FlashMistral in order to be
maximally supported. The only added logics sits on top and parses images
into pixel values, preallocates input_ids space for the image
embeddings, and passes them for the model.
- Added Clip for the vision tower.
- Didn't add flash for the vision tower since there's no padding anyway.
- Added heuristic (potentially incomplete) to calculate number of
features *before* calculating the clip patches (allows for easier logic
reuse of the LLM under the hood).
Still needs to be done:
- [x] Implement the image parsing in the controller side, to avoid
downloading n times per TP shard and also refusing requests too large
early and avoid issues where the truncation actually truncates the
image.
- [ ] Make sure it works with quantization properly.
- [x] Make sure it works with TP>1
<!--
Congratulations! You've made it this far! You're not quite done yet
though.
Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.
Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.
Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @
@OlivierDehaene OR @Narsil
-->
2024-04-09 19:32:00 +00:00
|
|
|
} else {
|
2024-06-03 07:27:22 +00:00
|
|
|
ImageReader::new(Cursor::new(&data))
|
Adding Llava-Next (Llava 1.6) with full support. (#1709)
# What does this PR do?
- Changed all models to extract `embed_tokens` in order to enable llava
to separately call the embeddings and the core model layers.
- Added VlmCausalLM to inherit from FlashMistral in order to be
maximally supported. The only added logics sits on top and parses images
into pixel values, preallocates input_ids space for the image
embeddings, and passes them for the model.
- Added Clip for the vision tower.
- Didn't add flash for the vision tower since there's no padding anyway.
- Added heuristic (potentially incomplete) to calculate number of
features *before* calculating the clip patches (allows for easier logic
reuse of the LLM under the hood).
Still needs to be done:
- [x] Implement the image parsing in the controller side, to avoid
downloading n times per TP shard and also refusing requests too large
early and avoid issues where the truncation actually truncates the
image.
- [ ] Make sure it works with quantization properly.
- [x] Make sure it works with TP>1
<!--
Congratulations! You've made it this far! You're not quite done yet
though.
Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.
Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.
Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @
@OlivierDehaene OR @Narsil
-->
2024-04-09 19:32:00 +00:00
|
|
|
.with_guessed_format()
|
|
|
|
.map_err(|_io_error| ValidationError::InvalidImageContent(content.to_string()))?
|
|
|
|
.decode()?
|
|
|
|
};
|
|
|
|
|
|
|
|
let height: usize = img.height().try_into()?;
|
|
|
|
let width: usize = img.width().try_into()?;
|
2024-06-03 07:27:22 +00:00
|
|
|
Ok((data, mimetype.to_string(), height, width))
|
Adding Llava-Next (Llava 1.6) with full support. (#1709)
# What does this PR do?
- Changed all models to extract `embed_tokens` in order to enable llava
to separately call the embeddings and the core model layers.
- Added VlmCausalLM to inherit from FlashMistral in order to be
maximally supported. The only added logics sits on top and parses images
into pixel values, preallocates input_ids space for the image
embeddings, and passes them for the model.
- Added Clip for the vision tower.
- Didn't add flash for the vision tower since there's no padding anyway.
- Added heuristic (potentially incomplete) to calculate number of
features *before* calculating the clip patches (allows for easier logic
reuse of the LLM under the hood).
Still needs to be done:
- [x] Implement the image parsing in the controller side, to avoid
downloading n times per TP shard and also refusing requests too large
early and avoid issues where the truncation actually truncates the
image.
- [ ] Make sure it works with quantization properly.
- [x] Make sure it works with TP>1
<!--
Congratulations! You've made it this far! You're not quite done yet
though.
Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.
Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.
Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @
@OlivierDehaene OR @Narsil
-->
2024-04-09 19:32:00 +00:00
|
|
|
} else {
|
|
|
|
Err(ValidationError::InvalidImageContent(input.to_string()))
|
|
|
|
}
|
|
|
|
}
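
// A hedged round-trip sketch for the function above (this test module is an added
// illustration; it relies only on the `image` and `base64` crates that this file
// already imports): encode a 1x1 PNG in memory, wrap it in the markdown-style data
// URI that `fetch_image` expects, and check the decoded metadata.
#[cfg(test)]
mod fetch_image_sketch {
    use super::*;

    #[test]
    fn decodes_inline_base64_png() {
        let mut png_bytes = Vec::new();
        image::DynamicImage::new_rgb8(1, 1)
            .write_to(&mut Cursor::new(&mut png_bytes), ImageFormat::Png)
            .unwrap();
        let input = format!("![](data:image/png;base64,{})", STANDARD.encode(&png_bytes));

        let (data, mimetype, height, width) = fetch_image(&input).unwrap();
        assert_eq!(mimetype, "image/png");
        assert_eq!((height, width), (1, 1));
        assert_eq!(data, png_bytes);

        // Anything that is not a markdown image link is rejected.
        assert!(fetch_image("not an image").is_err());
    }
}
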
fn image_tokens(
    config: &Config,
    preprocessor_config: Option<&HubPreprocessorConfig>,
    height: usize,
    width: usize,
) -> String {
    use Config::*;
    use HubPreprocessorConfig::*;
    match config {
        Idefics => "<image>".to_string(),
        Mllama => "<|image|>".to_string(),
        Idefics2(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";

            let slots = config.get_number_of_features(height, width);

            let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
            image_string.push_str(FAKE);
            image_string.extend(iter::repeat(IMAGE).take(slots));
            image_string.push_str(FAKE);

            if matches!(
                preprocessor_config,
                Some(Idefics2Processor(Idefics2Preprocessor {
                    do_image_splitting: true,
                    ..
                }))
            ) {
                image_string = image_string.repeat(5);
            };

            image_string
        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Qwen2Vl(config) => format!(
            "<|vision_start|>{}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        _ => unimplemented!("Image tokens are not supported for this model configuration"),
    }
}
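
// Shape of the placeholder strings produced above, with hypothetical feature counts
// purely for illustration: LlavaNext and Paligemma repeat "<image>" once per feature
// slot, while Idefics2 with, say, `slots = 3` yields
//   "<fake_token_around_image><image><image><image><fake_token_around_image>"
// and that whole block is repeated five times (the four crops plus the full image)
// when the preprocessor enables `do_image_splitting`.
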
fn image_tokens_fixup(config: &Config, text: String) -> String {
    match config {
        Config::Idefics2(_) => {
            const FAKE: &str = "<fake_token_around_image>";
            text.replace(&format!("{FAKE}{FAKE}"), FAKE)
        }
        _ => text,
    }
}
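
// Effect of the fixup above, sketched on an Idefics2 prompt with two back-to-back
// images: concatenation produces
//   "...<image><fake_token_around_image><fake_token_around_image><image>..."
// and the duplicated separator is collapsed so consecutive images share a single
// "<fake_token_around_image>" token.
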
/// Get input length and optionally truncate it
fn prepare_input<T: TokenizerTrait>(
    inputs: String,
    _truncate: Option<usize>,
    add_special_tokens: bool,
    tokenizer: &T,
    config: Option<&Config>,
    preprocessor_config: Option<&HubPreprocessorConfig>,
) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
    use Config::*;
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    let (tokenizer_query, input_chunks) = match config {
        Some(
            config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)),
        ) => {
            let mut input_chunks = Vec::new();
            let mut tokenizer_query = String::with_capacity(inputs.len());
            let mut start = 0;
            for chunk in RE.find_iter(&inputs) {
                let chunk_start = chunk.start();
                let chunk_end = chunk.end();
                if chunk_start != start {
                    input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
                    tokenizer_query.push_str(&inputs[start..chunk_start]);
                }
                let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
                input_chunks.push(Chunk::Image(Image { data, mimetype }));
                tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
                start = chunk_end;
            }
            if start != inputs.len() {
                input_chunks.push(Chunk::Text(inputs[start..].to_string()));
                tokenizer_query.push_str(&inputs[start..]);
            }

            tokenizer_query = image_tokens_fixup(config, tokenizer_query);

            (tokenizer_query, input_chunks)
        }
        _ => (inputs.clone(), vec![Chunk::Text(inputs)]),
    };

    // Get the number of tokens in the input
    let encoding = tokenizer
        .encode_trait(tokenizer_query, add_special_tokens)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

    Ok((encoding, input_chunks))
}
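
// Worked example for the function above (the concrete input is illustrative): with a
// vision-language config, an input such as
//   "What is this?![](data:image/png;base64,...)"
// is split on the markdown-image regex into
//   [Chunk::Text("What is this?"), Chunk::Image(Image { .. })]
// while `tokenizer_query` becomes "What is this?" followed by the model-specific
// placeholders from `image_tokens`, and only that query string is tokenized.
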
type TokenizerRequest = (
    (String, bool, Option<usize>),
    oneshot::Sender<Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError>>,
    Span,
);
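
// The request tuple above carries (inputs, add_special_tokens, truncate), a oneshot
// sender over which the tokenization result is returned, and the caller's tracing
// Span so the background tokenizer task stays attached to the request's trace.
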
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Image {
    pub data: Vec<u8>,
    pub mimetype: String,
}

#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Chunk {
    Text(String),
    Image(Image),
}
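
// Construction sketch (values are illustrative): a prompt that interleaves text and
// one image is represented as
//   vec![
//       Chunk::Text("Describe: ".to_string()),
//       Chunk::Image(Image { data: png_bytes, mimetype: "image/png".to_string() }),
//   ]
// where `png_bytes` stands for the raw bytes previously returned by `fetch_image`.
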
/// Convert input chunks to a stringly-typed input for backwards
/// compat for backends that haven't implemented chunked inputs.
pub trait ChunksToString {
    /// Convert chunks to string.
    fn chunks_to_string(&self) -> String;
}

impl ChunksToString for Vec<Chunk> {
    fn chunks_to_string(&self) -> String {
        let mut output = String::new();
        self.iter().for_each(|c| match &c {
            Chunk::Text(text) => output.push_str(text),
            Chunk::Image(Image { data, mimetype }) => {
                let encoded = STANDARD.encode(data);
                output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
            }
        });
        output
    }
}
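
// A small added sketch of the backwards-compat rendering (the module name is ours):
// an image chunk is re-serialized into the same markdown/base64 form that
// `fetch_image` accepts on the way in.
#[cfg(test)]
mod chunks_to_string_sketch {
    use super::*;

    #[test]
    fn renders_text_then_image() {
        let data = vec![0u8, 1, 2];
        let chunks = vec![
            Chunk::Text("Describe: ".to_string()),
            Chunk::Image(Image {
                data: data.clone(),
                mimetype: "image/png".to_string(),
            }),
        ];
        let expected = format!(
            "Describe: ![](data:image/png;base64,{})",
            STANDARD.encode(&data)
        );
        assert_eq!(chunks.chunks_to_string(), expected);
    }
}
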
#[derive(Debug, Clone)]
pub enum ValidGrammar {
    Json(String),
    Regex(String),
}

#[derive(Debug, Clone)]
pub struct ValidParameters {
    /// exponential scaling output probability distribution
    pub temperature: f32,
    /// restricting to the k highest probability elements
    pub top_k: u32,
    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
    pub top_p: f32,
    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
    pub typical_p: f32,
    /// apply sampling on the logits
    pub do_sample: bool,
    /// random seed for sampling
    pub seed: u64,
    /// repetition penalty
    pub repetition_penalty: f32,
    /// frequency penalty
    pub frequency_penalty: f32,
    /// token watermarking using "A Watermark for Large Language Models"
    pub watermark: bool,
    /// grammar (applied if not empty)
    pub grammar: Option<ValidGrammar>,
}
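
// A hedged construction sketch; the field values below are illustrative and are not
// the defaults used by this crate:
//
//     let greedy = ValidParameters {
//         temperature: 1.0,
//         top_k: 0,
//         top_p: 1.0,
//         typical_p: 1.0,
//         do_sample: false,
//         seed: 42,
//         repetition_penalty: 1.0,
//         frequency_penalty: 0.0,
//         watermark: false,
//         grammar: None,
//     };
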
#[derive(Debug, Clone)]
pub struct ValidStoppingParameters {
    /// Maximum number of generated tokens
    pub max_new_tokens: u32,
    /// Optional stopping sequences
    pub stop_sequences: Vec<String>,
    /// Ignore end of sequence token
    /// used for benchmarking
    pub ignore_eos_token: bool,
}

#[derive(Debug, Clone)]
pub struct ValidGenerateRequest {
    pub inputs: Vec<Chunk>,
    pub input_ids: Option<Arc<Vec<u32>>>,
    pub input_length: u32,
    pub truncate: u32,
    pub add_special_tokens: bool,
    pub decoder_input_details: bool,
    pub parameters: ValidParameters,
    pub stopping_parameters: ValidStoppingParameters,
    pub top_n_tokens: u32,
    pub adapter_id: Option<String>,
}

#[derive(Error, Debug)]
pub enum ValidationError {
    #[error("`best_of` must be > 0 and <= {0}. Given: {1}")]
    BestOf(usize, usize),
    #[error("`best_of` != 1 is not allowed for this endpoint")]
    BestOfDisabled,
    #[error("you must use sampling when `best_of` is > 1")]
    BestOfSampling,
    #[error("`seed` must not be set when `best_of` > 1")]
    BestOfSeed,
    #[error("`best_of` != 1 is not supported when streaming tokens")]
    BestOfStream,
    #[error("`top_n_tokens` must be >= 0 and <= {0}. Given: {1}")]
    TopNTokens(u32, u32),
    #[error("`top_n_tokens` != 0 is not allowed for this endpoint")]
    TopNTokensDisabled,
    #[error("`decoder_input_details` == true is not supported when streaming tokens")]
    PrefillDetailsStream,
    #[error("`temperature` must be strictly positive")]
    Temperature,
    #[error("`repetition_penalty` must be strictly positive")]
    RepetitionPenalty,
    #[error("`frequency_penalty` must be >= -2.0 and <= 2.0")]
    FrequencyPenalty,
    #[error("`top_p` must be > 0.0 and < 1.0")]
    TopP,
    #[error("`top_k` must be strictly positive")]
    TopK,
    #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")]
    Truncate(usize, usize),
    #[error("`typical_p` must be > 0.0 and < 1.0")]
    TypicalP,
    #[error("one of `max_new_tokens` or `truncate` must be set if a fast tokenizer is not in use")]
    UnsetMaxNewTokens,
    #[error("`max_new_tokens` must be strictly positive")]
    NegativeMaxNewTokens,
    #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
    MaxNewTokens(usize, u32),
    #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")]
    MaxTotalTokens(usize, usize, u32),
    #[error("`inputs` must have less than {0} tokens. Given: {1}")]
    InputLength(usize, usize),
    #[error("`inputs` cannot be empty")]
    EmptyInput,
    #[error("`stop` supports up to {0} stop sequences. Given: {1}")]
    StopSequence(usize, usize),
    #[error("tokenizer error {0}")]
    Tokenizer(String),
    #[error("grammar is not supported")]
    Grammar,
    #[error("grammar is not valid: {0}")]
    InvalidGrammar(String),
    #[error("base64 encoding is invalid: {0}")]
    InvalidBase64(#[from] base64::DecodeError),
    #[error("invalid image: {0}")]
    InvalidImage(#[from] image::ImageError),
    #[error("invalid integer: {0}")]
    InvalidInt(#[from] core::num::TryFromIntError),
    #[error("invalid image content: {0}")]
    InvalidImageContent(String),
    #[error("Could not fetch image: {0}")]
    FailedFetchImage(#[from] reqwest::Error),
    #[error("{0} modality is not supported")]
    UnsupportedModality(&'static str),
}
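
// Display sketch for the thiserror-derived messages above (an added illustration;
// the module name is ours):
#[cfg(test)]
mod validation_error_display_sketch {
    use super::*;

    #[test]
    fn formats_bounds_into_messages() {
        assert_eq!(
            ValidationError::TopNTokens(4, 5).to_string(),
            "`top_n_tokens` must be >= 0 and <= 4. Given: 5"
        );
        assert_eq!(
            ValidationError::MaxNewTokens(6, 7).to_string(),
            "`max_new_tokens` must be <= 6. Given: 7"
        );
    }
}
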
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{Idefics2, PaliTextConfig, Paligemma};
    use crate::default_parameters;
    use crate::tests::get_tokenizer;

    #[tokio::test]
    async fn test_validation_max_new_tokens() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let workers = 1;
        let disable_grammar_support = true;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
|
2024-04-09 19:32:00 +00:00
|
|
|
config,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
2023-08-28 09:43:47 +00:00
|
|
|
max_top_n_tokens,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
2024-02-15 09:28:10 +00:00
|
|
|
disable_grammar_support,
|
2023-04-26 18:23:54 +00:00
|
|
|
);
|
2023-04-25 12:13:14 +00:00
|
|
|
|
|
|
|
let max_new_tokens = 10;
|
2023-04-26 18:23:54 +00:00
|
|
|
match validation
|
2024-08-29 14:29:01 +00:00
|
|
|
.validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
|
2023-04-26 18:23:54 +00:00
|
|
|
.await
|
|
|
|
{
|
2024-10-28 04:00:24 +00:00
|
|
|
Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
|
|
|
|
// Ok((_s, _, 0, 10)) => (),
|
2024-04-12 12:20:31 +00:00
|
|
|
r => panic!("Unexpected not max new tokens: {r:?}"),
|
2023-04-25 12:13:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
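// Input length validation: with `max_total_tokens = 6`, requesting 10 new tokens must fail with `MaxTotalTokens`.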
#[tokio::test]
|
2023-04-26 18:23:54 +00:00
|
|
|
async fn test_validation_input_length() {
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2023-04-25 12:13:14 +00:00
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequence = 3;
|
2023-08-28 09:43:47 +00:00
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
|
|
|
let max_total_tokens = 6;
|
2024-02-15 09:28:10 +00:00
|
|
|
let disable_grammar_support = true;
|
2023-04-25 12:13:14 +00:00
|
|
|
let workers = 1;
|
2024-04-09 19:32:00 +00:00
|
|
|
let config = None;
|
2023-04-26 18:23:54 +00:00
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
2024-04-09 19:32:00 +00:00
|
|
|
config,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
2023-08-28 09:43:47 +00:00
|
|
|
max_top_n_tokens,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
2024-02-15 09:28:10 +00:00
|
|
|
disable_grammar_support,
|
2023-04-26 18:23:54 +00:00
|
|
|
);
|
2023-04-25 12:13:14 +00:00
|
|
|
|
|
|
|
let max_new_tokens = 10;
|
2023-04-26 18:23:54 +00:00
|
|
|
match validation
|
2024-08-29 14:29:01 +00:00
|
|
|
.validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
|
2023-04-26 18:23:54 +00:00
|
|
|
.await
|
|
|
|
{
|
2023-08-28 09:43:47 +00:00
|
|
|
Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
|
2023-04-26 18:23:54 +00:00
|
|
|
_ => panic!("Unexpected not max new tokens"),
|
2023-04-25 12:13:14 +00:00
|
|
|
}
|
|
|
|
}
|
2023-04-26 14:14:40 +00:00
|
|
|
|
|
|
|
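// `best_of: Some(2)` combined with `do_sample: false` must be rejected with `BestOfSampling`.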
#[tokio::test]
|
2023-04-26 18:23:54 +00:00
|
|
|
async fn test_validation_best_of_sampling() {
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2023-04-26 14:14:40 +00:00
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequence = 3;
|
2023-08-28 09:43:47 +00:00
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
|
|
|
let max_total_tokens = 6;
|
2023-04-26 14:14:40 +00:00
|
|
|
let workers = 1;
|
2024-02-15 09:28:10 +00:00
|
|
|
let disable_grammar_support = true;
|
2024-04-09 19:32:00 +00:00
|
|
|
let config = None;
|
2023-04-26 18:23:54 +00:00
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
2024-04-09 19:32:00 +00:00
|
|
|
config,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
2023-08-28 09:43:47 +00:00
|
|
|
max_top_n_tokens,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
2024-02-15 09:28:10 +00:00
|
|
|
disable_grammar_support,
|
2023-04-26 18:23:54 +00:00
|
|
|
);
|
|
|
|
match validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-04-26 18:23:54 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
best_of: Some(2),
|
|
|
|
do_sample: false,
|
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
{
|
2023-04-26 14:14:40 +00:00
|
|
|
Err(ValidationError::BestOfSampling) => (),
|
2023-04-26 18:23:54 +00:00
|
|
|
_ => panic!("Unexpected not best of sampling"),
|
2023-04-26 14:14:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
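// `top_p` must be strictly less than 1.0 when set; leaving it as `None` resolves to the default value of 1.0.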
#[tokio::test]
|
2023-04-26 18:23:54 +00:00
|
|
|
async fn test_validation_top_p() {
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2023-04-26 14:14:40 +00:00
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequence = 3;
|
2023-08-28 09:43:47 +00:00
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
2023-12-14 14:59:38 +00:00
|
|
|
let max_total_tokens = 106;
|
2023-04-26 14:14:40 +00:00
|
|
|
let workers = 1;
|
2024-02-15 09:28:10 +00:00
|
|
|
let disable_grammar_support = true;
|
2024-04-09 19:32:00 +00:00
|
|
|
let config = None;
|
2023-04-26 18:23:54 +00:00
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
2024-04-09 19:32:00 +00:00
|
|
|
config,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
2023-08-28 09:43:47 +00:00
|
|
|
max_top_n_tokens,
|
2023-04-26 18:23:54 +00:00
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
2024-02-15 09:28:10 +00:00
|
|
|
disable_grammar_support,
|
2023-04-26 18:23:54 +00:00
|
|
|
);
|
|
|
|
match validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-04-26 18:23:54 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_p: Some(1.0),
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-04-26 18:23:54 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
{
|
2023-04-26 14:14:40 +00:00
|
|
|
Err(ValidationError::TopP) => (),
|
2023-04-26 18:23:54 +00:00
|
|
|
_ => panic!("Unexpected top_p"),
|
2023-04-26 14:14:40 +00:00
|
|
|
}
|
|
|
|
|
2023-04-26 18:23:54 +00:00
|
|
|
match validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-04-26 18:23:54 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_p: Some(0.99),
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-04-26 18:23:54 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
{
|
2023-04-26 14:14:40 +00:00
|
|
|
Ok(_) => (),
|
2023-04-26 18:23:54 +00:00
|
|
|
_ => panic!("Unexpected top_p error"),
|
2023-04-26 14:14:40 +00:00
|
|
|
}
|
|
|
|
|
2023-04-26 18:23:54 +00:00
|
|
|
let valid_request = validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-04-26 18:23:54 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_p: None,
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-04-26 18:23:54 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
.unwrap();
|
2023-04-26 14:14:40 +00:00
|
|
|
// top_p == 1.0 is invalid for users to ask for but it's the default resolved value.
|
|
|
|
assert_eq!(valid_request.parameters.top_p, 1.0);
|
|
|
|
}
|
2023-08-28 09:43:47 +00:00
|
|
|
|
|
|
|
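// `top_n_tokens` may not exceed `max_top_n_tokens` (4); `None` resolves to 0.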
#[tokio::test]
|
|
|
|
async fn test_validation_top_n_tokens() {
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2023-08-28 09:43:47 +00:00
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequences = 3;
|
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
2023-12-14 14:59:38 +00:00
|
|
|
let max_total_tokens = 106;
|
2023-08-28 09:43:47 +00:00
|
|
|
let workers = 1;
|
2024-02-15 09:28:10 +00:00
|
|
|
let disable_grammar_support = true;
|
2024-04-09 19:32:00 +00:00
|
|
|
let config = None;
|
2023-08-28 09:43:47 +00:00
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
2024-04-09 19:32:00 +00:00
|
|
|
config,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2023-08-28 09:43:47 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequences,
|
|
|
|
max_top_n_tokens,
|
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
2024-02-15 09:28:10 +00:00
|
|
|
disable_grammar_support,
|
2023-08-28 09:43:47 +00:00
|
|
|
);
|
|
|
|
match validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-08-28 09:43:47 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_n_tokens: Some(5),
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-08-28 09:43:47 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
{
|
|
|
|
Err(ValidationError::TopNTokens(4, 5)) => (),
|
|
|
|
_ => panic!("Unexpected top_n_tokens"),
|
|
|
|
}
|
|
|
|
|
|
|
|
validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-08-28 09:43:47 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_n_tokens: Some(4),
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-08-28 09:43:47 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-08-28 09:43:47 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_n_tokens: Some(0),
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-08-28 09:43:47 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
let valid_request = validation
|
|
|
|
.validate(GenerateRequest {
|
|
|
|
inputs: "Hello".to_string(),
|
2024-08-29 14:29:01 +00:00
|
|
|
add_special_tokens: true,
|
2023-08-28 09:43:47 +00:00
|
|
|
parameters: GenerateParameters {
|
|
|
|
top_n_tokens: None,
|
2023-12-18 09:20:08 +00:00
|
|
|
max_new_tokens: Some(5),
|
2023-08-28 09:43:47 +00:00
|
|
|
..default_parameters()
|
|
|
|
},
|
|
|
|
})
|
|
|
|
.await
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
assert_eq!(valid_request.top_n_tokens, 0);
|
|
|
|
}
|
2024-06-03 07:27:22 +00:00
|
|
|
|
|
|
|
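// Base64-encoded 1x1 GIF used as a minimal test image.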
static PIXEL_GIF: &str = "R0lGODdhAQABAIEAAP///wAAAAAAAAAAACwAAAAAAQABAAAIBAABBAQAOw==";
|
|
|
|
|
|
|
|
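// An embedded base64 image should be parsed into a text chunk followed by an image chunk.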
#[tokio::test]
|
|
|
|
async fn test_prepare_input_chunks() {
|
|
|
|
let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
|
|
|
|
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2024-06-03 07:27:22 +00:00
|
|
|
|
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequence = 3;
|
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
|
|
|
let max_total_tokens = 6;
|
|
|
|
let disable_grammar_support = true;
|
|
|
|
let workers = 1;
|
|
|
|
let config = Config::Paligemma(Paligemma {
|
|
|
|
text_config: PaliTextConfig {
|
|
|
|
num_image_tokens: 1,
|
|
|
|
},
|
|
|
|
});
|
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
|
|
|
Some(config),
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
2024-06-03 07:27:22 +00:00
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
|
|
|
max_top_n_tokens,
|
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
|
|
|
disable_grammar_support,
|
|
|
|
);
|
|
|
|
|
|
|
|
let chunks = match validation
|
|
|
|
.tokenize(
|
|
|
|
format!("test", PIXEL_GIF),
|
2024-08-29 14:29:01 +00:00
|
|
|
true,
|
2024-06-03 07:27:22 +00:00
|
|
|
None,
|
|
|
|
)
|
|
|
|
.await
|
|
|
|
{
|
2024-10-28 04:00:24 +00:00
|
|
|
Ok((_encoding, chunks)) => chunks,
|
2024-06-03 07:27:22 +00:00
|
|
|
_ => panic!("Unexpected tokenization failure"),
|
|
|
|
};
|
|
|
|
|
|
|
|
assert!(
|
|
|
|
chunks
|
|
|
|
== vec![
|
|
|
|
Chunk::Text("test".to_string()).into(),
|
|
|
|
Chunk::Image(Image {
|
|
|
|
data: pixel_data.clone(),
|
|
|
|
mimetype: "image/gif".to_string()
|
|
|
|
})
|
|
|
|
.into()
|
|
|
|
],
|
|
|
|
"Failed to process images",
|
|
|
|
);
|
|
|
|
}
|
2024-06-27 13:54:35 +00:00
|
|
|
|
|
|
|
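// With Idefics2 image splitting enabled, two embedded images should yield two image chunks
// and 11 "fake" marker tokens in the resulting encoding.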
#[tokio::test]
|
|
|
|
async fn test_idefics2_correct_n_fake_tokens() {
|
|
|
|
let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
|
|
|
|
|
2024-10-28 04:00:24 +00:00
|
|
|
let tokenizer = get_tokenizer();
|
2024-06-27 13:54:35 +00:00
|
|
|
|
|
|
|
let max_best_of = 2;
|
|
|
|
let max_stop_sequence = 3;
|
|
|
|
let max_top_n_tokens = 4;
|
|
|
|
let max_input_length = 5;
|
|
|
|
let max_total_tokens = 6;
|
|
|
|
let disable_grammar_support = true;
|
|
|
|
let workers = 1;
|
|
|
|
let config = Config::Idefics2(Idefics2 {});
|
|
|
|
let validation = Validation::new(
|
|
|
|
workers,
|
|
|
|
tokenizer,
|
|
|
|
Some(config),
|
|
|
|
Some(HubPreprocessorConfig::Idefics2Processor(
|
|
|
|
Idefics2Preprocessor {
|
|
|
|
do_image_splitting: true,
|
|
|
|
},
|
|
|
|
)),
|
|
|
|
max_best_of,
|
|
|
|
max_stop_sequence,
|
|
|
|
max_top_n_tokens,
|
|
|
|
max_input_length,
|
|
|
|
max_total_tokens,
|
|
|
|
disable_grammar_support,
|
|
|
|
);
|
|
|
|
|
|
|
|
let (encoding, chunks) = match validation
|
|
|
|
.tokenize(
|
|
|
|
format!(
|
|
|
|
"test",
|
|
|
|
PIXEL_GIF, PIXEL_GIF
|
|
|
|
),
|
2024-08-29 14:29:01 +00:00
|
|
|
true,
|
2024-06-27 13:54:35 +00:00
|
|
|
None,
|
|
|
|
)
|
|
|
|
.await
|
|
|
|
{
|
2024-10-28 04:00:24 +00:00
|
|
|
Ok((encoding, chunks)) => (encoding, chunks),
|
2024-06-27 13:54:35 +00:00
|
|
|
_ => panic!("Unexpected tokenization failure"),
|
|
|
|
};
|
|
|
|
|
|
|
|
assert!(
|
|
|
|
chunks
|
|
|
|
== vec![
|
|
|
|
Chunk::Text("test".to_string()).into(),
|
|
|
|
Chunk::Image(Image {
|
|
|
|
data: pixel_data.clone(),
|
|
|
|
mimetype: "image/gif".to_string()
|
|
|
|
})
|
|
|
|
.into(),
|
|
|
|
Chunk::Image(Image {
|
|
|
|
data: pixel_data.clone(),
|
|
|
|
mimetype: "image/gif".to_string()
|
|
|
|
})
|
|
|
|
.into()
|
|
|
|
],
|
|
|
|
"Failed to process images",
|
|
|
|
);
|
|
|
|
|
|
|
|
// Verify the number of fake tokens:
|
|
|
|
//
|
|
|
|
// - Two images surrounded/separated by a fake token = 3.
|
|
|
|
// - Both are split in 5 subimages, separated by a fake token: 2 * 4
|
|
|
|
//
|
|
|
|
// Fake tokens get split up by the testing tokenizer, but we don't care.
|
|
|
|
assert_eq!(
|
|
|
|
encoding
|
|
|
|
.get_tokens()
|
|
|
|
.iter()
|
|
|
|
.filter(|t| *t == "fake")
|
|
|
|
.count(),
|
|
|
|
11
|
|
|
|
);
|
|
|
|
}
|
2023-04-25 12:13:14 +00:00
|
|
|
}
|