Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-11 04:14:52 +00:00
Return num input tokens (#3)

Return the number of input tokens in the details message.

Co-authored-by: Yessen Kanapin <yessen@deepinfra.com>

This commit is contained in:
parent 9826cd1dad
commit 57e57e6fee
@@ -182,12 +182,17 @@ COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/co
 RUN pip install einops --no-cache-dir
 
 # Install server
+COPY server/requirements.txt server/requirements.txt
+COPY server/pyproject.toml server/pyproject.toml
+COPY server/poetry.lock server/poetry.lock
+RUN cd server && \
+    pip install -r requirements.txt
+
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements.txt && \
     pip install ".[bnb, accelerate, quantize]" --no-cache-dir
 
 # Install benchmarker
@@ -211,6 +211,8 @@ class StreamDetails(BaseModel):
     finish_reason: FinishReason
     # Number of generated tokens
     generated_tokens: int
+    # Number of input tokens
+    input_tokens: int
     # Sampling seed if sampling was activated
     seed: Optional[int]
 
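For reference, not part of the commit: once a server and the Python client built from this branch are installed, the new field shows up on the final streaming message. A minimal sketch, assuming the text_generation client's generate_stream API; the base URL and prompt are placeholders.

from text_generation import Client

client = Client("http://127.0.0.1:8080")

text = ""
for response in client.generate_stream("What is Deep Learning?", max_new_tokens=20):
    if not response.token.special:
        text += response.token.text
    if response.details is not None:
        # Only the final message carries StreamDetails; with this patch it also
        # reports how many tokens the prompt was split into.
        print("input_tokens:", response.details.input_tokens)
        print("generated_tokens:", response.details.generated_tokens)

print(text)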
@@ -147,6 +147,7 @@ impl Infer {
         let mut result_generated_text = None;
         let mut result_start = None;
         let mut result_queued = None;
+        let mut number_input_tokens = 0;
 
         // Iterate on stream
         while let Some(response) = stream.next().await {
@@ -155,6 +156,7 @@ impl Infer {
                 InferStreamResponse::Prefill(tokens) => {
                     // Create Token objects
                     // We do that here instead of in the Python code as Rust for loops are faster
+                    number_input_tokens = tokens.ids.len() as u32;
                     result_prefill = tokens
                         .ids
                         .into_iter()
@@ -188,6 +190,7 @@ impl Infer {
             Ok(InferResponse {
                 prefill: result_prefill,
                 tokens: result_tokens,
+                input_tokens: number_input_tokens,
                 generated_text,
                 queued,
                 start,
@@ -581,6 +584,7 @@ pub(crate) struct InferResponse {
     pub(crate) prefill: Vec<PrefillToken>,
     pub(crate) tokens: Vec<Token>,
     pub(crate) generated_text: GeneratedText,
+    pub(crate) input_tokens: u32,
     pub(crate) queued: Instant,
     pub(crate) start: Instant,
 }
@@ -231,6 +231,8 @@ pub(crate) struct BestOfSequence {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
     pub prefill: Vec<PrefillToken>,
@@ -243,6 +245,8 @@ pub(crate) struct Details {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
     pub prefill: Vec<PrefillToken>,
@@ -265,6 +269,8 @@ pub(crate) struct StreamDetails {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
 }
@@ -191,6 +191,7 @@ async fn generate(
                                 response.generated_text.finish_reason,
                             ),
                             generated_tokens: response.generated_text.generated_tokens,
+                            input_tokens: response.input_tokens,
                             prefill: response.prefill,
                             tokens: response.tokens,
                             seed: response.generated_text.seed,
@@ -202,6 +203,7 @@ async fn generate(
             Some(Details {
                 finish_reason: FinishReason::from(response.generated_text.finish_reason),
                 generated_tokens: response.generated_text.generated_tokens,
+                input_tokens: response.input_tokens,
                 prefill: response.prefill,
                 tokens: response.tokens,
                 seed: response.generated_text.seed,
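For context, not part of the commit: with the schema additions and the generate handler changes above, a /generate request that asks for details should now return input_tokens next to generated_tokens. A minimal sketch using Python's requests library; the server URL and prompt are placeholders.

import requests

resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 20, "details": True},
    },
)
resp.raise_for_status()
details = resp.json()["details"]
# With this patch, details carries input_tokens alongside generated_tokens,
# finish_reason, seed, prefill and tokens.
print(details["input_tokens"], details["generated_tokens"])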
@@ -380,12 +382,15 @@ async fn generate_stream(
         // Keep permit as long as generate_stream lives
         Ok((_permit, mut response_stream)) => {
             // Server-Sent Event stream
+            let mut number_input_tokens = 0;
             while let Some(response) = response_stream.next().await {
                 match response {
                     Ok(response) => {
                         match response {
-                            // Prefill is ignored
-                            InferStreamResponse::Prefill(_) => {}
+                            // Prefill is only used for initial num input tokens
+                            InferStreamResponse::Prefill(prefill_tokens) => {
+                                number_input_tokens = prefill_tokens.ids.len() as u32;
+                            }
                             // Yield event for every new token
                             InferStreamResponse::Token(token) => {
                                 tracing::debug!(parent: &span, "Token: {:?}", token);
@@ -411,6 +416,7 @@ async fn generate_stream(
                                     true => Some(StreamDetails {
                                         finish_reason: FinishReason::from(generated_text.finish_reason),
                                         generated_tokens: generated_text.generated_tokens,
+                                        input_tokens: number_input_tokens,
                                         seed: generated_text.seed,
                                     }),
                                     false => None,
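Likewise for streaming, not part of the commit: only the final server-sent event carries StreamDetails, and with this patch it reports the prefill token count. A minimal sketch that consumes /generate_stream directly; the URL and prompt are placeholders, and the event framing is assumed to be the usual data: lines.

import json
import requests

with requests.post(
    "http://127.0.0.1:8080/generate_stream",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 20, "details": True},
    },
    stream=True,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data:"):
            continue
        payload = json.loads(line[len(b"data:"):])
        if payload.get("details") is not None:
            # Only the last event carries StreamDetails; it now includes the
            # number of input tokens counted from the prefill.
            print(payload["details"]["input_tokens"])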
@@ -985,14 +985,18 @@ class FlashCausalLM(Model):
             generated_text = None
 
             # Prefill
-            if prefill and request.prefill_logprobs:
+            if prefill:
                 out_start_index = batch.prefill_cu_outlens[i]
                 out_end_index = batch.prefill_cu_outlens[i + 1]
 
-                # Remove generated token to only have prefill and add nan for first prompt token
-                request_prefill_logprobs = [float("nan")] + prefill_logprobs[
-                    out_start_index : out_end_index - 1
-                ]
+                if request.prefill_logprobs:
+                    # Remove generated token to only have prefill and add nan for first prompt token
+                    request_prefill_logprobs = [float("nan")] + prefill_logprobs[
+                        out_start_index : out_end_index - 1
+                    ]
+                else:
+                    request_prefill_logprobs = []
+
                 prefill_token_ids = all_input_ids[:-1]
                 prefill_texts = self.tokenizer.batch_decode(
                     prefill_token_ids,
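Note on the change above: the guard is relaxed from "if prefill and request.prefill_logprobs:" to "if prefill:", so the prefill token ids and texts appear to be built and returned for every prefill step even when prompt logprobs were not requested; only the logprob list is left empty in that case. That is presumably what lets the router-side code above count input tokens from the Prefill message. A minimal sketch of the new branch in isolation, with a hypothetical helper name and parameters standing in for the batch bookkeeping:

from typing import List


def prefill_logprobs_for_request(
    want_logprobs: bool,
    prefill_logprobs: List[float],
    out_start_index: int,
    out_end_index: int,
) -> List[float]:
    if want_logprobs:
        # nan for the first prompt token, then the scored prompt tokens,
        # dropping the first generated token
        return [float("nan")] + prefill_logprobs[out_start_index : out_end_index - 1]
    # No logprobs requested: still return a (empty) list so prefill tokens
    # can be emitted and counted downstream.
    return []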