Return num input tokens (#3)

Return the number of input tokens in the details message.

Co-authored-by: Yessen Kanapin <yessen@deepinfra.com>
Nikola Borisov 2023-08-30 15:20:47 -07:00 committed by GitHub
parent 9826cd1dad
commit 57e57e6fee
6 changed files with 35 additions and 8 deletions
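
For context, a rough sketch of how a client could read the new field once a server with this change is deployed. The host, port, and prompt below are placeholders and not part of this commit; the only thing this change adds is the input_tokens entry in the details object.

# Sketch only: assumes a text-generation-inference server at localhost:8080 (placeholder).
import requests

resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "What is deep learning?",
        "parameters": {"max_new_tokens": 20, "details": True},
    },
    timeout=60,
)
resp.raise_for_status()
details = resp.json()["details"]

# With this commit, details carries the prompt length alongside the generation stats.
print(details["generated_tokens"], details.get("input_tokens"))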


@@ -182,12 +182,17 @@ COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/co
 RUN pip install einops --no-cache-dir

 # Install server
+COPY server/requirements.txt server/requirements.txt
+COPY server/pyproject.toml server/pyproject.toml
+COPY server/poetry.lock server/poetry.lock
+RUN cd server && \
+    pip install -r requirements.txt
+
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements.txt && \
     pip install ".[bnb, accelerate, quantize]" --no-cache-dir

 # Install benchmarker


@@ -211,6 +211,8 @@ class StreamDetails(BaseModel):
     finish_reason: FinishReason
     # Number of generated tokens
     generated_tokens: int
+    # Number of input tokens
+    input_tokens: int
     # Sampling seed if sampling was activated
     seed: Optional[int]


@@ -147,6 +147,7 @@ impl Infer {
         let mut result_generated_text = None;
         let mut result_start = None;
         let mut result_queued = None;
+        let mut number_input_tokens = 0;

         // Iterate on stream
         while let Some(response) = stream.next().await {
@@ -155,6 +156,7 @@ impl Infer {
                 InferStreamResponse::Prefill(tokens) => {
                     // Create Token objects
                     // We do that here instead of in the Python code as Rust for loops are faster
+                    number_input_tokens = tokens.ids.len() as u32;
                     result_prefill = tokens
                         .ids
                         .into_iter()
@@ -188,6 +190,7 @@ impl Infer {
         Ok(InferResponse {
             prefill: result_prefill,
             tokens: result_tokens,
+            input_tokens: number_input_tokens,
             generated_text,
             queued,
             start,
@@ -581,6 +584,7 @@ pub(crate) struct InferResponse {
     pub(crate) prefill: Vec<PrefillToken>,
     pub(crate) tokens: Vec<Token>,
     pub(crate) generated_text: GeneratedText,
+    pub(crate) input_tokens: u32,
     pub(crate) queued: Instant,
     pub(crate) start: Instant,
 }


@@ -231,6 +231,8 @@ pub(crate) struct BestOfSequence {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
     pub prefill: Vec<PrefillToken>,
@@ -243,6 +245,8 @@ pub(crate) struct Details {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
     pub prefill: Vec<PrefillToken>,
@@ -265,6 +269,8 @@ pub(crate) struct StreamDetails {
     pub finish_reason: FinishReason,
     #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 100)]
+    pub input_tokens: u32,
     #[schema(nullable = true, example = 42)]
     pub seed: Option<u64>,
 }


@@ -191,6 +191,7 @@ async fn generate(
                         response.generated_text.finish_reason,
                     ),
                     generated_tokens: response.generated_text.generated_tokens,
+                    input_tokens: response.input_tokens,
                     prefill: response.prefill,
                     tokens: response.tokens,
                     seed: response.generated_text.seed,
@@ -202,6 +203,7 @@ async fn generate(
             Some(Details {
                 finish_reason: FinishReason::from(response.generated_text.finish_reason),
                 generated_tokens: response.generated_text.generated_tokens,
+                input_tokens: response.input_tokens,
                 prefill: response.prefill,
                 tokens: response.tokens,
                 seed: response.generated_text.seed,
@@ -380,12 +382,15 @@ async fn generate_stream(
         // Keep permit as long as generate_stream lives
         Ok((_permit, mut response_stream)) => {
             // Server-Sent Event stream
+            let mut number_input_tokens = 0;
             while let Some(response) = response_stream.next().await {
                 match response {
                     Ok(response) => {
                         match response {
-                            // Prefill is ignored
-                            InferStreamResponse::Prefill(_) => {}
+                            // Prefill is only used for initial num input tokens
+                            InferStreamResponse::Prefill(prefill_tokens) => {
+                                number_input_tokens = prefill_tokens.ids.len() as u32;
+                            }
                             // Yield event for every new token
                             InferStreamResponse::Token(token) => {
                                 tracing::debug!(parent: &span, "Token: {:?}", token);
@@ -411,6 +416,7 @@ async fn generate_stream(
                         true => Some(StreamDetails {
                             finish_reason: FinishReason::from(generated_text.finish_reason),
                             generated_tokens: generated_text.generated_tokens,
+                            input_tokens: number_input_tokens,
                             seed: generated_text.seed,
                         }),
                         false => None,
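
On the streaming side, a similarly hedged sketch of a client that walks the Server-Sent Events from /generate_stream and reads input_tokens from the final event's details (same placeholder host and port as above; error handling omitted):

# Sketch only: assumes a text-generation-inference server at localhost:8080 (placeholder).
import json
import requests

with requests.post(
    "http://localhost:8080/generate_stream",
    json={
        "inputs": "What is deep learning?",
        "parameters": {"max_new_tokens": 20, "details": True},
    },
    stream=True,
    timeout=60,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        text = line.decode("utf-8") if line else ""
        if not text.startswith("data:"):
            continue
        event = json.loads(text[len("data:"):].strip())
        # Only the final event carries details (StreamDetails), which now includes input_tokens.
        if event.get("details") is not None:
            print(event["details"].get("input_tokens"))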


@@ -985,14 +985,18 @@ class FlashCausalLM(Model):
             generated_text = None

             # Prefill
-            if prefill and request.prefill_logprobs:
+            if prefill:
                 out_start_index = batch.prefill_cu_outlens[i]
                 out_end_index = batch.prefill_cu_outlens[i + 1]

-                # Remove generated token to only have prefill and add nan for first prompt token
-                request_prefill_logprobs = [float("nan")] + prefill_logprobs[
-                    out_start_index : out_end_index - 1
-                ]
+                if request.prefill_logprobs:
+                    # Remove generated token to only have prefill and add nan for first prompt token
+                    request_prefill_logprobs = [float("nan")] + prefill_logprobs[
+                        out_start_index : out_end_index - 1
+                    ]
+                else:
+                    request_prefill_logprobs = []
                 prefill_token_ids = all_input_ids[:-1]
                 prefill_texts = self.tokenizer.batch_decode(
                     prefill_token_ids,