diff --git a/Cargo.lock b/Cargo.lock
index 33d75f0eb..8f09c3a56 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2175,6 +2175,16 @@ dependencies = [
  "tracing-core",
 ]
 
+[[package]]
+name = "tracing-serde"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
 [[package]]
 name = "tracing-subscriber"
 version = "0.3.16"
@@ -2182,11 +2192,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70"
 dependencies = [
  "nu-ansi-term",
+ "serde",
+ "serde_json",
  "sharded-slab",
  "smallvec",
  "thread_local",
  "tracing-core",
  "tracing-log",
+ "tracing-serde",
 ]
 
 [[package]]
diff --git a/Dockerfile b/Dockerfile
index a2bf199a8..ebe79609a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -73,4 +73,4 @@ COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/loca
 # Install launcher
 COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher
 
-CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS
\ No newline at end of file
+CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS --json-output
\ No newline at end of file
diff --git a/README.md b/README.md
index 2d2d49d37..0c4f6f713 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# LLM Text Generation Inference
+# Text Generation Inference
@@ -6,12 +6,12 @@
-A Rust and gRPC server for large language models text generation inference.
+A Rust and gRPC server for text generation inference.
 
 ## Features
 
-- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Dynamic bathing of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput
+- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
 - 45ms per token generation for BLOOM with 8xA100 80GB
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index ae3558b1c..1779c0519 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -10,4 +10,4 @@ clap = { version = "4.0.15", features = ["derive", "env"] }
 ctrlc = "3.2.3"
 subprocess = "0.2.9"
 tracing = "0.1.37"
-tracing-subscriber = "0.3.16"
+tracing-subscriber = { version = "0.3.16", features = ["json"] }
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index f94dd589c..ed6cf8e59 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -39,11 +39,11 @@ struct Args {
     master_addr: String,
     #[clap(default_value = "29500", long, env)]
     master_port: usize,
+    #[clap(long, env)]
+    json_output: bool,
 }
 
 fn main() -> ExitCode {
-    tracing_subscriber::fmt().compact().with_ansi(false).init();
-
     // Pattern match configuration
     let Args {
         model_name,
@@ -57,8 +57,15 @@ fn main() -> ExitCode {
         shard_uds_path,
         master_addr,
         master_port,
+        json_output,
     } = Args::parse();
 
+    if json_output {
+        tracing_subscriber::fmt().json().init();
+    } else {
+        tracing_subscriber::fmt().compact().init();
+    }
+
     // By default we only have one master shard
     let num_shard = num_shard.unwrap_or(1);
 
@@ -139,24 +146,30 @@ fn main() -> ExitCode {
     // All shard started
     // Start webserver
     tracing::info!("Starting Webserver");
+    let mut argv = vec![
+        "text-generation-router".to_string(),
+        "--max-concurrent-requests".to_string(),
+        max_concurrent_requests.to_string(),
+        "--max-input-length".to_string(),
+        max_input_length.to_string(),
+        "--max-batch-size".to_string(),
+        max_batch_size.to_string(),
+        "--max-waiting-tokens".to_string(),
+        max_waiting_tokens.to_string(),
+        "--port".to_string(),
+        port.to_string(),
+        "--master-shard-uds-path".to_string(),
+        format!("{}-0", shard_uds_path),
+        "--tokenizer-name".to_string(),
+        model_name,
+    ];
+
+    if json_output {
+        argv.push("--json-output".to_string());
+    }
+
     let mut webserver = match Popen::create(
-        &[
-            "text-generation-router",
-            "--max-concurrent-requests",
-            &max_concurrent_requests.to_string(),
-            "--max-input-length",
-            &max_input_length.to_string(),
-            "--max-batch-size",
-            &max_batch_size.to_string(),
-            "--max-waiting-tokens",
-            &max_waiting_tokens.to_string(),
-            "--port",
-            &port.to_string(),
-            "--master-shard-uds-path",
-            &format!("{}-0", shard_uds_path),
-            "--tokenizer-name",
-            &model_name,
-        ],
+        &argv,
         PopenConfig {
             stdout: Redirection::Pipe,
             stderr: Redirection::Pipe,
diff --git a/router/Cargo.toml b/router/Cargo.toml
index da9518bfd..f99069d37 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -24,5 +24,5 @@ thiserror = "1.0.37"
 tokenizers = "0.13.0"
 tokio = { version = "1.21.1", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tracing = "0.1.36"
-tracing-subscriber = "0.3.15"
+tracing-subscriber = { version = "0.3.15", features = ["json"] }
 
diff --git a/router/src/main.rs b/router/src/main.rs
index b24ec4c98..6497f23ee 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -25,6 +25,8 @@ struct Args {
     tokenizer_name: String,
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
+    #[clap(long, env)]
+    json_output: bool,
 }
 
 fn main() -> Result<(), std::io::Error> {
@@ -40,11 +42,16 @@ fn main() -> Result<(), std::io::Error> {
         master_shard_uds_path,
         tokenizer_name,
         validation_workers,
+        json_output,
     } = args;
 
-    tracing_subscriber::fmt().compact().with_ansi(false).init();
+    if json_output {
+        tracing_subscriber::fmt().json().init();
+    } else {
+        tracing_subscriber::fmt().compact().init();
+    }
 
-    if validation_workers == 1 {
+    if validation_workers == 0 {
         panic!("validation_workers must be > 0");
     }
diff --git a/server/poetry.lock b/server/poetry.lock
index 5e635a673..3c92903ec 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -88,14 +88,6 @@ grpcio = ">=1.50.0"
 protobuf = ">=4.21.6,<5.0dev"
 setuptools = "*"
 
-[[package]]
-name = "joblib"
-version = "1.2.0"
-description = "Lightweight pipelining with Python functions"
-category = "main"
-optional = false
-python-versions = ">=3.7"
-
 [[package]]
 name = "numpy"
 version = "1.23.4"
@@ -210,10 +202,13 @@ category = "main"
 optional = false
 python-versions = ">=3.7"
 
+[extras]
+bnb = ["bitsandbytes"]
+
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "50d9d44577a0222f125c770732d5f88807378573bd7386036eb5c79fc2a7c552"
+content-hash = "224b1e379d6105fe911bff4563946a90dfa6ff5918cf2e7be59f8d4f7c5cd7cf"
 
 [metadata.files]
 accelerate = [
@@ -330,10 +325,6 @@ grpcio-tools = [
     {file = "grpcio_tools-1.50.0-cp39-cp39-win32.whl", hash = "sha256:e1a8f9a57bbcc2e633aaf327e39830527f3c1f7add18c7580f3058fe9a0fa780"},
     {file = "grpcio_tools-1.50.0-cp39-cp39-win_amd64.whl", hash = "sha256:b7eb7a84d9171c0ae1550833f4a6ca52372bed9db0fa10f8c9dbe6ca65f97a8c"},
 ]
-joblib = [
-    {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"},
-    {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"},
-]
 numpy = [
     {file = "numpy-1.23.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2"},
     {file = "numpy-1.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f"},
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 50f993989..e2ba98a7b 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -14,7 +14,6 @@ grpcio = "^1.49.1"
 typer = "^0.6.1"
 grpcio-reflection = "^1.49.1"
 accelerate = "^0.12.0"
-joblib = "^1.2.0"
 bitsandbytes = "^0.35.1"
 
 [tool.poetry.extras]
diff --git a/server/text_generation/models/__init__.py b/server/text_generation/models/__init__.py
index d4f3cf8b7..1f141c3ce 100644
--- a/server/text_generation/models/__init__.py
+++ b/server/text_generation/models/__init__.py
@@ -15,7 +15,7 @@ def get_model(model_name: str, sharded: bool, quantize: bool) -> Model:
             return Model(model_name)
     else:
         if sharded:
-            raise ValueError("sharded is only supported for BLOOM")
+            raise ValueError("sharded is only supported for BLOOM models")
         if quantize:
             raise ValueError("Quantization is only supported for BLOOM models")
diff --git a/server/text_generation/models/model.py b/server/text_generation/models/model.py
index db1367b90..e585e4767 100644
--- a/server/text_generation/models/model.py
+++ b/server/text_generation/models/model.py
@@ -20,7 +20,7 @@ class Model:
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_name, torch_dtype=dtype, device_map="auto"
+            model_name, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None
         ).eval()
         self.num_heads = self.model.config.num_attention_heads
diff --git a/server/text_generation/utils.py b/server/text_generation/utils.py
index 926fc69cf..50cd4124f 100644
--- a/server/text_generation/utils.py
+++ b/server/text_generation/utils.py
@@ -1,11 +1,13 @@
+import concurrent
 import os
+import signal
 import torch
 import torch.distributed
 
 from datetime import timedelta
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from joblib import Parallel, delayed
 from huggingface_hub import HfApi, hf_hub_download, try_to_load_from_cache
 from huggingface_hub.utils import LocalEntryNotFoundError
 from tqdm import tqdm
 
@@ -124,8 +126,9 @@ def download_weights(model_name, extension=".safetensors"):
     download_function = partial(
         hf_hub_download, repo_id=model_name, local_files_only=False
     )
-    # FIXME: fix the overlapping progress bars
-    files = Parallel(n_jobs=5)(
-        delayed(download_function)(filename=filename) for filename in tqdm(filenames)
-    )
+
+    executor = ThreadPoolExecutor(max_workers=5)
+    futures = [executor.submit(download_function, filename=filename) for filename in filenames]
+    files = [file for file in tqdm(concurrent.futures.as_completed(futures), total=len(futures))]
+
     return files
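Note, not part of the patch itself: the utils.py hunk replaces joblib.Parallel with a plain ThreadPoolExecutor so a single tqdm bar can advance as downloads complete, which is what the removed "overlapping progress bars" FIXME was about. Below is a minimal standalone sketch of that pattern; the function name download_safetensors and the example model id are assumptions for illustration, and it additionally calls future.result() to collect the downloaded paths.

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from huggingface_hub import HfApi, hf_hub_download
from tqdm import tqdm


def download_safetensors(model_id: str, extension: str = ".safetensors"):
    # List the repo files and keep only those with the wanted extension.
    info = HfApi().model_info(model_id)
    filenames = [s.rfilename for s in info.siblings if s.rfilename.endswith(extension)]

    download = partial(hf_hub_download, repo_id=model_id, local_files_only=False)

    # At most five downloads in flight; tqdm advances as each future completes.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download, filename=name) for name in filenames]
        return [
            future.result()
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures))
        ]


if __name__ == "__main__":
    # Example model id only; any repo that ships .safetensors weights works.
    print(download_safetensors("bigscience/bloom-560m"))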