diff --git a/Dockerfile b/Dockerfile
index 47f6549e..865e2980 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -157,7 +157,7 @@ RUN make build-vllm-cuda
 # Build megablocks
 FROM kernel-builder as megablocks-builder
 
-RUN pip install git+https://github.com/OlivierDehaene/megablocks@16c5350f7b313a5ab52ab109feb45f159f1e5d3d
+RUN pip install git+https://github.com/OlivierDehaene/megablocks@3de516d9b774ea5dd1b79c68e2c475880f3983e7
 
 # Text Generation Inference base image
 FROM nvidia/cuda:12.1.0-base-ubuntu20.04 as base
diff --git a/router/src/server.rs b/router/src/server.rs
index 5f41fd5e..fe1b8309 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -629,6 +629,9 @@ pub async fn run(
     // Batch size buckets
     let batch_size_matcher = Matcher::Full(String::from("tgi_batch_next_size"));
     let batch_size_buckets: Vec<f64> = (0..1024).map(|x| (x + 1) as f64).collect();
+    // Speculated tokens buckets
+    let skipped_matcher = Matcher::Full(String::from("tgi_request_skipped_tokens"));
+    let skipped_buckets: Vec<f64> = (0..shard_info.speculate + 1).map(|x| x as f64).collect();
 
     // Prometheus handler
     let builder = PrometheusBuilder::new()
@@ -641,6 +644,8 @@ pub async fn run(
         .set_buckets_for_metric(max_new_tokens_matcher, &max_new_tokens_buckets)
         .unwrap()
         .set_buckets_for_metric(batch_size_matcher, &batch_size_buckets)
+        .unwrap()
+        .set_buckets_for_metric(skipped_matcher, &skipped_buckets)
         .unwrap();
     let prom_handle = builder
         .install_recorder()
diff --git a/server/Makefile b/server/Makefile
index 86c56ee7..d271e894 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -17,7 +17,7 @@ gen-server:
 	touch text_generation_server/pb/__init__.py
 
 install-megablocks:
-	pip install git+https://github.com/OlivierDehaene/megablocks@16c5350f7b313a5ab52ab109feb45f159f1e5d3d
+	pip install git+https://github.com/OlivierDehaene/megablocks@3de516d9b774ea5dd1b79c68e2c475880f3983e7
 
 install: gen-server
 	pip install pip --upgrade