diff --git a/Cargo.lock b/Cargo.lock
index 33d75f0eb..8f09c3a56 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2175,6 +2175,16 @@ dependencies = [
  "tracing-core",
 ]
 
+[[package]]
+name = "tracing-serde"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
 [[package]]
 name = "tracing-subscriber"
 version = "0.3.16"
@@ -2182,11 +2192,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70"
 dependencies = [
  "nu-ansi-term",
+ "serde",
+ "serde_json",
  "sharded-slab",
  "smallvec",
  "thread_local",
  "tracing-core",
  "tracing-log",
+ "tracing-serde",
 ]
 
 [[package]]
diff --git a/Dockerfile b/Dockerfile
index a2bf199a8..ebe79609a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -73,4 +73,4 @@ COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/loca
 # Install launcher
 COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher
 
-CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS
\ No newline at end of file
+CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS --json-output
\ No newline at end of file
diff --git a/README.md b/README.md
index 2d2d49d37..0c4f6f713 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# LLM Text Generation Inference
+# Text Generation Inference
@@ -6,12 +6,12 @@
-A Rust and gRPC server for large language models text generation inference.
+A Rust and gRPC server for text generation inference.
 
 ## Features
 
-- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Dynamic bathing of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput
+- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
 - 45ms per token generation for BLOOM with 8xA100 80GB
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index ae3558b1c..1779c0519 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -10,4 +10,4 @@ clap = { version = "4.0.15", features = ["derive", "env"] }
 ctrlc = "3.2.3"
 subprocess = "0.2.9"
 tracing = "0.1.37"
-tracing-subscriber = "0.3.16"
+tracing-subscriber = { version = "0.3.16", features = ["json"] }
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index f94dd589c..ed6cf8e59 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -39,11 +39,11 @@ struct Args {
     master_addr: String,
     #[clap(default_value = "29500", long, env)]
     master_port: usize,
+    #[clap(long, env)]
+    json_output: bool,
 }
 
 fn main() -> ExitCode {
-    tracing_subscriber::fmt().compact().with_ansi(false).init();
-
     // Pattern match configuration
     let Args {
         model_name,
@@ -57,8 +57,15 @@ fn main() -> ExitCode {
         shard_uds_path,
         master_addr,
         master_port,
+        json_output,
     } = Args::parse();
 
+    if json_output {
+        tracing_subscriber::fmt().json().init();
+    } else {
+        tracing_subscriber::fmt().compact().init();
+    }
+
     // By default we only have one master shard
     let num_shard = num_shard.unwrap_or(1);
 
@@ -139,24 +146,30 @@ fn main() -> ExitCode {
     // All shard started
     // Start webserver
     tracing::info!("Starting Webserver");
+    let mut argv = vec![
+        "text-generation-router".to_string(),
+        "--max-concurrent-requests".to_string(),
+        max_concurrent_requests.to_string(),
+        "--max-input-length".to_string(),
+        max_input_length.to_string(),
+        "--max-batch-size".to_string(),
+        max_batch_size.to_string(),
+        "--max-waiting-tokens".to_string(),
+        max_waiting_tokens.to_string(),
+        "--port".to_string(),
+        port.to_string(),
+        "--master-shard-uds-path".to_string(),
+        format!("{}-0", shard_uds_path),
+        "--tokenizer-name".to_string(),
+        model_name,
+    ];
+
+    if json_output {
+        argv.push("--json-output".to_string());
+    }
+
     let mut webserver = match Popen::create(
-        &[
-            "text-generation-router",
-            "--max-concurrent-requests",
-            &max_concurrent_requests.to_string(),
-            "--max-input-length",
-            &max_input_length.to_string(),
-            "--max-batch-size",
-            &max_batch_size.to_string(),
-            "--max-waiting-tokens",
-            &max_waiting_tokens.to_string(),
-            "--port",
-            &port.to_string(),
-            "--master-shard-uds-path",
-            &format!("{}-0", shard_uds_path),
-            "--tokenizer-name",
-            &model_name,
-        ],
+        &argv,
         PopenConfig {
             stdout: Redirection::Pipe,
             stderr: Redirection::Pipe,
diff --git a/router/Cargo.toml b/router/Cargo.toml
index da9518bfd..f99069d37 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -24,5 +24,5 @@ thiserror = "1.0.37"
 tokenizers = "0.13.0"
 tokio = { version = "1.21.1", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tracing = "0.1.36"
-tracing-subscriber = "0.3.15"
+tracing-subscriber = { version = "0.3.15", features = ["json"] }
 
diff --git a/router/src/main.rs b/router/src/main.rs
index b24ec4c98..6497f23ee 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -25,6 +25,8 @@ struct Args {
     tokenizer_name: String,
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
+    #[clap(long, env)]
+    json_output: bool,
 }
 
 fn main() -> Result<(), std::io::Error> {
@@ -40,11 +42,16 @@ fn main() -> Result<(), std::io::Error> {
         master_shard_uds_path,
         tokenizer_name,
         validation_workers,
+        json_output,
     } = args;
 
-    tracing_subscriber::fmt().compact().with_ansi(false).init();
+    if json_output {
+        tracing_subscriber::fmt().json().init();
+    } else {
+        tracing_subscriber::fmt().compact().init();
+    }
 
-    if validation_workers == 1 {
+    if validation_workers == 0 {
         panic!("validation_workers must be > 0");
     }
diff --git a/server/poetry.lock b/server/poetry.lock
index 5e635a673..3c92903ec 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -88,14 +88,6 @@ grpcio = ">=1.50.0"
 protobuf = ">=4.21.6,<5.0dev"
 setuptools = "*"
 
-[[package]]
-name = "joblib"
-version = "1.2.0"
-description = "Lightweight pipelining with Python functions"
-category = "main"
-optional = false
-python-versions = ">=3.7"
-
 [[package]]
 name = "numpy"
 version = "1.23.4"
@@ -210,10 +202,13 @@ category = "main"
 optional = false
 python-versions = ">=3.7"
 
+[extras]
+bnb = ["bitsandbytes"]
+
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "50d9d44577a0222f125c770732d5f88807378573bd7386036eb5c79fc2a7c552"
+content-hash = "224b1e379d6105fe911bff4563946a90dfa6ff5918cf2e7be59f8d4f7c5cd7cf"
 
 [metadata.files]
 accelerate = [
@@ -330,10 +325,6 @@ grpcio-tools = [
     {file = "grpcio_tools-1.50.0-cp39-cp39-win32.whl", hash = "sha256:e1a8f9a57bbcc2e633aaf327e39830527f3c1f7add18c7580f3058fe9a0fa780"},
     {file = "grpcio_tools-1.50.0-cp39-cp39-win_amd64.whl", hash = "sha256:b7eb7a84d9171c0ae1550833f4a6ca52372bed9db0fa10f8c9dbe6ca65f97a8c"},
 ]
-joblib = [
-    {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"},
-    {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"},
-]
 numpy = [
     {file = "numpy-1.23.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2"},
     {file = "numpy-1.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f"},
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 50f993989..e2ba98a7b 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -14,7 +14,6 @@ grpcio = "^1.49.1"
 typer = "^0.6.1"
 grpcio-reflection = "^1.49.1"
 accelerate = "^0.12.0"
-joblib = "^1.2.0"
 bitsandbytes = "^0.35.1"
 
 [tool.poetry.extras]
diff --git a/server/text_generation/models/__init__.py b/server/text_generation/models/__init__.py
index d4f3cf8b7..1f141c3ce 100644
--- a/server/text_generation/models/__init__.py
+++ b/server/text_generation/models/__init__.py
@@ -15,7 +15,7 @@ def get_model(model_name: str, sharded: bool, quantize: bool) -> Model:
             return Model(model_name)
     else:
         if sharded:
-            raise ValueError("sharded is only supported for BLOOM")
+            raise ValueError("sharded is only supported for BLOOM models")
         if quantize:
             raise ValueError("Quantization is only supported for BLOOM models")
diff --git a/server/text_generation/models/model.py b/server/text_generation/models/model.py
index db1367b90..e585e4767 100644
--- a/server/text_generation/models/model.py
+++ b/server/text_generation/models/model.py
@@ -20,7 +20,7 @@ class Model:
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
         self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_name, torch_dtype=dtype, device_map="auto"
+            model_name, torch_dtype=dtype, device_map="auto" if torch.cuda.is_available() else None
         ).eval()
         self.num_heads = self.model.config.num_attention_heads
diff --git a/server/text_generation/utils.py b/server/text_generation/utils.py
index 926fc69cf..50cd4124f 100644
--- a/server/text_generation/utils.py
+++ b/server/text_generation/utils.py
@@ -1,11 +1,13 @@
+import concurrent
 import os
+import signal
 import torch
 import torch.distributed
 
 from datetime import timedelta
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from joblib import Parallel, delayed
 from huggingface_hub import HfApi, hf_hub_download, try_to_load_from_cache
 from huggingface_hub.utils import LocalEntryNotFoundError
 from tqdm import tqdm
 
@@ -124,8 +126,9 @@ def download_weights(model_name, extension=".safetensors"):
     download_function = partial(
         hf_hub_download, repo_id=model_name, local_files_only=False
     )
-    # FIXME: fix the overlapping progress bars
-    files = Parallel(n_jobs=5)(
-        delayed(download_function)(filename=filename) for filename in tqdm(filenames)
-    )
+
+    executor = ThreadPoolExecutor(max_workers=5)
+    futures = [executor.submit(download_function, filename=filename) for filename in filenames]
+    files = [file for file in tqdm(concurrent.futures.as_completed(futures), total=len(futures))]
+
     return files
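Note, not part of the patch itself: the utils.py hunk replaces joblib.Parallel with a plain ThreadPoolExecutor so a single tqdm bar can advance as downloads complete, which is what the removed "overlapping progress bars" FIXME was about. Below is a minimal standalone sketch of that pattern; the function name download_safetensors and the example model id are assumptions for illustration, and it additionally calls future.result() to collect the downloaded paths.

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from huggingface_hub import HfApi, hf_hub_download
from tqdm import tqdm


def download_safetensors(model_id: str, extension: str = ".safetensors"):
    # List the repo files and keep only those with the wanted extension.
    info = HfApi().model_info(model_id)
    filenames = [s.rfilename for s in info.siblings if s.rfilename.endswith(extension)]

    download = partial(hf_hub_download, repo_id=model_id, local_files_only=False)

    # At most five downloads in flight; tqdm advances as each future completes.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download, filename=name) for name in filenames]
        return [
            future.result()
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures))
        ]


if __name__ == "__main__":
    # Example model id only; any repo that ships .safetensors weights works.
    print(download_safetensors("bigscience/bloom-560m"))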