diff --git a/Dockerfile b/Dockerfile index 592f1f72..a94bc3a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ ENV LANG=C.UTF-8 \ MODEL_ID=bigscience/bloom-560m \ QUANTIZE=false \ NUM_SHARD=1 \ - PORT=80 \ + PORT=8080 \ CUDA_HOME=/usr/local/cuda \ LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \ CONDA_DEFAULT_ENV=text-generation \ diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 80466fe6..f015a393 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -19,13 +19,13 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; #[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] struct Args { - #[clap(default_value = "bigscience/bloom-560m", long, env)] + #[clap(default_value = "bigscience/bloom-560m", long, env = "HF_MODEL_ID")] model_id: String, #[clap(long, env)] revision: Option, #[clap(long, env)] sharded: Option, - #[clap(long, env)] + #[clap(long, env = "SM_NUM_GPUS")] num_shard: Option, #[clap(long, env)] quantize: bool, diff --git a/router/src/server.rs b/router/src/server.rs index 3b63ec8a..5e0535c6 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -530,10 +530,12 @@ pub async fn run( let app = Router::new() .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", ApiDoc::openapi())) .route("/", post(compat_generate)) + .route("/invocations", post(compat_generate)) .route("/generate", post(generate)) .route("/generate_stream", post(generate_stream)) .route("/", get(health)) .route("/health", get(health)) + .route("/ping", get(health)) .route("/metrics", get(metrics)) .layer(Extension(compat_return_full_text)) .layer(Extension(infer))