mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-07-26 01:40:17 +00:00
add new target in dockerfile
parent 930842a7f0
commit 00d5ade28f
45 .github/workflows/build.yaml vendored
@@ -83,4 +83,49 @@ jobs:
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max
+
+  build-and-push-sagemaker-image:
+    needs:
+      - build-and-push-image
+    runs-on: ubuntu-latest
+    steps:
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v2.0.0
+        with:
+          install: true
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v2.1.0
+        with:
+          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+          registry: registry.internal.huggingface.tech
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4.3.0
+        with:
+          flavor: |
+            latest=auto
+          images: |
+            registry.internal.huggingface.tech/api-inference/community/text-generation-inference/sagemaker
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          platforms: 'linux/amd64'
+          target: sagemaker
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max
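For reference, a rough local equivalent of the new job's build-and-push step might look like the sketch below. This is an illustration only: it assumes Docker Buildx is installed and that you are already logged in to the internal registry, and the `:latest` tag stands in for the tags the metadata action generates.

    docker buildx build \
      --platform linux/amd64 \
      --target sagemaker \
      --tag registry.internal.huggingface.tech/api-inference/community/text-generation-inference/sagemaker:latest \
      --push \
      .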
17 Dockerfile
@@ -27,7 +27,7 @@ COPY router router
 COPY launcher launcher
 RUN cargo build --release
 
-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base
 
 ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8 \
@@ -37,7 +37,7 @@ ENV LANG=C.UTF-8 \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
     NUM_SHARD=1 \
-    PORT=8080 \
+    PORT=80 \
     CUDA_HOME=/usr/local/cuda \
     LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
     CONDA_DEFAULT_ENV=text-generation \
@@ -76,7 +76,16 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
 # Install launcher
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
 
-COPY entrypoint.sh entrypoint.sh
+# AWS Sagemaker compatible image
+FROM base as sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
 
 ENTRYPOINT ["./entrypoint.sh"]
+
+# Original image
+FROM base
+
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
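Because the new unnamed `FROM base` stage comes last, a plain `docker build` still produces the original image, and the SageMaker variant is opt-in via `--target`. A minimal sketch (the image names here are hypothetical):

    # Default build resolves to the last stage, i.e. the original image
    docker build -t text-generation-inference .

    # Build the SageMaker-compatible variant by selecting the named stage
    docker build --target sagemaker -t text-generation-inference:sagemaker .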
3 entrypoint.sh (deleted)
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-text-generation-launcher
8 launcher/src/main.rs
@@ -19,15 +19,15 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection};
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 struct Args {
-    #[clap(default_value = "bigscience/bloom-560m", long, env = "HF_MODEL_ID")]
+    #[clap(default_value = "bigscience/bloom-560m", long, env)]
     model_id: String,
-    #[clap(long, env = "HF_MODEL_REVISION")]
+    #[clap(long, env)]
     revision: Option<String>,
     #[clap(long, env)]
     sharded: Option<bool>,
-    #[clap(long, env = "SM_NUM_GPUS")]
+    #[clap(long, env)]
     num_shard: Option<usize>,
-    #[clap(long, env = "HF_MODEL_QUANTIZE")]
+    #[clap(long, env)]
     quantize: bool,
     #[clap(default_value = "128", long, env)]
     max_concurrent_requests: usize,
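Dropping the explicit `env = "..."` names means clap now derives each variable from the field name itself (MODEL_ID, REVISION, NUM_SHARD), and the SageMaker-specific names are translated by the new entrypoint script instead. A sketch of invoking the launcher with the derived names (values are illustrative):

    # clap picks these up via the env var names derived from the Args fields
    MODEL_ID=bigscience/bloom-560m NUM_SHARD=1 text-generation-launcher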
10 router/src/server.rs
@@ -529,13 +529,19 @@ pub async fn run(
     // Create router
     let app = Router::new()
         .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", ApiDoc::openapi()))
+        // Base routes
         .route("/", post(compat_generate))
-        .route("/invocations", post(compat_generate))
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
-        .route("/", get(health))
+        // AWS Sagemaker route
+        .route("/invocations", post(compat_generate))
+        // Base Health route
         .route("/health", get(health))
+        // Inference API health route
+        .route("/", get(health))
+        // AWS Sagemaker health route
         .route("/ping", get(health))
+        // Prometheus metrics route
         .route("/metrics", get(metrics))
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
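A quick smoke test of the reorganized routes against a locally running container could look like the following sketch; the port and the request payload shape are assumptions based on the compat_generate endpoint:

    curl -s http://localhost:8080/health    # base health route
    curl -s http://localhost:8080/ping      # AWS Sagemaker health route
    curl -s http://localhost:8080/metrics   # Prometheus metrics route
    curl -s -X POST http://localhost:8080/invocations \
      -H 'Content-Type: application/json' \
      -d '{"inputs": "Hello world"}'        # AWS Sagemaker inference route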
20 sagemaker-entrypoint.sh Executable file
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+if [[ -z "${HF_MODEL_ID}" ]]; then
+  echo "HF_MODEL_ID must be set"
+  exit 1
+fi
+
+if [[ -n "${HF_MODEL_REVISION}" ]]; then
+  export REVISION="${HF_MODEL_REVISION}"
+fi
+
+if [[ -n "${SM_NUM_GPUS}" ]]; then
+  export NUM_SHARD="${SM_NUM_GPUS}"
+fi
+
+if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then
+  export QUANTIZE="${HF_MODEL_QUANTIZE}"
+fi
+
+text-generation-launcher --port 8080
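For completeness, a sketch of exercising this entrypoint through the sagemaker image built earlier; the image tag and GPU flag are assumptions:

    docker run --gpus all -p 8080:8080 \
      -e HF_MODEL_ID=bigscience/bloom-560m \
      -e SM_NUM_GPUS=1 \
      text-generation-inference:sagemaker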