From bc95ef2e8bf0939b043b4992e8915c2f3024877e Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Tue, 18 Feb 2025 13:50:25 +0000
Subject: [PATCH] review: do not use latest tag

---
 backends/neuron/Makefile       |  1 -
 docs/source/backends/neuron.md | 16 ++++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile
index 06674971..6c5002ce 100644
--- a/backends/neuron/Makefile
+++ b/backends/neuron/Makefile
@@ -25,7 +25,6 @@ image:
 		--ulimit nofile=100000:100000 \
 		--build-arg VERSION=$(VERSION) \
 		-t text-generation-inference:$(VERSION)-neuron ${root_dir}
-	docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron
 
 install_server:
 	make -C ${mkfile_dir}/server install VERSION:=${VERSION}
diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md
index 50f70fb2..4d292879 100644
--- a/docs/source/backends/neuron.md
+++ b/docs/source/backends/neuron.md
@@ -31,7 +31,7 @@ deployment instructions in the model card:
 The service is launched simply by running the text-generation-inference container with two sets of parameters:
 
 ```
-docker run ghcr.io/huggingface/text-generation-inference:latest-neuron
+docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron
 ```
 
 - system parameters are used to map ports, volumes and devices between the host and the service,
@@ -59,7 +59,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
 ```
 
@@ -70,7 +70,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --device=/dev/neuron0 \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
 ```
 
@@ -92,7 +92,7 @@ docker run -p 8080:80 \
        -e HF_TOKEN=${HF_TOKEN} \
        -e HF_AUTO_CAST_TYPE="fp16" \
        -e HF_NUM_CORES=2 \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id meta-llama/Meta-Llama-3-8B \
        --max-batch-size 1 \
        --max-input-length 3164 \
@@ -101,7 +101,7 @@ docker run -p 8080:80 \
 
 ### Using a model exported to a local path
 
-Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference:latest-neuron) locally.
+Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference) locally.
 
 You can then deploy the service inside the shared volume:
 
@@ -109,7 +109,7 @@ You can then deploy the service inside the shared volume:
 docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id /data/
 ```
 
@@ -126,7 +126,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id /
 ```
 
@@ -135,7 +135,7 @@ docker run -p 8080:80 \
 Use the following command to list the available service parameters:
 
 ```
-docker run ghcr.io/huggingface/text-generation-inference:latest-neuron --help
+docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron --help
 ```
 
 The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.