From bc95ef2e8bf0939b043b4992e8915c2f3024877e Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Tue, 18 Feb 2025 13:50:25 +0000
Subject: [PATCH] review: do not use latest tag

---
 backends/neuron/Makefile       |  1 -
 docs/source/backends/neuron.md | 16 ++++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile
index 06674971..6c5002ce 100644
--- a/backends/neuron/Makefile
+++ b/backends/neuron/Makefile
@@ -25,7 +25,6 @@ image:
 		--ulimit nofile=100000:100000 \
 		--build-arg VERSION=$(VERSION) \
 		-t text-generation-inference:$(VERSION)-neuron ${root_dir}
-	docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron
 
 install_server:
 	make -C ${mkfile_dir}/server install VERSION:=${VERSION}
diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md
index 50f70fb2..4d292879 100644
--- a/docs/source/backends/neuron.md
+++ b/docs/source/backends/neuron.md
@@ -31,7 +31,7 @@ deployment instructions in the model card:
 The service is launched simply by running the text-generation-inference container with two sets of parameters:
 
 ```
-docker run ghcr.io/huggingface/text-generation-inference:latest-neuron
+docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron
 ```
 
 - system parameters are used to map ports, volumes and devices between the host and the service,
@@ -59,7 +59,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
 ```
 
@@ -70,7 +70,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --device=/dev/neuron0 \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
 ```
 
@@ -92,7 +92,7 @@ docker run -p 8080:80 \
        -e HF_TOKEN=${HF_TOKEN} \
        -e HF_AUTO_CAST_TYPE="fp16" \
        -e HF_NUM_CORES=2 \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id meta-llama/Meta-Llama-3-8B \
        --max-batch-size 1 \
        --max-input-length 3164 \
@@ -101,7 +101,7 @@ docker run -p 8080:80 \
 
 ### Using a model exported to a local path
 
-Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference:latest-neuron) locally.
+Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference) locally.
 
 You can then deploy the service inside the shared volume:
 
@@ -109,7 +109,7 @@ You can then deploy the service inside the shared volume:
 docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id /data/
 ```
 
@@ -126,7 +126,7 @@ docker run -p 8080:80 \
        -v $(pwd)/data:/data \
        --privileged \
        -e HF_TOKEN=${HF_TOKEN} \
-       ghcr.io/huggingface/text-generation-inference:latest-neuron:latest \
+       ghcr.io/huggingface/text-generation-inference:3.1.0-neuron \
        --model-id /
 ```
 
@@ -135,7 +135,7 @@ docker run -p 8080:80 \
 Use the following command to list the available service parameters:
 
 ```
-docker run ghcr.io/huggingface/text-generation-inference:latest-neuron --help
+docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron --help
 ```
 
 The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.