Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-19 22:02:06 +00:00
Update neuron backend (#3098)
* feat(neuron): use AWS Neuron SDK 2.21.1
* feat(neuron): bump optimum-neuron version
* feat(neuron): tag latest image for local tests
* test(neuron): simplify sampling test
parent 5c5528e362
commit f01dc9e743
@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)
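The stage above pins the optimum-neuron sources to a GitHub release tag instead of a pypi wheel. As a rough local equivalent (a sketch, not part of the commit; the output directory name is arbitrary), the same fetch-and-strip could be reproduced in Python:

import tarfile
import urllib.request

TAG = "v0.1.0"  # tag pinned by this commit
URL = f"https://github.com/huggingface/optimum-neuron/archive/refs/tags/{TAG}.tar.gz"

archive, _ = urllib.request.urlretrieve(URL, "optimum-neuron-sources.tar.gz")
with tarfile.open(archive) as tar:
    for member in tar.getmembers():
        parts = member.name.split("/", 1)
        if len(parts) == 1:
            continue  # skip the top-level "optimum-neuron-<version>/" directory entry
        # Equivalent of tar's --strip-components=1: drop the leading directory.
        member.name = parts[1]
        tar.extract(member, path="optimum-neuron")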
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
-    aws-neuronx-dkms=2.18.20.0 \
-    aws-neuronx-collectives=2.22.33.0-d2128d1aa \
-    aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
-    aws-neuronx-tools=2.19.0.0 \
+    aws-neuronx-dkms=2.19.64.0 \
+    aws-neuronx-collectives=2.23.135.0-3e70920f2 \
+    aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
+    aws-neuronx-tools=2.20.204.0 \
     libxml2 \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean
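Because the apt packages are version-pinned, a built image can be checked against these pins by querying dpkg. A minimal sketch, assuming it runs inside the image; the script itself is illustrative and not part of the commit:

import subprocess

# Pins copied from the updated apt-get install step above.
PINNED = {
    "aws-neuronx-dkms": "2.19.64.0",
    "aws-neuronx-collectives": "2.23.135.0-3e70920f2",
    "aws-neuronx-runtime-lib": "2.23.112.0-9b5179492",
    "aws-neuronx-tools": "2.20.204.0",
}

for pkg, expected in PINNED.items():
    result = subprocess.run(
        ["dpkg-query", "-W", "-f=${Version}", pkg],
        capture_output=True, text=True,
    )
    installed = result.stdout.strip() or "not installed"
    marker = "ok" if installed == expected else "MISMATCH"
    print(f"{pkg}: {installed} (pinned {expected}) {marker}")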
@@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

 # Install manually torch CPU version to avoid pulling CUDA
 RUN pip3 install \
-    torch==2.1.2 \
-    torchvision==0.16.2 \
+    torch==2.5.1 \
+    torchvision==0.20.1 \
     --index-url https://download.pytorch.org/whl/cpu

 RUN pip3 install \
-    neuronx-cc==2.15.143.0 \
-    torch-neuronx==2.1.2.2.3.2 \
-    transformers-neuronx==0.12.313 \
-    neuronx-distributed==0.9.0 \
-    libneuronxla==2.0.5347.0 \
+    neuronx-cc==2.16.372.0 \
+    torch-neuronx==2.5.1.2.4.0 \
+    transformers-neuronx==0.13.322 \
+    neuronx-distributed==0.10.1 \
+    libneuronxla==2.1.681.0 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages
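The pinned wheels from the two pip3 install steps above can be cross-checked the same way, from Python itself. A hedged sketch (not part of the commit); CPU builds of torch typically report a "+cpu" local version suffix, hence the prefix match:

from importlib.metadata import PackageNotFoundError, version

# Pins copied from the updated pip3 install steps above.
PINNED = {
    "torch": "2.5.1",
    "torchvision": "0.20.1",
    "neuronx-cc": "2.16.372.0",
    "torch-neuronx": "2.5.1.2.4.0",
    "transformers-neuronx": "0.13.322",
    "neuronx-distributed": "0.10.1",
    "libneuronxla": "2.1.681.0",
}

for name, expected in PINNED.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (pinned {expected})")
        continue
    # Prefix match tolerates local version suffixes such as "2.5.1+cpu".
    marker = "ok" if installed.startswith(expected) else "MISMATCH"
    print(f"{name}: {installed} (pinned {expected}) {marker}")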
@@ -25,6 +25,7 @@ image:
 		--ulimit nofile=100000:100000 \
 		--build-arg VERSION=$(VERSION) \
 		-t text-generation-inference:$(VERSION)-neuron ${root_dir}
+	docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron

 install_server:
 	make -C ${mkfile_dir}/server install VERSION:=${VERSION}
@@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service):
         max_new_tokens=128,
         seed=42,
     )
-    sample_expectations = {
-        "gpt2": "Deep Learning",
-        "llama": "Deep Learning",
-        "mistral": "Deep learning",
-        "qwen2": "Deep Learning",
-        "granite": "Deep learning",
-    }
-    assert sample_expectations[service_name] in response
+    # The response must be different
+    assert not response.startswith(greedy_expectations[service_name])

-    # Sampling with stop sequence
-    stop_sequence = sample_expectations[service_name][-5:]
+    # Sampling with stop sequence (using one of the words returned from the previous test)
+    stop_sequence = response.split(" ")[-5]
     response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
         do_sample=True,
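The rewritten test drops the per-model table of expected sampled substrings: the only portable assertion is that the sampled output diverges from the greedy output, and the stop sequence is then taken from a word the model actually produced. A minimal sketch of that logic with hypothetical stand-in strings (the real test uses live model responses):

# Hypothetical stand-ins for real greedy and sampled model outputs.
greedy = "Deep Learning is a subset of machine learning"
sampled = "Deep learning refers to neural networks with many layers"

# With sampling enabled, the response must diverge from the greedy one.
assert not sampled.startswith(greedy)

# Reuse the fifth-from-last word of the sampled response as a stop sequence,
# so the stop word is one the model is known to produce ("neural" here).
stop_sequence = sampled.split(" ")[-5]
assert stop_sequence == "neural"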