mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 20:04:52 +00:00)
Adding PEFT loading instructions and Dockerfile
This commit is contained in:
parent 1659b871b6, commit d23cc5857f
Dockerfile.bake-peft-into-container (new file, 5 additions)
```diff
@@ -0,0 +1,5 @@
+FROM ghcr.io/ohmytofu-ai/tgi-angry:1.0.3-rc1
+COPY ./CUSTOM_MODELS/ /mnt/TOFU/HF_MODELS
+
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
```
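This Dockerfile bakes local model and adapter folders into the image itself, so the container needs no volume mount at runtime. A minimal build-and-run sketch; the image tag, port mapping, and `--model-id` value are assumptions for illustration, not part of this commit:

```sh
# Build from the repo root; ./CUSTOM_MODELS/ must exist in the build
# context and hold the model/adapter folders to bake into the image.
docker build -f Dockerfile.bake-peft-into-container -t tgi-peft-baked .

# Arguments after the image name replace the CMD and are passed to
# text-generation-launcher (the ENTRYPOINT).
docker run --gpus all -p 8080:80 tgi-peft-baked \
    --model-id /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf \
    --json-output
```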
README.md

```diff
@@ -15,6 +15,9 @@ TGI is well suited for distributed/ cloud burst/ on-demand workloads, yet HF's f
 
 `</endOfMissionStatement>`
 
+## LLaMA with PEFT
+Append `--peft-model-path /my/local/peft-adapter-folder` to the `run-dev` command inside `server/Makefile` and follow the steps described in the previous section. The folder should contain an `adapter_config.json` file.
+
 <div align="center">
 
 
```
server/Makefile

```diff
@@ -24,6 +24,7 @@ install: gen-server install-torch
 	pip install -e ".[bnb, accelerate]"
 
 run-dev:
+	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve meta-llama/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 
 export-requirements:
 	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
```
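For context, the `--peft-model-path` flag presumably follows the standard `peft` pattern: load the adapter on top of the base model, then merge it before serving. A minimal sketch of that pattern; the merge step and output path are assumptions about this fork's behavior, not code taken from it:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load the base checkpoint, then attach the LoRA adapter from the folder
# containing adapter_config.json and the adapter weights.
base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = PeftModel.from_pretrained(
    base,
    "/mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2",
)

# Fold the adapter weights into the base model so the merged checkpoint
# can be served like any ordinary model.
merged = model.merge_and_unload()
merged.save_pretrained("/mnt/TOFU/HF_MODELS/merged-model")  # output path is illustrative
```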