diff --git a/Dockerfile.bake-peft-into-container b/Dockerfile.bake-peft-into-container
new file mode 100644
index 00000000..820ddea6
--- /dev/null
+++ b/Dockerfile.bake-peft-into-container
@@ -0,0 +1,5 @@
+FROM ghcr.io/ohmytofu-ai/tgi-angry:1.0.3-rc1
+COPY ./CUSTOM_MODELS/ /mnt/TOFU/HF_MODELS
+
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
diff --git a/README.md b/README.md
index 8eb99c73..a4000bf7 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@ TGI is well suited for distributed/ cloud burst/ on-demand workloads, yet HF's f
 ``
 
+## Llama with PEFT
+Append `--peft-model-path /my/local/peft-adapter-folder` to the `run-dev` command inside `server/Makefile` and follow the steps indicated in the previous section. The folder should contain an `adapter_config.json` file.
+
 ![image](https://github.com/huggingface/text-generation-inference/assets/3841370/38ba1531-ea0d-4851-b31a-a6d4ddc944b0)
diff --git a/server/Makefile b/server/Makefile
index 01e215e5..a8851c5f 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -24,6 +24,7 @@ install: gen-server install-torch
 	pip install -e ".[bnb, accelerate]"
 
 run-dev:
-	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
+	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
+	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve meta-llama/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 
 export-requirements:
 	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
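
Note on usage — a minimal sketch of how the new Dockerfile and the PEFT flag fit together. The adapter folder name `my-peft-adapter` and the image tag `tgi-peft-baked` are placeholders, and it is assumed here that `text-generation-launcher` in this image forwards `--peft-model-path` to the server the same way the `run-dev` target does:

```shell
# Local adapter folder baked into the image by the COPY line above
# (placeholder name; adapter_config.json is the file the README note requires,
#  with the adapter weights typically stored alongside it):
#   CUSTOM_MODELS/my-peft-adapter/adapter_config.json
#   CUSTOM_MODELS/my-peft-adapter/adapter_model.bin

# Build the image with the adapters baked in
docker build -f Dockerfile.bake-peft-into-container -t tgi-peft-baked .

# Run it, pointing the launcher at the baked-in adapter path
docker run --gpus all --shm-size 1g -p 8080:80 tgi-peft-baked \
    --model-id meta-llama/Llama-2-7b-chat-hf \
    --quantize bitsandbytes \
    --peft-model-path /mnt/TOFU/HF_MODELS/my-peft-adapter
```

Baking the adapters into the image keeps the container self-contained for the cloud-burst/on-demand workloads mentioned in the README, instead of relying on a runtime volume mount.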