mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-10 20:04:52 +00:00)
Adding PEFT loading instructions and Dockerfile
This commit is contained in:
parent 1659b871b6, commit d23cc5857f
Dockerfile.bake-peft-into-container (new file, 5 additions)
```diff
@@ -0,0 +1,5 @@
+FROM ghcr.io/ohmytofu-ai/tgi-angry:1.0.3-rc1
+COPY ./CUSTOM_MODELS/ /mnt/TOFU/HF_MODELS
+
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
```
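This Dockerfile bakes local model and adapter folders into the image itself, so the container needs no volume mount at runtime. A minimal build-and-run sketch; the image tag, port mapping, and `--model-id` value are assumptions for illustration, not part of this commit:

```sh
# Build from the repo root; ./CUSTOM_MODELS/ must exist in the build
# context and hold the model/adapter folders to bake into the image.
docker build -f Dockerfile.bake-peft-into-container -t tgi-peft-baked .

# Arguments after the image name replace the CMD and are passed to
# text-generation-launcher (the ENTRYPOINT).
docker run --gpus all -p 8080:80 tgi-peft-baked \
    --model-id /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf \
    --json-output
```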
README.md

```diff
@@ -15,6 +15,9 @@ TGI is well suited for distributed/ cloud burst/ on-demand workloads, yet HF's f
 
 `</endOfMissionStatement>`
 
+## LLaMA with PEFT
+Append `--peft-model-path /my/local/peft-adapter-folder` to the `run-dev` command inside `server/Makefile` and follow the steps described in the previous section. The folder should contain an `adapter_config.json` file.
+
 <div align="center">
 
 
```
server/Makefile

```diff
@@ -24,6 +24,7 @@ install: gen-server install-torch
 	pip install -e ".[bnb, accelerate]"
 
 run-dev:
+	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 	# SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=1 text_generation_server/cli.py serve meta-llama/Llama-2-7b-chat-hf --quantize bitsandbytes --peft-model-path /mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2
 
 export-requirements:
 	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
```
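For context, the `--peft-model-path` flag presumably follows the standard `peft` pattern: load the adapter on top of the base model, then merge it before serving. A minimal sketch of that pattern; the merge step and output path are assumptions about this fork's behavior, not code taken from it:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load the base checkpoint, then attach the LoRA adapter from the folder
# containing adapter_config.json and the adapter weights.
base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = PeftModel.from_pretrained(
    base,
    "/mnt/TOFU/HF_MODELS/Llama-2-7b-chat-hf-function-calling-adapters-v2",
)

# Fold the adapter weights into the base model so the merged checkpoint
# can be served like any ordinary model.
merged = model.merge_and_unload()
merged.save_pretrained("/mnt/TOFU/HF_MODELS/merged-model")  # output path is illustrative
```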