docker-compose

Author: Stefan Daniel Schwarz, 2024-06-12 23:35:40 +02:00
parent 90184df79c
commit 86b42f5f6d
2 changed files with 82 additions and 0 deletions

.env (new file, 57 lines added)

@@ -0,0 +1,57 @@
#VOLUME=$PWD/data
HF_TOKEN=
#NVIDIA_VISIBLE_DEVICES=all
#MODEL_ID=bigscience/bloom-560m
#REVISION=main
#NUM_SHARD=1
#QUANTIZE=awq
#QUANTIZE=eetq
#QUANTIZE=exl2 # 2.0.5
#DTYPE=float
#DTYPE=bfloat16
#TRUST_REMOTE_CODE=false
#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50
#MAX_BATCH_TOTAL_TOKENS=
# Overall this number should be the largest possible amount that fits the
# remaining memory (after the model is loaded). Since the actual memory overhead
# depends on other parameters like if you're using quantization, flash attention
# or the model implementation, text-generation-inference cannot infer this
# number automatically.
#PORT=8080
#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)
#CMD_ADDITIONAL_ARGUMENTS=""
################################################################################
# Qwen2 7B Instruct
################################################################################
MODEL_ID=Qwen/Qwen2-7B-Instruct
MAX_INPUT_TOKENS=32767
MAX_TOTAL_TOKENS=32768
MAX_BATCH_PREFILL_TOKENS=32818
################################################################################
# Meta Llama 3 8B Instruct
################################################################################
#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
#MAX_INPUT_TOKENS=8191
#MAX_TOTAL_TOKENS=8192
#MAX_BATCH_PREFILL_TOKENS=8242
################################################################################
# Mixtral 8X7B Instruct v0.1 - AWQ
################################################################################
#MODEL_ID=casperhansen/mixtral-instruct-awq
#MAX_INPUT_TOKENS=32767
#MAX_TOTAL_TOKENS=32768
#MAX_BATCH_PREFILL_TOKENS=32818
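
The sizing rules in the comments above (MAX_TOTAL_TOKENS capped at the model's context window, MAX_INPUT_TOKENS one token below it, MAX_BATCH_PREFILL_TOKENS roughly 50 tokens of headroom on top) can be sketched as a small helper. This is purely illustrative Python; the function name is made up and the formulas only mirror the comments in this file:

def suggest_tgi_token_limits(max_position_embeddings: int, cap: int = 4096) -> dict:
    """Illustrative helper mirroring the sizing comments in this .env file."""
    max_total_tokens = min(max_position_embeddings, cap)   # MAX_TOTAL_TOKENS
    max_input_tokens = max_total_tokens - 1                # MAX_INPUT_TOKENS
    max_batch_prefill_tokens = max_input_tokens + 50       # MAX_BATCH_PREFILL_TOKENS
    return {
        "MAX_INPUT_TOKENS": max_input_tokens,
        "MAX_TOTAL_TOKENS": max_total_tokens,
        "MAX_BATCH_PREFILL_TOKENS": max_batch_prefill_tokens,
    }

# Qwen2-7B-Instruct is configured above with a 32768-token context window:
print(suggest_tgi_token_limits(32768, cap=32768))
# -> {'MAX_INPUT_TOKENS': 32767, 'MAX_TOTAL_TOKENS': 32768, 'MAX_BATCH_PREFILL_TOKENS': 32817}
# Note: the presets in this file add the 50-token headroom to MAX_TOTAL_TOKENS instead (32818, 8242).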

docker-compose.yml (new file, 25 lines added)

@@ -0,0 +1,25 @@
services:
  tgi:
    command:
      # Each list item is passed to text-generation-launcher as a single
      # argument, so use "--option=value" rather than "--option value".
      - "--hostname=0.0.0.0"
      - "--port=80"
      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
    container_name: tgi
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: all
              driver: nvidia
    env_file: .env
    hostname: tgi
    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
    ports:
      - "${PORT:-8080}:80"
    restart: unless-stopped
    shm_size: 1g
    volumes:
      - "${VOLUME:-./data}:/data"
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
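
Once the stack is up ("docker compose up -d"), the server can be smoke-tested against TGI's /generate endpoint on the published port. A minimal Python sketch, assuming PORT in .env was left at its default of 8080 and the model has finished downloading; for instruct models the OpenAI-compatible /v1/chat/completions route is usually the better fit because it applies the chat template:

import requests

# Assumption: the compose stack is running locally and PORT defaults to 8080.
resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "Explain what a docker-compose.yml file does in one sentence.",
        "parameters": {"max_new_tokens": 128},
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["generated_text"])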