Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-12 04:44:52 +00:00)

docker-compose

commit 86b42f5f6d (parent 90184df79c)

.env (new file)

#VOLUME=$PWD/data
HF_TOKEN=
#NVIDIA_VISIBLE_DEVICES=all

#MODEL_ID=bigscience/bloom-560m
#REVISION=main

#NUM_SHARD=1

#QUANTIZE=awq
#QUANTIZE=eetq
#QUANTIZE=exl2 # since 2.0.5

#DTYPE=float16
#DTYPE=bfloat16

#TRUST_REMOTE_CODE=false

#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50

#MAX_BATCH_TOTAL_TOKENS=
# Overall this number should be the largest possible amount that fits in the
# remaining memory (after the model is loaded). Since the actual memory overhead
# depends on other parameters (e.g. quantization, flash attention, or the model
# implementation), text-generation-inference cannot infer this number
# automatically.

#PORT=8080
#JSON_OUTPUT=true # Output the logs in JSON format (useful for telemetry)

#CMD_ADDITIONAL_ARGUMENTS=""

################################################################################
# Qwen2 7B Instruct
################################################################################
MODEL_ID=Qwen/Qwen2-7B-Instruct
MAX_INPUT_TOKENS=32767
MAX_TOTAL_TOKENS=32768
MAX_BATCH_PREFILL_TOKENS=32818

################################################################################
# Meta Llama 3 8B Instruct
################################################################################
#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
#MAX_INPUT_TOKENS=8191
#MAX_TOTAL_TOKENS=8192
#MAX_BATCH_PREFILL_TOKENS=8242

################################################################################
# Mixtral 8X7B Instruct v0.1 - AWQ
################################################################################
#MODEL_ID=casperhansen/mixtral-instruct-awq
#MAX_INPUT_TOKENS=32767
#MAX_TOTAL_TOKENS=32768
#MAX_BATCH_PREFILL_TOKENS=32818
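
For reference, the three presets above all follow the same arithmetic. Taking the Qwen2 preset (the Qwen/Qwen2-7B-Instruct config sets max_position_embeddings = 32768):

    MAX_INPUT_TOKENS         = 32768 - 1  = 32767   (longest accepted prompt)
    MAX_TOTAL_TOKENS         = 32768                (prompt + generated tokens)
    MAX_BATCH_PREFILL_TOKENS = 32768 + 50 = 32818

The Meta Llama 3 preset applies the same pattern to max_position_embeddings = 8192. Note that the presets compute the prefill budget as MAX_TOTAL_TOKENS + 50, one token above the launcher's documented default of max_input_tokens + 50.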

docker-compose.yml (new file)

services:
  tgi:
    command:
      - "--hostname 0.0.0.0"
      - "--port 80"
      # extra launcher flags, appended from .env (empty by default)
      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
    container_name: tgi
    deploy:
      resources:
        reservations:
          devices:
            # reserve every local NVIDIA GPU for the container
            - capabilities: [gpu]
              count: all
              driver: nvidia
    env_file: .env
    hostname: tgi
    image: ghcr.io/huggingface/text-generation-inference:2.0 # any 2.0.x release
    ports:
      - "${PORT:-8080}:80"
    restart: unless-stopped
    shm_size: 1g # NCCL needs shared memory when sharding across multiple GPUs
    volumes:
      - "${VOLUME:-./data}:/data" # model weights cache
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
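
With both files in place, a minimal bring-up and smoke test looks roughly like this (assuming the default PORT=8080 mapping; the request mirrors the TGI quickstart example):

    docker compose up -d
    curl 127.0.0.1:8080/generate \
        -X POST \
        -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
        -H 'Content-Type: application/json'

Launcher flags without a dedicated variable can be injected per run, e.g. CMD_ADDITIONAL_ARGUMENTS="--max-concurrent-requests 256" docker compose up -d; variables set in the shell take precedence over the .env file.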