Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-12 04:44:52 +00:00
docker-compose
This commit is contained in:
parent 90184df79c
commit 86b42f5f6d
57
.env
Normal file
@@ -0,0 +1,57 @@
#VOLUME=$PWD/data
HF_TOKEN=
#NVIDIA_VISIBLE_DEVICES=all

#MODEL_ID=bigscience/bloom-560m
#REVISION=main

#NUM_SHARD=1

#QUANTIZE=awq
#QUANTIZE=eetq
#QUANTIZE=exl2 # 2.0.5

#DTYPE=float16
#DTYPE=bfloat16

#TRUST_REMOTE_CODE=false

#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50

#MAX_BATCH_TOTAL_TOKENS=
# Overall this number should be the largest possible amount that fits the
# remaining memory (after the model is loaded). Since the actual memory overhead
# depends on other parameters such as quantization, flash attention, or the
# model implementation, text-generation-inference cannot infer this number
# automatically.

#PORT=8080
#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)

#CMD_ADDITIONAL_ARGUMENTS=""

################################################################################
# Qwen2 7B Instruct
################################################################################
MODEL_ID=Qwen/Qwen2-7B-Instruct
MAX_INPUT_TOKENS=32767
MAX_TOTAL_TOKENS=32768
MAX_BATCH_PREFILL_TOKENS=32818

################################################################################
# Meta Llama 3 8B Instruct
################################################################################
#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
#MAX_INPUT_TOKENS=8191
#MAX_TOTAL_TOKENS=8192
#MAX_BATCH_PREFILL_TOKENS=8242

################################################################################
# Mixtral 8X7B Instruct v0.1 - AWQ
################################################################################
#MODEL_ID=casperhansen/mixtral-instruct-awq
#MAX_INPUT_TOKENS=32767
#MAX_TOTAL_TOKENS=32768
#MAX_BATCH_PREFILL_TOKENS=32818
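Only one MODEL_ID block should be active at a time; to switch presets, comment out the current block and uncomment another. As the comments above indicate, the limits derive from the model's max_position_embeddings: Qwen2-7B-Instruct ships with 32768, giving MAX_INPUT_TOKENS = 32768 - 1 = 32767 and MAX_TOTAL_TOKENS = 32768, while the prefill budget adds roughly 50 tokens of headroom on top (the presets actually use max_input_tokens + 51, e.g. 32767 + 51 = 32818 and 8191 + 51 = 8242).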
25
docker-compose.yml
Normal file
@@ -0,0 +1,25 @@
services:
  tgi:
    command:
      - "--hostname 0.0.0.0"
      - "--port 80"
      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
    container_name: tgi
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: all
              driver: nvidia
    env_file: .env
    hostname: tgi
    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
    ports:
      - "${PORT:-8080}:80"
    restart: unless-stopped
    shm_size: 1g
    volumes:
      - "${VOLUME:-./data}:/data"
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
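With both files in place, the stack can be brought up and smoke-tested from the host. A minimal sketch, assuming Docker Compose v2 and the NVIDIA Container Toolkit are installed, and that HF_TOKEN is filled in for gated models such as Meta Llama 3; the prompt and token count are illustrative:

$ docker compose up -d
$ docker compose logs -f tgi    # wait until the router reports it is ready
$ curl http://localhost:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'

The request goes to host port ${PORT:-8080}, which the compose file maps onto container port 80 where text-generation-launcher listens.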