From 86b42f5f6d4f8435f4c74c2964caa6ad21241677 Mon Sep 17 00:00:00 2001
From: Stefan Daniel Schwarz
Date: Wed, 12 Jun 2024 23:35:40 +0200
Subject: [PATCH] docker-compose

---
 .env               | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 docker-compose.yml | 25 ++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 .env
 create mode 100644 docker-compose.yml

diff --git a/.env b/.env
new file mode 100644
index 00000000..ae5938c0
--- /dev/null
+++ b/.env
@@ -0,0 +1,57 @@
+#VOLUME=$PWD/data
+HF_TOKEN=
+#NVIDIA_VISIBLE_DEVICES=all
+
+#MODEL_ID=bigscience/bloom-560m
+#REVISION=main
+
+#NUM_SHARD=1
+
+#QUANTIZE=awq
+#QUANTIZE=eetq
+#QUANTIZE=exl2 # 2.0.5
+
+#DTYPE=float
+#DTYPE=bfloat16
+
+#TRUST_REMOTE_CODE=false
+
+#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
+#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
+#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50
+
+#MAX_BATCH_TOTAL_TOKENS=
+# Overall this number should be the largest possible amount that fits the
+# remaining memory (after the model is loaded). Since the actual memory overhead
+# depends on other parameters, such as whether you're using quantization, flash
+# attention, or a particular model implementation, text-generation-inference
+# cannot infer this number automatically.
+
+#PORT=8080
+#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)
+
+#CMD_ADDITIONAL_ARGUMENTS=""
+
+################################################################################
+# Qwen2 7B Instruct
+################################################################################
+MODEL_ID=Qwen/Qwen2-7B-Instruct
+MAX_INPUT_TOKENS=32767
+MAX_TOTAL_TOKENS=32768
+MAX_BATCH_PREFILL_TOKENS=32818
+
+################################################################################
+# Meta Llama 3 8B Instruct
+################################################################################
+#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+#MAX_INPUT_TOKENS=8191
+#MAX_TOTAL_TOKENS=8192
+#MAX_BATCH_PREFILL_TOKENS=8242
+
+################################################################################
+# Mixtral 8X7B Instruct v0.1 - AWQ
+################################################################################
+#MODEL_ID=casperhansen/mixtral-instruct-awq
+#MAX_INPUT_TOKENS=32767
+#MAX_TOTAL_TOKENS=32768
+#MAX_BATCH_PREFILL_TOKENS=32818
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..737b155c
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,25 @@
+services:
+  tgi:
+    command:
+      - "--hostname=0.0.0.0"
+      - "--port=80"
+      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
+    container_name: tgi
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              count: all
+              driver: nvidia
+    env_file: .env
+    hostname: tgi
+    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
+    ports:
+      - "${PORT:-8080}:80"
+    restart: unless-stopped
+    shm_size: 1g
+    volumes:
+      - "${VOLUME:-./data}:/data"
+      - /etc/localtime:/etc/localtime:ro
+      - /etc/timezone:/etc/timezone:ro
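
Usage note (not part of the patch itself): once HF_TOKEN is filled in (needed
for gated models such as meta-llama/Meta-Llama-3-8B-Instruct), the stack can be
brought up and smoke-tested as sketched below. This assumes the defaults above,
i.e. the host port falls back to 8080; the prompt text and the max_new_tokens
value are purely illustrative. /generate is text-generation-inference's
standard generation endpoint.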
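
    docker compose up -d
    docker compose logs -f tgi    # follow startup until the model is loaded
                                  # and the server reports it is ready

    # minimal generation request against the mapped host port
    curl http://127.0.0.1:8080/generate \
        -X POST \
        -H 'Content-Type: application/json' \
        -d '{"inputs": "What is Docker Compose?", "parameters": {"max_new_tokens": 64}}'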
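
Note that each entry under command: in docker-compose.yml is handed to
text-generation-launcher as a single argument, which is why the flags use the
--flag=value form: an item like "--hostname 0.0.0.0" would arrive as one token,
space included, and fail argument parsing. By the same logic,
CMD_ADDITIONAL_ARGUMENTS expands into a single list element, so it can carry at
most one extra --flag=value token.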