docker-compose

Author: Stefan Daniel Schwarz, 2024-06-12 23:35:40 +02:00
parent 90184df79c
commit 86b42f5f6d
2 changed files with 82 additions and 0 deletions

.env (new file, 57 lines added)

@@ -0,0 +1,57 @@
#VOLUME=$PWD/data
HF_TOKEN=
#NVIDIA_VISIBLE_DEVICES=all
#MODEL_ID=bigscience/bloom-560m
#REVISION=main
#NUM_SHARD=1
#QUANTIZE=awq
#QUANTIZE=eetq
#QUANTIZE=exl2 # 2.0.5
#DTYPE=float
#DTYPE=bfloat16
#TRUST_REMOTE_CODE=false
#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50
#MAX_BATCH_TOTAL_TOKENS=
# Overall this number should be the largest possible amount that fits the
# remaining memory (after the model is loaded). Since the actual memory overhead
# depends on other parameters like if you're using quantization, flash attention
# or the model implementation, text-generation-inference cannot infer this
# number automatically.
#PORT=8080
#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)
#CMD_ADDITIONAL_ARGUMENTS=""
################################################################################
# Qwen2 7B Instruct
################################################################################
MODEL_ID=Qwen/Qwen2-7B-Instruct
MAX_INPUT_TOKENS=32767
MAX_TOTAL_TOKENS=32768
MAX_BATCH_PREFILL_TOKENS=32818
################################################################################
# Meta Llama 3 8B Instruct
################################################################################
#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
#MAX_INPUT_TOKENS=8191
#MAX_TOTAL_TOKENS=8192
#MAX_BATCH_PREFILL_TOKENS=8242
################################################################################
# Mixtral 8X7B Instruct v0.1 - AWQ
################################################################################
#MODEL_ID=casperhansen/mixtral-instruct-awq
#MAX_INPUT_TOKENS=32767
#MAX_TOTAL_TOKENS=32768
#MAX_BATCH_PREFILL_TOKENS=32818
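
The sizing rules in the comments above (MAX_TOTAL_TOKENS capped at the model's context window, MAX_INPUT_TOKENS one token below it, MAX_BATCH_PREFILL_TOKENS roughly 50 tokens of headroom on top) can be sketched as a small helper. This is purely illustrative Python; the function name is made up and the formulas only mirror the comments in this file:

def suggest_tgi_token_limits(max_position_embeddings: int, cap: int = 4096) -> dict:
    """Illustrative helper mirroring the sizing comments in this .env file."""
    max_total_tokens = min(max_position_embeddings, cap)   # MAX_TOTAL_TOKENS
    max_input_tokens = max_total_tokens - 1                # MAX_INPUT_TOKENS
    max_batch_prefill_tokens = max_input_tokens + 50       # MAX_BATCH_PREFILL_TOKENS
    return {
        "MAX_INPUT_TOKENS": max_input_tokens,
        "MAX_TOTAL_TOKENS": max_total_tokens,
        "MAX_BATCH_PREFILL_TOKENS": max_batch_prefill_tokens,
    }

# Qwen2-7B-Instruct is configured above with a 32768-token context window:
print(suggest_tgi_token_limits(32768, cap=32768))
# -> {'MAX_INPUT_TOKENS': 32767, 'MAX_TOTAL_TOKENS': 32768, 'MAX_BATCH_PREFILL_TOKENS': 32817}
# Note: the presets in this file add the 50-token headroom to MAX_TOTAL_TOKENS instead (32818, 8242).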

docker-compose.yml (new file, 25 lines added)

@@ -0,0 +1,25 @@
services:
  tgi:
    command:
      # Each list item is passed to text-generation-launcher as a single
      # argument, so use "--option=value" rather than "--option value".
      - "--hostname=0.0.0.0"
      - "--port=80"
      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
    container_name: tgi
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: all
              driver: nvidia
    env_file: .env
    hostname: tgi
    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
    ports:
      - "${PORT:-8080}:80"
    restart: unless-stopped
    shm_size: 1g
    volumes:
      - "${VOLUME:-./data}:/data"
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
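
Once the stack is up ("docker compose up -d"), the server can be smoke-tested against TGI's /generate endpoint on the published port. A minimal Python sketch, assuming PORT in .env was left at its default of 8080 and the model has finished downloading; for instruct models the OpenAI-compatible /v1/chat/completions route is usually the better fit because it applies the chat template:

import requests

# Assumption: the compose stack is running locally and PORT defaults to 8080.
resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "Explain what a docker-compose.yml file does in one sentence.",
        "parameters": {"max_new_tokens": 128},
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["generated_text"])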