From 86b42f5f6d4f8435f4c74c2964caa6ad21241677 Mon Sep 17 00:00:00 2001
From: Stefan Daniel Schwarz
Date: Wed, 12 Jun 2024 23:35:40 +0200
Subject: [PATCH] docker-compose

---
 .env               | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 docker-compose.yml | 25 ++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 .env
 create mode 100644 docker-compose.yml

diff --git a/.env b/.env
new file mode 100644
index 00000000..ae5938c0
--- /dev/null
+++ b/.env
@@ -0,0 +1,57 @@
+#VOLUME=$PWD/data
+HF_TOKEN=
+#NVIDIA_VISIBLE_DEVICES=all
+
+#MODEL_ID=bigscience/bloom-560m
+#REVISION=main
+
+#NUM_SHARD=1
+
+#QUANTIZE=awq
+#QUANTIZE=eetq
+#QUANTIZE=exl2 # 2.0.5
+
+#DTYPE=float
+#DTYPE=bfloat16
+
+#TRUST_REMOTE_CODE=false
+
+#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
+#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
+#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50
+
+#MAX_BATCH_TOTAL_TOKENS=
+# Overall this number should be the largest possible amount that fits the
+# remaining memory (after the model is loaded). Since the actual memory overhead
+# depends on other parameters, such as whether you're using quantization, flash
+# attention, or a particular model implementation, text-generation-inference
+# cannot infer this number automatically.
+
+#PORT=8080
+#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)
+
+#CMD_ADDITIONAL_ARGUMENTS=""
+
+################################################################################
+# Qwen2 7B Instruct
+################################################################################
+MODEL_ID=Qwen/Qwen2-7B-Instruct
+MAX_INPUT_TOKENS=32767
+MAX_TOTAL_TOKENS=32768
+MAX_BATCH_PREFILL_TOKENS=32818
+
+################################################################################
+# Meta Llama 3 8B Instruct
+################################################################################
+#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+#MAX_INPUT_TOKENS=8191
+#MAX_TOTAL_TOKENS=8192
+#MAX_BATCH_PREFILL_TOKENS=8242
+
+################################################################################
+# Mixtral 8X7B Instruct v0.1 - AWQ
+################################################################################
+#MODEL_ID=casperhansen/mixtral-instruct-awq
+#MAX_INPUT_TOKENS=32767
+#MAX_TOTAL_TOKENS=32768
+#MAX_BATCH_PREFILL_TOKENS=32818
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..737b155c
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,25 @@
+services:
+  tgi:
+    command:
+      - "--hostname=0.0.0.0"
+      - "--port=80"
+      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
+    container_name: tgi
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              count: all
+              driver: nvidia
+    env_file: .env
+    hostname: tgi
+    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
+    ports:
+      - "${PORT:-8080}:80"
+    restart: unless-stopped
+    shm_size: 1g
+    volumes:
+      - "${VOLUME:-./data}:/data"
+      - /etc/localtime:/etc/localtime:ro
+      - /etc/timezone:/etc/timezone:ro
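
Usage note (not part of the patch itself): once HF_TOKEN is filled in (needed
for gated models such as meta-llama/Meta-Llama-3-8B-Instruct), the stack can be
brought up and smoke-tested as sketched below. This assumes the defaults above,
i.e. the host port falls back to 8080; the prompt text and the max_new_tokens
value are purely illustrative. /generate is text-generation-inference's
standard generation endpoint.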
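
    docker compose up -d
    docker compose logs -f tgi    # follow startup until the model is loaded
                                  # and the server reports it is ready

    # minimal generation request against the mapped host port
    curl http://127.0.0.1:8080/generate \
        -X POST \
        -H 'Content-Type: application/json' \
        -d '{"inputs": "What is Docker Compose?", "parameters": {"max_new_tokens": 64}}'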
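
Note that each entry under command: in docker-compose.yml is handed to
text-generation-launcher as a single argument, which is why the flags use the
--flag=value form: an item like "--hostname 0.0.0.0" would arrive as one token,
space included, and fail argument parsing. By the same logic,
CMD_ADDITIONAL_ARGUMENTS expands into a single list element, so it can carry at
most one extra --flag=value token.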