Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-09-12 04:44:52 +00:00
docker-compose
This commit is contained in:
parent 90184df79c
commit 86b42f5f6d
57
.env
Normal file
@@ -0,0 +1,57 @@
#VOLUME=$PWD/data
HF_TOKEN=
#NVIDIA_VISIBLE_DEVICES=all

#MODEL_ID=bigscience/bloom-560m
#REVISION=main

#NUM_SHARD=1

#QUANTIZE=awq
#QUANTIZE=eetq
#QUANTIZE=exl2 # 2.0.5

#DTYPE=float16
#DTYPE=bfloat16

#TRUST_REMOTE_CODE=false

#MAX_INPUT_TOKENS= # min(max_position_embeddings - 1, 4095)
#MAX_TOTAL_TOKENS= # min(max_position_embeddings, 4096)
#MAX_BATCH_PREFILL_TOKENS= # max_input_tokens + 50

#MAX_BATCH_TOTAL_TOKENS=
# Overall this number should be the largest possible amount that fits the
# remaining memory (after the model is loaded). Since the actual memory overhead
# depends on other parameters such as quantization, flash attention, or the
# model implementation, text-generation-inference cannot infer this number
# automatically.

#PORT=8080
#JSON_OUTPUT=true # Outputs the logs in JSON format (useful for telemetry)

#CMD_ADDITIONAL_ARGUMENTS=""

################################################################################
# Qwen2 7B Instruct
################################################################################
MODEL_ID=Qwen/Qwen2-7B-Instruct
MAX_INPUT_TOKENS=32767
MAX_TOTAL_TOKENS=32768
MAX_BATCH_PREFILL_TOKENS=32818

################################################################################
# Meta Llama 3 8B Instruct
################################################################################
#MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
#MAX_INPUT_TOKENS=8191
#MAX_TOTAL_TOKENS=8192
#MAX_BATCH_PREFILL_TOKENS=8242

################################################################################
# Mixtral 8X7B Instruct v0.1 - AWQ
################################################################################
#MODEL_ID=casperhansen/mixtral-instruct-awq
#MAX_INPUT_TOKENS=32767
#MAX_TOTAL_TOKENS=32768
#MAX_BATCH_PREFILL_TOKENS=32818
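Only one MODEL_ID block should be active at a time; to switch presets, comment out the current block and uncomment another. As the comments above indicate, the limits derive from the model's max_position_embeddings: Qwen2-7B-Instruct ships with 32768, giving MAX_INPUT_TOKENS = 32768 - 1 = 32767 and MAX_TOTAL_TOKENS = 32768, while the prefill budget adds roughly 50 tokens of headroom on top (the presets actually use max_input_tokens + 51, e.g. 32767 + 51 = 32818 and 8191 + 51 = 8242).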
25
docker-compose.yml
Normal file
@@ -0,0 +1,25 @@
services:
  tgi:
    command:
      - "--hostname 0.0.0.0"
      - "--port 80"
      - "${CMD_ADDITIONAL_ARGUMENTS:-}"
    container_name: tgi
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              count: all
              driver: nvidia
    env_file: .env
    hostname: tgi
    image: ghcr.io/huggingface/text-generation-inference:2.0 # 2.0.x
    ports:
      - "${PORT:-8080}:80"
    restart: unless-stopped
    shm_size: 1g
    volumes:
      - "${VOLUME:-./data}:/data"
      - /etc/localtime:/etc/localtime:ro
      - /etc/timezone:/etc/timezone:ro
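With both files in place, the stack can be brought up and smoke-tested from the host. A minimal sketch, assuming Docker Compose v2 and the NVIDIA Container Toolkit are installed, and that HF_TOKEN is filled in for gated models such as Meta Llama 3; the prompt and token count are illustrative:

$ docker compose up -d
$ docker compose logs -f tgi    # wait until the router reports it is ready
$ curl http://localhost:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'

The request goes to host port ${PORT:-8080}, which the compose file maps onto container port 80 where text-generation-launcher listens.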