# =============================================================================
# AI Secretary System - Full Containerized Setup (with vLLM container)
# =============================================================================
#
# Use this for production deployment or CI/CD where you need everything in Docker.
# WARNING: the vLLM image is ~9 GB; the first pull can take a while.
#
# Usage:
#   docker compose -f docker-compose.yml -f docker-compose.full.yml up -d
#
# =============================================================================
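# Prerequisite for the GPU reservation below: the host needs an NVIDIA driver
# plus the NVIDIA Container Toolkit so Docker can expose the GPU to containers.
#
# To sanity-check what the two files merge into before starting anything
# (standard docker compose command, nothing repo-specific):
#
#   docker compose -f docker-compose.yml -f docker-compose.full.yml config
#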
services:
  orchestrator:
    environment:
      # Override to use the containerized vLLM
      - VLLM_API_URL=http://vllm:8000/v1
    # Drop host.docker.internal from the base file. A plain [] would be merged
    # (appended) rather than cleared, so use !override (Docker Compose v2.24+).
    extra_hosts: !override []
    depends_on:
      redis:
        condition: service_healthy
      vllm:
        condition: service_healthy
    volumes:
      # Persistent data
      - ./data:/app/data
      - ./logs:/app/logs
      - ./models:/app/models
      # Voice samples
      - ./Анна:/app/Анна:ro
      - ./Марина:/app/Марина:ro
      # Named volumes for caches (not host mounts)
      - tts_cache:/root/.local/share/tts
      - hf_cache:/root/.cache/huggingface
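
  # Note: depends_on.condition: service_healthy means the orchestrator waits
  # for vLLM's healthcheck to pass; on a cold hf_cache the model download can
  # consume most of the 180s start_period. To follow startup (plain docker
  # compose commands, not repo-specific):
  #
  #   docker compose -f docker-compose.yml -f docker-compose.full.yml logs -f vllm
  #   docker compose -f docker-compose.yml -f docker-compose.full.yml ps
  #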
  # ---------------------------------------------------------------------------
  # vLLM - Local LLM Inference Server (containerized)
  # ---------------------------------------------------------------------------
  vllm:
    image: vllm/vllm-openai:latest
    container_name: ai-secretary-vllm
    command: >
      --model ${VLLM_MODEL:-Qwen/Qwen2.5-7B-Instruct-AWQ}
      --gpu-memory-utilization 0.5
      --max-model-len 4096
      --dtype float16
      --max-num-seqs 32
      --enforce-eager
      --trust-remote-code
      --host 0.0.0.0
      --port 8000
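    # Flag notes (paraphrasing vLLM's documented engine arguments):
    #   --gpu-memory-utilization 0.5 pre-allocates only half of VRAM (vLLM's
    #     default is 0.9), presumably leaving headroom for TTS on the same GPU;
    #   --enforce-eager skips CUDA graph capture, trading some throughput for
    #     lower memory use;
    #   --max-num-seqs 32 caps how many sequences are batched concurrently.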
    volumes:
      - hf_cache:/root/.cache/huggingface
      # Fine-tuning adapters from the host (read-only)
      - ./finetune/adapters:/app/adapters:ro
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:-}
      - VLLM_LOGGING_LEVEL=WARNING
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 20
      start_period: 180s
    restart: unless-stopped
    networks:
      - ai-secretary
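
  # Smoke test once the stack reports healthy (a sketch: this file publishes
  # no host port for vLLM, so exec into the container; curl is available there
  # because the healthcheck already relies on it):
  #
  #   docker compose -f docker-compose.yml -f docker-compose.full.yml \
  #     exec vllm curl -s http://localhost:8000/v1/models
  #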
volumes:
  tts_cache:
    driver: local
  hf_cache:
    driver: local
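
# The named volumes above persist TTS assets and Hugging Face model downloads
# across container rebuilds. To reclaim the space later (volume names are
# prefixed with the compose project name; substitute yours for <project>):
#
#   docker volume ls | grep -E 'tts_cache|hf_cache'
#   docker volume rm <project>_tts_cache <project>_hf_cache
#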