Docker Compose Production Deployment

Overview

Deploying a Docker Compose application to production means weighing security, reliability, scalability, and maintainability. This article walks through production best practices, covering configuration tuning, security hardening, monitoring and alerting, and CI/CD integration.

Production Configuration

1. Production Compose File

# docker-compose.prod.yml
version: '3.8'

services:
  # Web application
  web:
    image: myapp:${APP_VERSION:-latest}
    restart: unless-stopped
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.25'
          memory: 512M
      update_config:
        parallelism: 1
        delay: 30s
        failure_action: rollback
        monitor: 60s
        order: start-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
    environment:
      - NODE_ENV=production
      - DATABASE_URL=${DATABASE_URL}
      - REDIS_URL=${REDIS_URL}
      - JWT_SECRET=${JWT_SECRET}
      - API_KEY=${API_KEY}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    networks:
      - frontend
      - backend
    volumes:
      - app_logs:/app/logs
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.web.rule=Host(`myapp.example.com`)"
      - "traefik.http.routers.web.tls=true"
      - "traefik.http.routers.web.tls.certresolver=letsencrypt"

  # Load balancer
  traefik:
    image: traefik:v2.8
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    environment:
      - TRAEFIK_API_DASHBOARD=true
      - TRAEFIK_API_INSECURE=false
      - TRAEFIK_ENTRYPOINTS_WEB_ADDRESS=:80
      - TRAEFIK_ENTRYPOINTS_WEBSECURE_ADDRESS=:443
      - TRAEFIK_PROVIDERS_DOCKER=true
      - TRAEFIK_PROVIDERS_DOCKER_EXPOSEDBYDEFAULT=false
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_EMAIL=${ACME_EMAIL}
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_STORAGE=/acme/acme.json
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_HTTPCHALLENGE_ENTRYPOINT=web
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # a named volume mounts as a directory, so keep acme.json inside it
      - traefik_acme:/acme
    networks:
      - frontend
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.dashboard.rule=Host(`traefik.example.com`)"
      - "traefik.http.routers.dashboard.tls=true"
      - "traefik.http.routers.dashboard.service=api@internal"
      - "traefik.http.routers.dashboard.middlewares=auth"
      - "traefik.http.middlewares.auth.basicauth.users=${TRAEFIK_AUTH}"

  # Database
  database:
    image: postgres:13-alpine
    restart: unless-stopped
    environment:
      - POSTGRES_DB=${DB_NAME}
      - POSTGRES_USER=${DB_USER}
      - POSTGRES_PASSWORD=${DB_PASSWORD}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./backups:/backups
    networks:
      - backend
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '0.5'
          memory: 2G
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Cache
  redis:
    image: redis:6-alpine
    restart: unless-stopped
    command: |
      redis-server
      --requirepass ${REDIS_PASSWORD}
      --maxmemory 1gb
      --maxmemory-policy allkeys-lru
      --save 900 1
      --save 300 10
      --save 60 10000
    volumes:
      - redis_data:/data
    networks:
      - backend
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 1G
        reservations:
          cpus: '0.1'
          memory: 512M
    healthcheck:
      # redis-cli must authenticate once requirepass is set
      test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  postgres_data:
    driver: local
    driver_opts:
      type: ext4
      device: /dev/disk/by-label/postgres-data
      o: defaults,noatime

  redis_data:
    driver: local
    driver_opts:
      type: ext4
      device: /dev/disk/by-label/redis-data
      o: defaults,noatime

  app_logs:
    driver: local

  traefik_acme:
    driver: local

networks:
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16

  backend:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.21.0.0/16
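
Note that the deploy.update_config and deploy.restart_policy keys above only take effect under Docker Swarm (docker stack deploy); plain docker compose up honors the resource limits and replica count but ignores the rolling-update settings. A minimal deployment sketch, assuming the compose file and the env file shown in this article sit in the project root:

# Bring the stack up with plain Compose (sketch)
docker compose -f docker-compose.prod.yml --env-file .env.production pull
docker compose -f docker-compose.prod.yml --env-file .env.production up -d

# To get the rolling-update behaviour, run it as a Swarm stack instead
# (stack deploy does not read .env files, so export the variables first)
docker swarm init
docker stack deploy -c docker-compose.prod.yml myapp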

2. Environment Variable Configuration

# .env.production
# Application settings
APP_VERSION=v1.2.3
NODE_ENV=production

# Database settings
DB_NAME=myapp_prod
DB_USER=myapp_user
DB_PASSWORD=secure_db_password_here
DATABASE_URL=postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}

# Redis settings
REDIS_PASSWORD=secure_redis_password_here
REDIS_URL=redis://:${REDIS_PASSWORD}@redis:6379

# Security settings
JWT_SECRET=very_secure_jwt_secret_key_here
API_KEY=secure_api_key_here
SESSION_SECRET=secure_session_secret_here

# SSL certificate settings
ACME_EMAIL=admin@example.com
TRAEFIK_AUTH=admin:$2y$10$encrypted_password_hash_here

# Monitoring settings
PROMETHEUS_ENABLED=true
GRAFANA_ADMIN_PASSWORD=secure_grafana_password

# Logging settings
LOG_LEVEL=info
LOG_FORMAT=json

# Backup settings
BACKUP_SCHEDULE=0 2 * * *
BACKUP_RETENTION_DAYS=30
S3_BACKUP_BUCKET=myapp-backups
AWS_ACCESS_KEY_ID=your_aws_access_key
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
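
Because Compose only loads a file literally named .env automatically, .env.production must be passed with --env-file. The file holds credentials, so lock down its permissions, and render the interpolated config before deploying to catch substitution mistakes. Note the bcrypt hash in TRAEFIK_AUTH: if a hash is ever written directly into a compose file rather than the env file, every $ must be doubled as $$. A quick sketch:

# Protect the env file and verify interpolation (sketch)
chmod 600 .env.production
docker compose -f docker-compose.prod.yml --env-file .env.production config

# Generate the TRAEFIK_AUTH value (htpasswd ships in the apache2-utils package)
htpasswd -nbB admin 'your-admin-password'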

3. Multi-Environment Configuration Management

# docker-compose.override.yml (development)
version: '3.8'

services:
  web:
    build:
      context: .
      dockerfile: Dockerfile.dev
    volumes:
      - .:/app
      - /app/node_modules
    environment:
      - NODE_ENV=development
      - DEBUG=app:*
    ports:
      - "3000:3000"

  database:
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_DB=myapp_dev

  redis:
    ports:
      - "6379:6379"

# docker-compose.staging.yml
version: '3.8'

services:
  web:
    image: myapp:staging
    environment:
      - NODE_ENV=staging
      - DATABASE_URL=${STAGING_DATABASE_URL}
    labels:
      - "traefik.http.routers.web.rule=Host(`staging.myapp.example.com`)"

  database:
    environment:
      - POSTGRES_DB=${STAGING_DB_NAME}
      - POSTGRES_USER=${STAGING_DB_USER}
      - POSTGRES_PASSWORD=${STAGING_DB_PASSWORD}
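
Compose merges the -f files left to right, with later files overriding earlier ones; docker-compose.override.yml is picked up automatically alongside docker-compose.yml, while the staging file must be named explicitly. For example:

# Development: docker-compose.yml + docker-compose.override.yml load automatically
docker compose up -d

# Staging: name both files explicitly
docker compose -f docker-compose.yml -f docker-compose.staging.yml up -d

# Inspect the merged result before deploying
docker compose -f docker-compose.yml -f docker-compose.staging.yml config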

Security Configuration

1. Container Security

version: '3.8'

services:
  secure_app:
    image: myapp:latest
    # Run as a non-root user
    user: "1001:1001"

    # Read-only root filesystem
    read_only: true

    # Writable tmpfs mounts
    tmpfs:
      - /tmp:noexec,nosuid,size=100m
      - /var/run:noexec,nosuid,size=50m

    # Security options
    security_opt:
      - no-new-privileges:true
      - apparmor:docker-default

    # Capability restrictions
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE  # add only if the app must bind a privileged port

    # Resource limits
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
          pids: 100

    # Health check
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

    # Environment (sensitive values come from secrets)
    environment:
      - NODE_ENV=production
    secrets:
      - db_password
      - jwt_secret
      - api_key

    networks:
      - app_network

secrets:
  db_password:
    external: true
  jwt_secret:
    external: true
  api_key:
    external: true

networks:
  app_network:
    driver: bridge
    internal: true
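
Once the service is up, it is worth confirming the hardening actually applies. A quick check, assuming the running container is named secure_app:

# Verify the hardening took effect (sketch)
docker inspect --format '{{.HostConfig.ReadonlyRootfs}}' secure_app   # expect: true
docker inspect --format '{{.HostConfig.CapDrop}}' secure_app          # expect: [ALL]
docker exec secure_app id -u                                          # expect: 1001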

2. Network Security

version: '3.8'

services:
  # Frontend proxy (public-facing)
  proxy:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
      - dmz
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  # Application service (DMZ)
  app:
    image: myapp:latest
    networks:
      - dmz
      - backend
    # No ports published to the host

  # Database (internal only)
  database:
    image: postgres:13-alpine
    networks:
      - backend
    # Fully isolated; reachable only on the internal network

  # Firewall rules
  firewall:
    image: alpine
    network_mode: host
    privileged: true
    volumes:
      - ./firewall/rules.sh:/rules.sh:ro
    command: sh /rules.sh

networks:
  # Public entry tier
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/24

  # DMZ network
  dmz:
    driver: bridge
    ipam:
      config:
        - subnet: 172.21.0.0/24

  # Internal network
  backend:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.22.0.0/24

#!/bin/bash
# firewall/rules.sh

# Flush existing rules
iptables -F
iptables -X
iptables -t nat -F
iptables -t nat -X

# Default policies
iptables -P INPUT DROP
iptables -P FORWARD DROP
iptables -P OUTPUT ACCEPT

# Allow loopback
iptables -A INPUT -i lo -j ACCEPT
iptables -A OUTPUT -o lo -j ACCEPT

# Allow established connections
iptables -A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT

# Allow SSH (restrict the source subnet)
iptables -A INPUT -p tcp --dport 22 -s 192.168.1.0/24 -j ACCEPT

# Allow HTTP/HTTPS, rate-limited to blunt floods. The limited ACCEPTs must be
# the only ACCEPT rules for these ports: an unconditional ACCEPT placed before
# them would always match first and the limits would never apply.
iptables -A INPUT -p tcp --dport 80 -m limit --limit 25/minute --limit-burst 100 -j ACCEPT
iptables -A INPUT -p tcp --dport 443 -m limit --limit 25/minute --limit-burst 100 -j ACCEPT

# Allow Docker network traffic
iptables -A INPUT -i docker0 -j ACCEPT
iptables -A FORWARD -i docker0 -o docker0 -j ACCEPT

# Log dropped connections
iptables -A INPUT -j LOG --log-prefix "IPTABLES-DROPPED: "
iptables -A INPUT -j DROP

echo "Firewall rules applied successfully"

3. Secrets Management

#!/bin/bash
# scripts/setup-secrets.sh

# Create Docker secrets (docker secret requires Swarm mode)
echo "Setting up Docker secrets..."

# Database password (printf avoids embedding a trailing newline in the secret)
printf '%s' "$DB_PASSWORD" | docker secret create db_password -

# JWT secret
printf '%s' "$JWT_SECRET" | docker secret create jwt_secret -

# API key
printf '%s' "$API_KEY" | docker secret create api_key -

# SSL certificates
docker secret create ssl_cert ./ssl/cert.pem
docker secret create ssl_key ./ssl/key.pem

echo "Secrets created successfully"

# Using an external secrets manager (Vault)
version: '3.8'

services:
  app:
    image: myapp:latest
    environment:
      - VAULT_ADDR=${VAULT_ADDR}
      - VAULT_TOKEN=${VAULT_TOKEN}
    volumes:
      - ./scripts/get-secrets.sh:/get-secrets.sh:ro
    command: |
      sh -c '
        # Load secrets from Vault (POSIX sh has no "source"; use ".")
        . /get-secrets.sh
        # Start the application
        npm start
      '

  # Vault agent
  vault_agent:
    image: vault:latest
    volumes:
      - ./vault/agent.hcl:/vault/config/agent.hcl:ro
      - vault_secrets:/vault/secrets
    command: vault agent -config=/vault/config/agent.hcl
    networks:
      - backend

volumes:
  vault_secrets:
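
The get-secrets.sh script referenced above is not shown; a minimal hypothetical sketch, assuming the Vault agent renders each secret to a file under /vault/secrets and the app service also mounts the vault_secrets volume:

#!/bin/sh
# scripts/get-secrets.sh (hypothetical sketch)
# Export agent-rendered secrets as environment variables before the app starts
export DB_PASSWORD="$(cat /vault/secrets/db_password)"
export JWT_SECRET="$(cat /vault/secrets/jwt_secret)"
export API_KEY="$(cat /vault/secrets/api_key)"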

Monitoring and Logging

1. Monitoring Setup

# docker-compose.monitoring.yml
version: '3.8'

services:
  # Prometheus monitoring
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - monitoring
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  # Grafana dashboards
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SMTP_ENABLED=true
      - GF_SMTP_HOST=${SMTP_HOST}
      - GF_SMTP_USER=${SMTP_USER}
      - GF_SMTP_PASSWORD=${SMTP_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro
    networks:
      - monitoring
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 1G

  # Alertmanager alerting
  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - monitoring

  # Node Exporter
  node_exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # renamed from --collector.filesystem.ignored-mount-points in node_exporter 1.x
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - monitoring

  # cAdvisor
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
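
Because Prometheus is started with --web.enable-lifecycle, configuration changes can be applied without restarting the container:

# Reload Prometheus after editing prometheus.yml
curl -X POST http://localhost:9090/-/reload

# Spot-check that the scrape targets are healthy
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'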

2. Log Management

# docker-compose.logging.yml
version: '3.8'

services:
  # ELK Stack - Elasticsearch
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.15.0
    environment:
      - discovery.type=single-node
      - "ES_JAVA_OPTS=-Xms1g -Xmx1g"
      - xpack.security.enabled=false
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    networks:
      - logging
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G

  # Logstash
  logstash:
    image: docker.elastic.co/logstash/logstash:7.15.0
    volumes:
      - ./logging/logstash.conf:/usr/share/logstash/pipeline/logstash.conf:ro
    environment:
      - "LS_JAVA_OPTS=-Xmx1g -Xms1g"
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Kibana
  kibana:
    image: docker.elastic.co/kibana/kibana:7.15.0
    ports:
      - "5601:5601"
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Fluentd log collection (note: the stock fluent/fluentd image does not bundle
  # fluent-plugin-elasticsearch, which the config below requires; build a small
  # custom image that installs the plugin)
  fluentd:
    image: fluent/fluentd:v1.14-1
    ports:
      - "24224:24224"
    volumes:
      - ./logging/fluent.conf:/fluentd/etc/fluent.conf:ro
      - fluentd_data:/fluentd/log
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Application (ships logs via the fluentd driver)
  app:
    image: myapp:latest
    logging:
      driver: fluentd
      options:
        fluentd-address: localhost:24224
        tag: myapp.{{.Name}}
        fluentd-async-connect: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"
    networks:
      - app_network
      - logging

volumes:
  elasticsearch_data:
  fluentd_data:

networks:
  logging:
  app_network:

# logging/fluent.conf
<source>
  @type forward
  port 24224
  bind 0.0.0.0
</source>

<filter myapp.**>
  @type parser
  key_name log
  reserve_data true
  <parse>
    @type json
  </parse>
</filter>

<match myapp.**>
  @type elasticsearch
  host elasticsearch
  port 9200
  index_name myapp-logs
  type_name _doc

  <buffer>
    @type file
    path /fluentd/log/myapp.buffer
    flush_mode interval
    flush_interval 10s
    chunk_limit_size 10MB
    queue_limit_length 32
    retry_max_interval 30
    retry_forever true
  </buffer>
</match>
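
A quick end-to-end test of the pipeline, assuming Fluentd and Elasticsearch are already up on the host:

# Send a JSON log line through the fluentd driver
docker run --rm \
  --log-driver=fluentd \
  --log-opt fluentd-address=localhost:24224 \
  --log-opt tag=myapp.test \
  alpine echo '{"level":"info","msg":"pipeline-test"}'

# A few seconds later it should appear in the myapp-logs index
curl -s 'http://localhost:9200/myapp-logs/_search?q=msg:pipeline-test&pretty'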

CI/CD Integration

1. GitLab CI/CD

# .gitlab-ci.yml
stages:
  - test
  - build
  - deploy-staging
  - deploy-production

variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  IMAGE_NAME: $CI_REGISTRY_IMAGE
  IMAGE_TAG: $CI_COMMIT_SHA

services:
  - docker:20.10.16-dind

before_script:
  - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY

# Test stage
test:
  stage: test
  image: node:16-alpine
  script:
    - npm ci
    - npm run test
    - npm run lint
  coverage: '/Lines\s*:\s*(\d+\.?\d*)%/'
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: coverage/cobertura-coverage.xml
  only:
    - merge_requests
    - main
    - develop

# Build stage
build:
  stage: build
  image: docker:20.10.16
  script:
    - docker build -t $IMAGE_NAME:$IMAGE_TAG .
    - docker tag $IMAGE_NAME:$IMAGE_TAG $IMAGE_NAME:latest
    - docker push $IMAGE_NAME:$IMAGE_TAG
    - docker push $IMAGE_NAME:latest
  only:
    - main
    - develop

# Deploy to staging
deploy-staging:
  stage: deploy-staging
  image: alpine:latest
  before_script:
    - apk add --no-cache openssh-client docker-compose
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $STAGING_HOST >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
  script:
    - |
      ssh $STAGING_USER@$STAGING_HOST << EOF
        cd /opt/myapp
        export APP_VERSION=$IMAGE_TAG
        docker-compose -f docker-compose.yml -f docker-compose.staging.yml pull
        docker-compose -f docker-compose.yml -f docker-compose.staging.yml up -d
        docker system prune -f
      EOF
  environment:
    name: staging
    url: https://staging.myapp.example.com
  only:
    - develop

# Deploy to production
deploy-production:
  stage: deploy-production
  image: alpine:latest
  before_script:
    - apk add --no-cache openssh-client docker-compose
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $PRODUCTION_HOST >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
  script:
    - |
      ssh $PRODUCTION_USER@$PRODUCTION_HOST << EOF
        cd /opt/myapp

        # Back up the current database
        docker-compose -f docker-compose.prod.yml exec -T database pg_dump -U \$DB_USER \$DB_NAME > backup_\$(date +%Y%m%d_%H%M%S).sql

        # Deploy the new version
        export APP_VERSION=$IMAGE_TAG
        docker-compose -f docker-compose.prod.yml pull
        docker-compose -f docker-compose.prod.yml up -d

        # Health check
        sleep 30
        if ! curl -f http://localhost/health; then
          echo "Health check failed, rolling back..."
          # docker-compose has no rollback command; redeploy the previous tag
          # instead (assumes it was recorded beforehand, e.g. in .last_version)
          export APP_VERSION=\$(cat .last_version)
          docker-compose -f docker-compose.prod.yml up -d
          exit 1
        fi

        # Clean up old images
        docker system prune -f
      EOF
  environment:
    name: production
    url: https://myapp.example.com
  when: manual
  only:
    - main

2. GitHub Actions

# .github/workflows/deploy.yml
name: Deploy to Production

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Setup Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '16'
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Run tests
        run: |
          npm run test
          npm run lint

      - name: Upload coverage reports
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage/lcov.info

  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Log in to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=sha,prefix={{branch}}-
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    environment: production

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Deploy to production
        uses: appleboy/ssh-action@v0.1.5
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.SSH_PRIVATE_KEY }}
          script: |
            cd /opt/myapp

            # Set environment variables
            export APP_VERSION=${{ github.sha }}

            # Back up the database
            docker-compose -f docker-compose.prod.yml exec -T database pg_dump -U $DB_USER $DB_NAME > backup_$(date +%Y%m%d_%H%M%S).sql

            # Pull new images
            docker-compose -f docker-compose.prod.yml pull

            # Rolling update
            docker-compose -f docker-compose.prod.yml up -d

            # Wait for services to start
            sleep 30

            # Health check
            if ! curl -f http://localhost/health; then
              echo "Health check failed, rolling back..."
              docker-compose -f docker-compose.prod.yml down
              # restore-the-previous-version logic goes here
              exit 1
            fi

            # Clean up old images
            docker system prune -f

      - name: Notify deployment
        if: always()
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          channel: '#deployments'
          webhook_url: ${{ secrets.SLACK_WEBHOOK }}

3. Deployment Script

#!/bin/bash
# scripts/deploy.sh

set -e

# Configuration
APP_NAME="myapp"
DEPLOY_ENV=${1:-production}
APP_VERSION=${2:-latest}
BACKUP_RETENTION_DAYS=30

# Colored output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

log() {
    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

warn() {
    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

error() {
    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
    exit 1
}

# Verify the environment
check_environment() {
    log "Checking deployment environment..."

    if [ ! -f "docker-compose.${DEPLOY_ENV}.yml" ]; then
        error "Compose file for environment '${DEPLOY_ENV}' not found"
    fi

    if [ ! -f ".env.${DEPLOY_ENV}" ]; then
        error "Environment file for '${DEPLOY_ENV}' not found"
    fi

    # Check for Docker and Docker Compose
    command -v docker >/dev/null 2>&1 || error "Docker is not installed"
    command -v docker-compose >/dev/null 2>&1 || error "Docker Compose is not installed"

    log "Environment check passed"
}

# Back up the database
backup_database() {
    log "Creating database backup..."

    local backup_file="backup_${DEPLOY_ENV}_$(date +%Y%m%d_%H%M%S).sql"

    if docker-compose -f docker-compose.${DEPLOY_ENV}.yml exec -T database pg_isready -U $DB_USER; then
        docker-compose -f docker-compose.${DEPLOY_ENV}.yml exec -T database pg_dump -U $DB_USER $DB_NAME > "./backups/${backup_file}"
        log "Database backup created: ${backup_file}"

        # Remove old backups
        find ./backups -name "backup_${DEPLOY_ENV}_*.sql" -mtime +${BACKUP_RETENTION_DAYS} -delete
        log "Old backups cleaned up"
    else
        warn "Database is not running, skipping backup"
    fi
}

# Health check
health_check() {
    log "Performing health check..."

    local max_attempts=30
    local attempt=1

    while [ $attempt -le $max_attempts ]; do
        if curl -f -s http://localhost/health > /dev/null; then
            log "Health check passed"
            return 0
        fi

        log "Health check attempt $attempt/$max_attempts failed, retrying in 10 seconds..."
        sleep 10
        attempt=$((attempt + 1))
    done

    error "Health check failed after $max_attempts attempts"
}

# Rollback
rollback() {
    warn "Rolling back to previous version..."

    # Stop the current services
    docker-compose -f docker-compose.${DEPLOY_ENV}.yml down

    # Restore the previous version here; the implementation depends on your
    # setup, e.g. restore from the database backup or redeploy the previous tag

    warn "Rollback completed"
    exit 1
}

# Main deployment flow
deploy() {
    log "Starting deployment of ${APP_NAME} version ${APP_VERSION} to ${DEPLOY_ENV}"

    # Set environment variables
    export APP_VERSION
    source .env.${DEPLOY_ENV}

    # Create a backup
    backup_database

    # Pull new images
    log "Pulling new images..."
    docker-compose -f docker-compose.${DEPLOY_ENV}.yml pull

    # Rolling update
    log "Starting rolling update..."
    docker-compose -f docker-compose.${DEPLOY_ENV}.yml up -d

    # Wait for services to start
    log "Waiting for services to start..."
    sleep 30

    # Health check
    if ! health_check; then
        rollback
    fi

    # Cleanup
    log "Cleaning up old images..."
    docker system prune -f

    log "Deployment completed successfully!"
}

# Entry point
main() {
    log "=== Docker Compose Deployment Script ==="
    log "Environment: ${DEPLOY_ENV}"
    log "Version: ${APP_VERSION}"

    check_environment
    deploy

    log "=== Deployment Finished ==="
}

# Error handling
trap 'error "Deployment failed"' ERR

# Run
main "$@"

Backup and Restore

1. Automated Backup Setup

# docker-compose.backup.yml
version: '3.8'

services:
  # Database backup
  db_backup:
    image: postgres:13-alpine
    volumes:
      - ./backups:/backups
      - ./scripts/backup-db.sh:/backup-db.sh:ro
    environment:
      - PGPASSWORD=${DB_PASSWORD}
      - DB_HOST=database
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - BACKUP_RETENTION_DAYS=30
      - S3_BUCKET=${S3_BACKUP_BUCKET}
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
    command: |
      sh -c '
        # Install the AWS CLI
        apk add --no-cache aws-cli

        # Register the cron job (invoke via sh; the mounted script may not be executable)
        echo "0 2 * * * sh /backup-db.sh" | crontab -

        # Run cron in the foreground
        crond -f
      '
    networks:
      - backend
    depends_on:
      - database

  # File backup
  file_backup:
    image: alpine:latest
    volumes:
      - app_data:/data:ro
      - ./backups:/backups
      - ./scripts/backup-files.sh:/backup-files.sh:ro
    environment:
      - BACKUP_RETENTION_DAYS=30
      - S3_BUCKET=${S3_BACKUP_BUCKET}
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
    command: |
      sh -c '
        apk add --no-cache aws-cli tar gzip
        echo "0 3 * * * /backup-files.sh" | crontab -
        crond -f
      '

volumes:
  app_data:
    external: true

networks:
  backend:
    external: true

#!/bin/bash
# scripts/backup-db.sh

set -e

BACKUP_DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="db_backup_${BACKUP_DATE}.sql"
LOCAL_BACKUP_PATH="/backups/${BACKUP_FILE}"
S3_BACKUP_PATH="s3://${S3_BUCKET}/database/${BACKUP_FILE}"

echo "Starting database backup at $(date)"

# Create the dump
pg_dump -h $DB_HOST -U $DB_USER -d $DB_NAME > $LOCAL_BACKUP_PATH

# Compress it
gzip $LOCAL_BACKUP_PATH
LOCAL_BACKUP_PATH="${LOCAL_BACKUP_PATH}.gz"
S3_BACKUP_PATH="${S3_BACKUP_PATH}.gz"

# Upload to S3
if [ -n "$S3_BUCKET" ]; then
    aws s3 cp $LOCAL_BACKUP_PATH $S3_BACKUP_PATH
    echo "Backup uploaded to S3: $S3_BACKUP_PATH"
fi

# Remove old local backups
find /backups -name "db_backup_*.sql.gz" -mtime +$BACKUP_RETENTION_DAYS -delete

# Remove old S3 backups (the date -d usage below needs GNU date; on Alpine: apk add coreutils)
if [ -n "$S3_BUCKET" ]; then
    aws s3 ls s3://${S3_BUCKET}/database/ | while read -r line; do
        createDate=$(echo $line | awk '{print $1" "$2}')
        createDate=$(date -d "$createDate" +%s)
        olderThan=$(date -d "$BACKUP_RETENTION_DAYS days ago" +%s)
        if [[ $createDate -lt $olderThan ]]; then
            fileName=$(echo $line | awk '{print $4}')
            if [[ $fileName != "" ]]; then
                aws s3 rm s3://${S3_BUCKET}/database/$fileName
                echo "Deleted old backup: $fileName"
            fi
        fi
    done
fi

echo "Database backup completed at $(date)"

2. Restore Script

#!/bin/bash
# scripts/restore.sh

set -e

BACKUP_FILE=${1}
RESTORE_ENV=${2:-production}

if [ -z "$BACKUP_FILE" ]; then
    echo "Usage: $0 <backup_file> [environment]"
    echo "Available backups:"
    ls -la ./backups/
    exit 1
fi

echo "Restoring from backup: $BACKUP_FILE"
echo "Target environment: $RESTORE_ENV"

# Confirm before overwriting
read -p "Are you sure you want to restore? This will overwrite existing data. (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
    echo "Restore cancelled"
    exit 0
fi

# Load environment variables
source .env.$RESTORE_ENV

# Stop the application services (keep the database running)
docker-compose -f docker-compose.$RESTORE_ENV.yml stop web

# Back up the current database first
echo "Creating backup of current database..."
current_backup="backup_before_restore_$(date +%Y%m%d_%H%M%S).sql"
docker-compose -f docker-compose.$RESTORE_ENV.yml exec -T database pg_dump -U $DB_USER $DB_NAME > "./backups/$current_backup"
echo "Current database backed up to: $current_backup"

# Restore the database
echo "Restoring database..."
if [[ $BACKUP_FILE == *.gz ]]; then
    gunzip -c "./backups/$BACKUP_FILE" | docker-compose -f docker-compose.$RESTORE_ENV.yml exec -T database psql -U $DB_USER -d $DB_NAME
else
    docker-compose -f docker-compose.$RESTORE_ENV.yml exec -T database psql -U $DB_USER -d $DB_NAME < "./backups/$BACKUP_FILE"
fi

# Restart the application services
echo "Restarting application services..."
docker-compose -f docker-compose.$RESTORE_ENV.yml up -d

# Health check
echo "Performing health check..."
sleep 30
if curl -f http://localhost/health; then
    echo "Restore completed successfully!"
else
    echo "Health check failed after restore"
    exit 1
fi
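
Example usage, assuming a backup created by the backup script above:

# Restore a compressed backup into production
./scripts/restore.sh db_backup_20250101_020000.sql.gz production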

Best-Practice Summary

1. Production Checklist

# Production deployment checklist
production_checklist:
  security:
    - ✓ Run containers as a non-root user
    - ✓ Enable a read-only filesystem
    - ✓ Configure security options and capability limits
    - ✓ Manage sensitive values with secrets
    - ✓ Isolate networks
    - ✓ Enable firewall rules
    - ✓ Use HTTPS with SSL certificates

  reliability:
    - ✓ Configure health checks
    - ✓ Set restart policies
    - ✓ Configure resource limits
    - ✓ Implement graceful shutdown
    - ✓ Configure data persistence
    - ✓ Establish a backup strategy

  monitoring:
    - ✓ Configure log collection
    - ✓ Set up performance monitoring
    - ✓ Configure alerting rules
    - ✓ Implement distributed tracing
    - ✓ Set up dashboards

  deployment:
    - ✓ Implement a CI/CD pipeline
    - ✓ Configure rolling updates
    - ✓ Implement blue-green deployments
    - ✓ Configure a rollback mechanism
    - ✓ Automate testing

2. Performance Tuning Recommendations

performance_optimization:
  container:
    - Use multi-stage builds to shrink image size
    - Choose an appropriate base image
    - Optimize Dockerfile layer caching
    - Set sensible resource limits

  network:
    - Use internal networks to reduce latency
    - Configure load balancing
    - Enable HTTP/2 and compression
    - Serve static assets from a CDN

  storage:
    - Use SSD-backed storage
    - Configure database connection pooling
    - Implement a caching strategy
    - Optimize database queries

  application:
    - Enable production mode
    - Configure process management
    - Reuse connections where possible
    - Optimize memory usage

Conclusion

Deploying Docker Compose in production is a systems-engineering effort that spans:

  1. Security: container hardening, network isolation, secrets management
  2. Reliability: health checks, restart policies, data backups
  3. Observability: monitoring and alerting, log collection, performance analysis
  4. Automation: CI/CD pipelines, automated deployment, rollback mechanisms
  5. Performance: resource, network, and storage tuning
  6. Operations: backup and restore, incident handling, scaling up and down

Key takeaways:

  • Follow security best practices
  • Build out a complete monitoring stack
  • Automate the deployment pipeline
  • Maintain an incident-response plan
  • Audit security and tune performance on a regular schedule
  • Keep documentation current and the team trained

With systematic planning and execution, you can run a stable, secure, and efficient Docker Compose production environment.
