Docker Compose 生产环境部署
概述
将 Docker Compose 应用部署到生产环境需要考虑安全性、可靠性、可扩展性和可维护性等多个方面。本文将详细介绍生产环境部署的最佳实践,包括配置优化、安全加固、监控告警、CI/CD 集成等关键内容。
生产环境配置
1. 生产环境 Compose 文件
# docker-compose.prod.yml
version: '3.8'

services:
  # Web application
  web:
    image: myapp:${APP_VERSION:-latest}
    restart: unless-stopped
    # NOTE: update_config / restart_policy under `deploy` are only honored
    # by Docker Swarm; plain `docker compose up` applies replicas/resources.
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
        reservations:
          cpus: '0.25'
          memory: 512M
      update_config:
        parallelism: 1
        delay: 30s
        failure_action: rollback
        monitor: 60s
        order: start-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
    environment:
      - NODE_ENV=production
      - DATABASE_URL=${DATABASE_URL}
      - REDIS_URL=${REDIS_URL}
      - JWT_SECRET=${JWT_SECRET}
      - API_KEY=${API_KEY}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    networks:
      - frontend
      - backend
    volumes:
      - app_logs:/app/logs
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.web.rule=Host(`myapp.example.com`)"
      - "traefik.http.routers.web.tls=true"
      - "traefik.http.routers.web.tls.certresolver=letsencrypt"

  # Load balancer / reverse proxy
  traefik:
    image: traefik:v2.8
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    environment:
      - TRAEFIK_API_DASHBOARD=true
      - TRAEFIK_API_INSECURE=false
      - TRAEFIK_ENTRYPOINTS_WEB_ADDRESS=:80
      - TRAEFIK_ENTRYPOINTS_WEBSECURE_ADDRESS=:443
      - TRAEFIK_PROVIDERS_DOCKER=true
      - TRAEFIK_PROVIDERS_DOCKER_EXPOSEDBYDEFAULT=false
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_EMAIL=${ACME_EMAIL}
      # acme.json must live INSIDE a directory-mounted volume: a named
      # volume always mounts as a directory, so mounting one directly at
      # /acme.json makes Traefik fail ("storage is a directory").
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_STORAGE=/letsencrypt/acme.json
      - TRAEFIK_CERTIFICATESRESOLVERS_LETSENCRYPT_ACME_HTTPCHALLENGE_ENTRYPOINT=web
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - traefik_acme:/letsencrypt
    networks:
      - frontend
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.dashboard.rule=Host(`traefik.example.com`)"
      - "traefik.http.routers.dashboard.tls=true"
      - "traefik.http.routers.dashboard.service=api@internal"
      - "traefik.http.routers.dashboard.middlewares=auth"
      - "traefik.http.middlewares.auth.basicauth.users=${TRAEFIK_AUTH}"

  # Database
  database:
    image: postgres:13-alpine
    restart: unless-stopped
    environment:
      - POSTGRES_DB=${DB_NAME}
      - POSTGRES_USER=${DB_USER}
      - POSTGRES_PASSWORD=${DB_PASSWORD}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./backups:/backups
    networks:
      - backend
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '0.5'
          memory: 2G
    healthcheck:
      # ${DB_USER}/${DB_NAME} are interpolated by Compose at parse time
      test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Cache
  redis:
    image: redis:6-alpine
    restart: unless-stopped
    command: |
      redis-server
      --requirepass ${REDIS_PASSWORD}
      --maxmemory 1gb
      --maxmemory-policy allkeys-lru
      --save 900 1
      --save 300 10
      --save 60 10000
    volumes:
      - redis_data:/data
    networks:
      - backend
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 1G
        reservations:
          cpus: '0.1'
          memory: 512M
    healthcheck:
      # `incr ping` would fail auth (requirepass is set) and would mutate a
      # key named "ping"; authenticate and use the PING command instead.
      # The password is interpolated by Compose and will be visible in
      # `docker inspect` — acceptable here since it is already in `command`.
      test: ["CMD-SHELL", "redis-cli -a ${REDIS_PASSWORD} ping | grep PONG"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  # Pre-labeled block devices are expected to exist on the host
  postgres_data:
    driver: local
    driver_opts:
      type: ext4
      device: /dev/disk/by-label/postgres-data
      o: defaults,noatime
  redis_data:
    driver: local
    driver_opts:
      type: ext4
      device: /dev/disk/by-label/redis-data
      o: defaults,noatime
  app_logs:
    driver: local
  traefik_acme:
    driver: local

networks:
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
  backend:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.21.0.0/16
2. 环境变量配置
# .env.production
# Application
APP_VERSION=v1.2.3
NODE_ENV=production
# Database
DB_NAME=myapp_prod
DB_USER=myapp_user
DB_PASSWORD=secure_db_password_here
# NOTE(review): ${VAR} interpolation inside a .env file is only resolved by
# newer Docker Compose releases — confirm, or inline the full URL literally.
DATABASE_URL=postgresql://${DB_USER}:${DB_PASSWORD}@database:5432/${DB_NAME}
# Redis
REDIS_PASSWORD=secure_redis_password_here
REDIS_URL=redis://:${REDIS_PASSWORD}@redis:6379
# Security — replace every placeholder; never commit this file to VCS
JWT_SECRET=very_secure_jwt_secret_key_here
API_KEY=secure_api_key_here
SESSION_SECRET=secure_session_secret_here
# SSL / ACME (Let's Encrypt)
ACME_EMAIL=admin@example.com
# htpasswd-style bcrypt hash for the Traefik dashboard basic-auth
TRAEFIK_AUTH=admin:$2y$10$encrypted_password_hash_here
# Monitoring
PROMETHEUS_ENABLED=true
GRAFANA_ADMIN_PASSWORD=secure_grafana_password
# Logging
LOG_LEVEL=info
LOG_FORMAT=json
# Backups (cron schedule: daily at 02:00)
BACKUP_SCHEDULE=0 2 * * *
BACKUP_RETENTION_DAYS=30
S3_BACKUP_BUCKET=myapp-backups
AWS_ACCESS_KEY_ID=your_aws_access_key
AWS_SECRET_ACCESS_KEY=your_aws_secret_key
3. 多环境配置管理
# docker-compose.override.yml (development overrides, applied automatically)
version: '3.8'

services:
  web:
    build:
      context: .
      dockerfile: Dockerfile.dev
    volumes:
      - .:/app
      - /app/node_modules
    environment:
      - NODE_ENV=development
      - DEBUG=app:*
    ports:
      - "3000:3000"

  database:
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_DB=myapp_dev

  redis:
    ports:
      - "6379:6379"
# docker-compose.staging.yml
version: '3.8'

services:
  web:
    image: myapp:staging
    environment:
      - NODE_ENV=staging
      - DATABASE_URL=${STAGING_DATABASE_URL}
    labels:
      - "traefik.http.routers.web.rule=Host(`staging.myapp.example.com`)"

  database:
    environment:
      - POSTGRES_DB=${STAGING_DB_NAME}
      - POSTGRES_USER=${STAGING_DB_USER}
      - POSTGRES_PASSWORD=${STAGING_DB_PASSWORD}
安全配置
1. 容器安全
version: '3.8'

services:
  secure_app:
    image: myapp:latest
    # Run as a non-root user
    user: "1001:1001"
    # Read-only root filesystem
    read_only: true
    # Writable tmpfs mounts for the few paths the process needs
    tmpfs:
      - /tmp:noexec,nosuid,size=100m
      - /var/run:noexec,nosuid,size=50m
    # Hardening options
    security_opt:
      - no-new-privileges:true
      - apparmor:docker-default
    # Drop every capability, add back only what is required
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE # only needed when binding privileged ports
    # Resource limits
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 1G
          pids: 100
    # Health check
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # Non-sensitive config via environment; sensitive values via secrets
    environment:
      - NODE_ENV=production
    secrets:
      - db_password
      - jwt_secret
      - api_key
    networks:
      - app_network

secrets:
  db_password:
    external: true
  jwt_secret:
    external: true
  api_key:
    external: true

networks:
  app_network:
    driver: bridge
    internal: true
2. 网络安全
version: '3.8'

services:
  # Front proxy (the only publicly exposed service)
  proxy:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
    networks:
      - frontend
      - dmz
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 512M

  # Application tier (DMZ; no host ports published)
  app:
    image: myapp:latest
    networks:
      - dmz
      - backend

  # Database (internal network only, fully isolated)
  database:
    image: postgres:13-alpine
    networks:
      - backend

  # Host firewall rules (needs host networking and privileged mode)
  firewall:
    image: alpine
    network_mode: host
    privileged: true
    volumes:
      - ./firewall/rules.sh:/rules.sh:ro
    command: sh /rules.sh

networks:
  # Public entry tier
  frontend:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/24
  # DMZ tier
  dmz:
    driver: bridge
    ipam:
      config:
        - subnet: 172.21.0.0/24
  # Internal tier
  backend:
    driver: bridge
    internal: true
    ipam:
      config:
        - subnet: 172.22.0.0/24
#!/bin/bash
# firewall/rules.sh — host iptables policy: default-deny inbound,
# rate-limited HTTP/HTTPS, restricted SSH, Docker bridge allowed.

# Flush existing rules.
# WARNING: this also removes the DOCKER/DOCKER-USER chains installed by the
# Docker daemon; restart dockerd after applying (or scope rules to DOCKER-USER).
iptables -F
iptables -X
iptables -t nat -F
iptables -t nat -X

# Default policies
iptables -P INPUT DROP
iptables -P FORWARD DROP
iptables -P OUTPUT ACCEPT

# Allow loopback
iptables -A INPUT -i lo -j ACCEPT
iptables -A OUTPUT -o lo -j ACCEPT

# Allow established/related connections
iptables -A INPUT -m state --state ESTABLISHED,RELATED -j ACCEPT

# Allow SSH from the management subnet only
iptables -A INPUT -p tcp --dport 22 -s 192.168.1.0/24 -j ACCEPT

# Allow HTTP/HTTPS, rate limited (anti-DDoS).
# These MUST be the only ACCEPT rules for 80/443: an unconditional ACCEPT
# placed before them would match first and the limits would never apply.
# Traffic exceeding the limit falls through to the final LOG/DROP rules.
iptables -A INPUT -p tcp --dport 80 -m limit --limit 25/minute --limit-burst 100 -j ACCEPT
iptables -A INPUT -p tcp --dport 443 -m limit --limit 25/minute --limit-burst 100 -j ACCEPT

# Allow Docker bridge traffic
iptables -A INPUT -i docker0 -j ACCEPT
iptables -A FORWARD -i docker0 -o docker0 -j ACCEPT

# Log, then drop, everything else
iptables -A INPUT -j LOG --log-prefix "IPTABLES-DROPPED: "
iptables -A INPUT -j DROP

echo "Firewall rules applied successfully"
3. 密钥管理
#!/bin/bash
# scripts/setup-secrets.sh — create Docker (Swarm) secrets from environment
# variables and local certificate files.
set -euo pipefail

echo "Setting up Docker secrets..."

# printf '%s' avoids embedding a trailing newline in the secret value
# (echo appends one, which silently corrupts passwords at runtime).
printf '%s' "$DB_PASSWORD" | docker secret create db_password -
printf '%s' "$JWT_SECRET" | docker secret create jwt_secret -
printf '%s' "$API_KEY" | docker secret create api_key -

# TLS certificate and key from files
docker secret create ssl_cert ./ssl/cert.pem
docker secret create ssl_key ./ssl/key.pem

echo "Secrets created successfully"
# Using an external secret-management system (HashiCorp Vault)
version: '3.8'

services:
  app:
    image: myapp:latest
    environment:
      - VAULT_ADDR=${VAULT_ADDR}
      - VAULT_TOKEN=${VAULT_TOKEN}
    volumes:
      - ./scripts/get-secrets.sh:/get-secrets.sh:ro
    command: |
      sh -c '
      # 从 Vault 获取密钥
      source /get-secrets.sh
      # 启动应用
      npm start
      '

  # Vault agent sidecar renders secrets into a shared volume
  vault_agent:
    image: vault:latest
    volumes:
      - ./vault/agent.hcl:/vault/config/agent.hcl:ro
      - vault_secrets:/vault/secrets
    command: vault agent -config=/vault/config/agent.hcl
    networks:
      - backend

volumes:
  vault_secrets:
监控和日志
1. 监控配置
# docker-compose.monitoring.yml
version: '3.8'

services:
  # Prometheus (metrics collection and storage)
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      # --web.enable-admin-api removed: it allows unauthenticated TSDB
      # deletion/snapshots over HTTP and must not ship on a published port.
    networks:
      - monitoring
    deploy:
      resources:
        limits:
          cpus: '1.0'
          memory: 2G

  # Grafana (dashboards)
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SMTP_ENABLED=true
      - GF_SMTP_HOST=${SMTP_HOST}
      - GF_SMTP_USER=${SMTP_USER}
      - GF_SMTP_PASSWORD=${SMTP_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro
    networks:
      - monitoring
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 1G

  # AlertManager (alert routing)
  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - monitoring

  # Node Exporter (host metrics)
  node_exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ escapes the $ so Compose does not try to interpolate it
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - monitoring

  # cAdvisor (per-container metrics)
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    networks:
      - monitoring

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
2. 日志管理
# docker-compose.logging.yml
version: '3.8'

services:
  # Elasticsearch (single-node dev-style cluster)
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.15.0
    environment:
      - discovery.type=single-node
      - "ES_JAVA_OPTS=-Xms1g -Xmx1g"
      - xpack.security.enabled=false
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    networks:
      - logging
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G

  # Logstash
  logstash:
    image: docker.elastic.co/logstash/logstash:7.15.0
    volumes:
      - ./logging/logstash.conf:/usr/share/logstash/pipeline/logstash.conf:ro
    environment:
      - "LS_JAVA_OPTS=-Xmx1g -Xms1g"
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Kibana
  kibana:
    image: docker.elastic.co/kibana/kibana:7.15.0
    ports:
      - "5601:5601"
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Fluentd log collector
  fluentd:
    image: fluent/fluentd:v1.14-1
    ports:
      - "24224:24224"
    volumes:
      - ./logging/fluent.conf:/fluentd/etc/fluent.conf:ro
      - fluentd_data:/fluentd/log
    networks:
      - logging
    depends_on:
      - elasticsearch

  # Application wired to the fluentd logging driver. The address is the
  # host-published port because the log driver runs in the Docker daemon,
  # not inside the container's network namespace.
  app:
    image: myapp:latest
    logging:
      driver: fluentd
      options:
        fluentd-address: localhost:24224
        tag: myapp.{{.Name}}
        fluentd-async-connect: "true"
        fluentd-retry-wait: "1s"
        fluentd-max-retries: "30"
    networks:
      - app_network
      - logging

volumes:
  elasticsearch_data:
  fluentd_data:

networks:
  logging:
  app_network:
# logging/fluent.conf
# Ingest: accept forward-protocol events from the Docker fluentd log driver.
<source>
@type forward
port 24224
bind 0.0.0.0
</source>
# Parse the "log" field of app events as JSON, keeping the original fields.
<filter myapp.**>
@type parser
key_name log
reserve_data true
<parse>
@type json
</parse>
</filter>
# Ship parsed events to Elasticsearch with a file-backed retry buffer.
<match myapp.**>
@type elasticsearch
host elasticsearch
port 9200
index_name myapp-logs
# NOTE(review): mapping types are deprecated in Elasticsearch 7.x — confirm
# the installed fluent-plugin-elasticsearch version still accepts type_name.
type_name _doc
<buffer>
@type file
path /fluentd/log/myapp.buffer
flush_mode interval
flush_interval 10s
chunk_limit_size 10MB
queue_limit_length 32
retry_max_interval 30
retry_forever true
</buffer>
</match>
CI/CD 集成
1. GitLab CI/CD
# .gitlab-ci.yml
stages:
  - test
  - build
  - deploy-staging
  - deploy-production

variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  IMAGE_NAME: $CI_REGISTRY_IMAGE
  IMAGE_TAG: $CI_COMMIT_SHA

services:
  - docker:20.10.16-dind

before_script:
  - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY

# Test stage
test:
  stage: test
  image: node:16-alpine
  script:
    - npm ci
    - npm run test
    - npm run lint
  coverage: '/Lines\s*:\s*(\d+\.?\d*)%/'
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: coverage/cobertura-coverage.xml
  only:
    - merge_requests
    - main
    - develop

# Build stage
build:
  stage: build
  image: docker:20.10.16
  script:
    - docker build -t $IMAGE_NAME:$IMAGE_TAG .
    - docker tag $IMAGE_NAME:$IMAGE_TAG $IMAGE_NAME:latest
    - docker push $IMAGE_NAME:$IMAGE_TAG
    - docker push $IMAGE_NAME:latest
  only:
    - main
    - develop

# Deploy to staging
deploy-staging:
  stage: deploy-staging
  image: alpine:latest
  before_script:
    - apk add --no-cache openssh-client docker-compose
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $STAGING_HOST >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
  script:
    # unescaped $VARs expand on the runner; \$ expands on the remote host
    - |
      ssh $STAGING_USER@$STAGING_HOST << EOF
      cd /opt/myapp
      export APP_VERSION=$IMAGE_TAG
      docker-compose -f docker-compose.yml -f docker-compose.staging.yml pull
      docker-compose -f docker-compose.yml -f docker-compose.staging.yml up -d
      docker system prune -f
      EOF
  environment:
    name: staging
    url: https://staging.myapp.example.com
  only:
    - develop

# Deploy to production (manual gate)
deploy-production:
  stage: deploy-production
  image: alpine:latest
  before_script:
    - apk add --no-cache openssh-client docker-compose
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - ssh-keyscan $PRODUCTION_HOST >> ~/.ssh/known_hosts
    - chmod 644 ~/.ssh/known_hosts
  script:
    - |
      ssh $PRODUCTION_USER@$PRODUCTION_HOST << EOF
      cd /opt/myapp
      # back up the current database before deploying
      docker-compose -f docker-compose.prod.yml exec -T database pg_dump -U \$DB_USER \$DB_NAME > backup_\$(date +%Y%m%d_%H%M%S).sql
      # deploy the new version
      export APP_VERSION=$IMAGE_TAG
      docker-compose -f docker-compose.prod.yml pull
      docker-compose -f docker-compose.prod.yml up -d
      # health check
      sleep 30
      if ! curl -f http://localhost/health; then
        echo "Health check failed, rolling back..."
        # docker-compose has no "rollback" subcommand: redeploy the last
        # known-good tag recorded on the host instead.
        export APP_VERSION=\$(cat .last_good_version)
        docker-compose -f docker-compose.prod.yml pull
        docker-compose -f docker-compose.prod.yml up -d
        exit 1
      fi
      # record this tag as known-good, then clean up old images
      echo "$IMAGE_TAG" > .last_good_version
      docker system prune -f
      EOF
  environment:
    name: production
    url: https://myapp.example.com
  when: manual
  only:
    - main
2. GitHub Actions
# .github/workflows/deploy.yml
name: Deploy to Production

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Setup Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '16'
          cache: 'npm'
      - name: Install dependencies
        run: npm ci
      - name: Run tests
        run: |
          npm run test
          npm run lint
      - name: Upload coverage reports
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage/lcov.info

  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Log in to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v4
        # The un-prefixed full-length sha tag is required: the deploy job sets
        # APP_VERSION=${{ github.sha }}, which would otherwise not match any
        # pushed tag (the prefixed entry produces "<branch>-<short-sha>").
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=sha,prefix={{branch}}-
            type=sha,format=long,prefix=
            type=raw,value=latest,enable={{is_default_branch}}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    environment: production
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Deploy to production
        uses: appleboy/ssh-action@v0.1.5
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.SSH_PRIVATE_KEY }}
          script: |
            cd /opt/myapp
            # matches the un-prefixed full-sha tag pushed by the build job
            export APP_VERSION=${{ github.sha }}
            # back up the database
            docker-compose -f docker-compose.prod.yml exec -T database pg_dump -U $DB_USER $DB_NAME > backup_$(date +%Y%m%d_%H%M%S).sql
            # pull new images
            docker-compose -f docker-compose.prod.yml pull
            # rolling update
            docker-compose -f docker-compose.prod.yml up -d
            # wait for services to start
            sleep 30
            # health check
            if ! curl -f http://localhost/health; then
              echo "Health check failed, rolling back..."
              docker-compose -f docker-compose.prod.yml down
              # TODO: restore the previous version here
              exit 1
            fi
            # clean up old images
            docker system prune -f
      - name: Notify deployment
        if: always()
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          channel: '#deployments'
          webhook_url: ${{ secrets.SLACK_WEBHOOK }}
3. 部署脚本
#!/bin/bash
# scripts/deploy.sh — deploy the application with docker-compose, taking a
# database backup first, health-checking after the update, and rolling back
# on failure.
#
# Usage: deploy.sh [environment] [version]
#   environment  compose environment suffix (default: production)
#   version      image tag to deploy        (default: latest)
set -e

# Configuration
APP_NAME="myapp"
DEPLOY_ENV=${1:-production}
APP_VERSION=${2:-latest}
BACKUP_RETENTION_DAYS=30

# Colored output helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

log() {
    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

warn() {
    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

# Print an error message and abort the whole script.
error() {
    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
    exit 1
}

# Verify the compose file, env file, and required tooling exist.
check_environment() {
    log "Checking deployment environment..."
    if [ ! -f "docker-compose.${DEPLOY_ENV}.yml" ]; then
        error "Compose file for environment '${DEPLOY_ENV}' not found"
    fi
    if [ ! -f ".env.${DEPLOY_ENV}" ]; then
        error "Environment file for '${DEPLOY_ENV}' not found"
    fi
    command -v docker >/dev/null 2>&1 || error "Docker is not installed"
    command -v docker-compose >/dev/null 2>&1 || error "Docker Compose is not installed"
    log "Environment check passed"
}

# Dump the database to ./backups and prune dumps past the retention window.
backup_database() {
    log "Creating database backup..."
    local backup_file="backup_${DEPLOY_ENV}_$(date +%Y%m%d_%H%M%S).sql"
    if docker-compose -f "docker-compose.${DEPLOY_ENV}.yml" exec -T database pg_isready -U "$DB_USER"; then
        docker-compose -f "docker-compose.${DEPLOY_ENV}.yml" exec -T database pg_dump -U "$DB_USER" "$DB_NAME" > "./backups/${backup_file}"
        log "Database backup created: ${backup_file}"
        find ./backups -name "backup_${DEPLOY_ENV}_*.sql" -mtime +${BACKUP_RETENTION_DAYS} -delete
        log "Old backups cleaned up"
    else
        warn "Database is not running, skipping backup"
    fi
}

# Poll /health; return 0 on success, 1 after max_attempts failures.
# Must RETURN (not exit) so the caller's `if ! health_check` can trigger
# rollback — calling error() here would abort before rollback ever ran.
health_check() {
    log "Performing health check..."
    local max_attempts=30
    local attempt=1
    while [ $attempt -le $max_attempts ]; do
        if curl -f -s http://localhost/health > /dev/null; then
            log "Health check passed"
            return 0
        fi
        log "Health check attempt $attempt/$max_attempts failed, retrying in 10 seconds..."
        sleep 10
        attempt=$((attempt + 1))
    done
    warn "Health check failed after $max_attempts attempts"
    return 1
}

# Roll back after a failed deployment, then exit non-zero.
rollback() {
    warn "Rolling back to previous version..."
    docker-compose -f "docker-compose.${DEPLOY_ENV}.yml" down
    # TODO: restore the previous version here (previous image tag or backup).
    warn "Rollback completed"
    exit 1
}

# Main deployment flow: backup, pull, update, verify, clean.
deploy() {
    log "Starting deployment of ${APP_NAME} version ${APP_VERSION} to ${DEPLOY_ENV}"
    export APP_VERSION
    source ".env.${DEPLOY_ENV}"
    backup_database
    log "Pulling new images..."
    docker-compose -f "docker-compose.${DEPLOY_ENV}.yml" pull
    log "Starting rolling update..."
    docker-compose -f "docker-compose.${DEPLOY_ENV}.yml" up -d
    log "Waiting for services to start..."
    sleep 30
    if ! health_check; then
        rollback
    fi
    log "Cleaning up old images..."
    docker system prune -f
    log "Deployment completed successfully!"
}

main() {
    log "=== Docker Compose Deployment Script ==="
    log "Environment: ${DEPLOY_ENV}"
    log "Version: ${APP_VERSION}"
    check_environment
    deploy
    log "=== Deployment Finished ==="
}

# Abort with a clear message when any command fails under set -e.
trap 'error "Deployment failed"' ERR

main "$@"
备份和恢复
1. 自动备份配置
# docker-compose.backup.yml
version: '3.8'

services:
  # Database backups via an in-container cron job
  db_backup:
    image: postgres:13-alpine
    volumes:
      - ./backups:/backups
      - ./scripts/backup-db.sh:/backup-db.sh:ro
    environment:
      - PGPASSWORD=${DB_PASSWORD}
      - DB_HOST=database
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - BACKUP_RETENTION_DAYS=30
      - S3_BUCKET=${S3_BACKUP_BUCKET}
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
    command: |
      sh -c '
      # 安装 AWS CLI
      apk add --no-cache aws-cli
      # 设置 cron 任务
      echo "0 2 * * * /backup-db.sh" | crontab -
      # 启动 cron
      crond -f
      '
    networks:
      - backend
    depends_on:
      - database

  # File backups via an in-container cron job
  file_backup:
    image: alpine:latest
    volumes:
      - app_data:/data:ro
      - ./backups:/backups
      - ./scripts/backup-files.sh:/backup-files.sh:ro
    environment:
      - BACKUP_RETENTION_DAYS=30
      - S3_BUCKET=${S3_BACKUP_BUCKET}
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
    command: |
      sh -c '
      apk add --no-cache aws-cli tar gzip
      echo "0 3 * * * /backup-files.sh" | crontab -
      crond -f
      '

volumes:
  app_data:
    external: true

networks:
  backend:
    external: true
#!/bin/bash
# scripts/backup-db.sh — dump the database, compress it, upload it to S3,
# and prune local and remote backups older than BACKUP_RETENTION_DAYS.
set -e

BACKUP_DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="db_backup_${BACKUP_DATE}.sql"
LOCAL_BACKUP_PATH="/backups/${BACKUP_FILE}"
S3_BACKUP_PATH="s3://${S3_BUCKET}/database/${BACKUP_FILE}"

echo "Starting database backup at $(date)"

# Dump and compress
pg_dump -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" > "$LOCAL_BACKUP_PATH"
gzip "$LOCAL_BACKUP_PATH"
LOCAL_BACKUP_PATH="${LOCAL_BACKUP_PATH}.gz"
S3_BACKUP_PATH="${S3_BACKUP_PATH}.gz"

# Upload to S3 (skipped when no bucket is configured)
if [ -n "$S3_BUCKET" ]; then
    aws s3 cp "$LOCAL_BACKUP_PATH" "$S3_BACKUP_PATH"
    echo "Backup uploaded to S3: $S3_BACKUP_PATH"
fi

# Prune local backups past the retention window
find /backups -name "db_backup_*.sql.gz" -mtime +"$BACKUP_RETENTION_DAYS" -delete

# Prune S3 backups past the retention window.
# NOTE(review): `date -d` below requires GNU date; BusyBox date (Alpine)
# does not parse these forms — install coreutils in the backup image.
if [ -n "$S3_BUCKET" ]; then
    aws s3 ls "s3://${S3_BUCKET}/database/" | while read -r line; do
        createDate=$(echo "$line" | awk '{print $1" "$2}')
        createDate=$(date -d "$createDate" +%s)
        olderThan=$(date -d "$BACKUP_RETENTION_DAYS days ago" +%s)
        if [[ $createDate -lt $olderThan ]]; then
            fileName=$(echo "$line" | awk '{print $4}')
            if [[ $fileName != "" ]]; then
                aws s3 rm "s3://${S3_BUCKET}/database/$fileName"
                echo "Deleted old backup: $fileName"
            fi
        fi
    done
fi

echo "Database backup completed at $(date)"
2. 恢复脚本
#!/bin/bash
# scripts/restore.sh — restore a database backup into an environment,
# taking a safety dump of the current database first.
#
# Usage: restore.sh <backup_file> [environment]
set -e

BACKUP_FILE=${1}
RESTORE_ENV=${2:-production}

if [ -z "$BACKUP_FILE" ]; then
    echo "Usage: $0 <backup_file> [environment]"
    echo "Available backups:"
    ls -la ./backups/
    exit 1
fi

echo "Restoring from backup: $BACKUP_FILE"
echo "Target environment: $RESTORE_ENV"

# Explicit confirmation — this overwrites data
read -p "Are you sure you want to restore? This will overwrite existing data. (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
    echo "Restore cancelled"
    exit 0
fi

# Load environment variables
source ".env.$RESTORE_ENV"

# Stop the application but keep the database running
docker-compose -f "docker-compose.$RESTORE_ENV.yml" stop web

# Safety dump of the current database
echo "Creating backup of current database..."
current_backup="backup_before_restore_$(date +%Y%m%d_%H%M%S).sql"
docker-compose -f "docker-compose.$RESTORE_ENV.yml" exec -T database pg_dump -U "$DB_USER" "$DB_NAME" > "./backups/$current_backup"
echo "Current database backed up to: $current_backup"

# Restore (transparently handles gzip-compressed dumps)
echo "Restoring database..."
if [[ $BACKUP_FILE == *.gz ]]; then
    gunzip -c "./backups/$BACKUP_FILE" | docker-compose -f "docker-compose.$RESTORE_ENV.yml" exec -T database psql -U "$DB_USER" -d "$DB_NAME"
else
    docker-compose -f "docker-compose.$RESTORE_ENV.yml" exec -T database psql -U "$DB_USER" -d "$DB_NAME" < "./backups/$BACKUP_FILE"
fi

# Restart application services
echo "Restarting application services..."
docker-compose -f "docker-compose.$RESTORE_ENV.yml" up -d

# Health check
echo "Performing health check..."
sleep 30
if curl -f http://localhost/health; then
    echo "Restore completed successfully!"
else
    echo "Health check failed after restore"
    exit 1
fi
最佳实践总结
1. 生产环境检查清单
# Production deployment checklist
production_checklist:
  security:
    - ✓ 使用非 root 用户运行容器
    - ✓ 启用只读文件系统
    - ✓ 配置安全选项和能力限制
    - ✓ 使用 secrets 管理敏感信息
    - ✓ 配置网络隔离
    - ✓ 启用防火墙规则
    - ✓ 使用 HTTPS 和 SSL 证书
  reliability:
    - ✓ 配置健康检查
    - ✓ 设置重启策略
    - ✓ 配置资源限制
    - ✓ 实现优雅关闭
    - ✓ 配置数据持久化
    - ✓ 设置备份策略
  monitoring:
    - ✓ 配置日志收集
    - ✓ 设置性能监控
    - ✓ 配置告警规则
    - ✓ 实现分布式追踪
    - ✓ 设置仪表板
  deployment:
    - ✓ 实现 CI/CD 流水线
    - ✓ 配置滚动更新
    - ✓ 实现蓝绿部署
    - ✓ 配置回滚机制
    - ✓ 自动化测试
2. 性能优化建议
# Performance-tuning recommendations by layer
performance_optimization:
  container:
    - 使用多阶段构建减小镜像大小
    - 选择合适的基础镜像
    - 优化 Dockerfile 层缓存
    - 配置合理的资源限制
  network:
    - 使用内部网络减少延迟
    - 配置负载均衡
    - 启用 HTTP/2 和压缩
    - 使用 CDN 加速静态资源
  storage:
    - 使用 SSD 存储
    - 配置数据库连接池
    - 实现缓存策略
    - 优化数据库查询
  application:
    - 启用生产模式
    - 配置进程管理
    - 实现连接复用
    - 优化内存使用
总结
Docker Compose 生产环境部署是一个复杂的系统工程,需要综合考虑:
- 安全性: 容器安全、网络隔离、密钥管理
- 可靠性: 健康检查、重启策略、数据备份
- 可观测性: 监控告警、日志收集、性能分析
- 自动化: CI/CD 流水线、自动部署、回滚机制
- 性能: 资源优化、网络优化、存储优化
- 运维: 备份恢复、故障处理、扩容缩容
关键要点:
- 遵循安全最佳实践
- 建立完善的监控体系
- 实现自动化部署流程
- 制定应急响应计划
- 定期进行安全审计和性能优化
- 保持文档更新和团队培训
通过系统性的规划和实施,可以构建稳定、安全、高效的 Docker Compose 生产环境。