Docker Compose 健康检查

概述

健康检查是确保容器化应用稳定运行的重要机制。Docker Compose 提供了丰富的健康检查功能,包括容器健康状态监控、服务依赖管理、自动重启策略等。本文将详细介绍如何在 Docker Compose 中配置和使用健康检查。

健康检查基础

1. 健康检查状态

  • starting: 容器启动中,尚未有健康检查通过(启动宽限期内的失败不计入重试次数)
  • healthy: 健康检查通过
  • unhealthy: 健康检查失败
  • none: 未配置健康检查

2. 健康检查配置参数

healthcheck:
  # Probe command in exec form: "CMD" followed by the executable and its arguments
  test:
    - "CMD"
    - "curl"
    - "-f"
    - "http://localhost/health"
  interval: 30s       # time between checks
  timeout: 10s        # time allowed for a single check
  retries: 3          # number of retries
  start_period: 40s   # startup grace period
  start_interval: 5s  # check interval used during the start period

基本健康检查配置

1. HTTP 健康检查

version: "3.8"

services:
  # nginx probed with curl (exec form)
  web:
    image: nginx:alpine
    ports:
      - "80:80"
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # locally built API probed with wget in spider mode (exec form)
  api:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:3000/api/health"
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s

  # probe executed through the container's shell (CMD-SHELL form)
  app_with_custom_check:
    image: myapp
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl -f http://localhost:8080/health || exit 1"
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 2m

2. 数据库健康检查

services:
  # PostgreSQL: pg_isready against the configured user and database
  postgres:
    image: postgres:13-alpine
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
    healthcheck:
      test:
        - "CMD-SHELL"
        - "pg_isready -U user -d myapp"
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # MySQL: mysqladmin ping against localhost
  mysql:
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: rootpassword
      MYSQL_DATABASE: myapp
    healthcheck:
      test:
        - "CMD"
        - "mysqladmin"
        - "ping"
        - "-h"
        - "localhost"
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # MongoDB: ping admin command evaluated by the mongo shell
  mongodb:
    image: mongo:4.4
    healthcheck:
      test:
        - "CMD"
        - "mongo"
        - "--eval"
        - "db.adminCommand('ping')"
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # Redis: redis-cli ping
  redis:
    image: redis:6-alpine
    healthcheck:
      test:
        - "CMD"
        - "redis-cli"
        - "ping"
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 30s

3. 应用程序健康检查

services:
  # Node.js service probed by running healthcheck.js
  node_app:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - "CMD"
        - "node"
        - "healthcheck.js"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Python service probed by running health_check.py
  python_app:
    build: .
    ports:
      - "5000:5000"
    healthcheck:
      test:
        - "CMD"
        - "python"
        - "health_check.py"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Java service probed by running the HealthCheck class from /app
  java_app:
    image: openjdk:11-jre
    ports:
      - "8080:8080"
    healthcheck:
      test:
        - "CMD"
        - "java"
        - "-cp"
        - "/app"
        - "HealthCheck"
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 120s

高级健康检查

1. 多层健康检查

version: '3.8'

services:
  # Application with a layered probe: process, listening port, HTTP endpoint,
  # then the database status exposed by the application itself.
  web_app:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      # List form: the first item selects shell execution and the second item
      # is the script itself. The script must NOT start with the literal text
      # "CMD-SHELL": a plain string value for `test` is already run through
      # the shell, which would try to execute a command named CMD-SHELL.
      test:
        - CMD-SHELL
        - |
          # Check that the process is running
          pgrep -f "node server.js" > /dev/null || exit 1

          # Check that the port is listening
          netstat -ln | grep ":3000" > /dev/null || exit 1

          # Check the HTTP response
          curl -f http://localhost:3000/health || exit 1

          # Check the database connection
          curl -f http://localhost:3000/api/db-health || exit 1
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 60s
    # Start only after both dependencies report healthy
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy

  db:
    image: postgres:13-alpine
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=password
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U user -d myapp"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  redis:
    image: redis:6-alpine
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 30s

2. 自定义健康检查脚本

services:
  app:
    build: .
    # Mount the probe script read-only at the container root
    volumes:
      - "./health-check.sh:/health-check.sh:ro"
    healthcheck:
      # Run the mounted script directly (exec form)
      test:
        - "CMD"
        - "/health-check.sh"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
#!/bin/bash
# health-check.sh
#
# Container health probe. Exits 0 when every check passes and 1 on the first
# failure, printing the reason. Checks run in this order: application process,
# memory share, disk usage of /app, HTTP health endpoint, database status
# endpoint.

set -e

# Check the application process.
# `pgrep -d,` joins all matching PIDs with commas so the list can be passed to
# `ps -p` as a single argument. The assignment sits inside `if` so that a
# non-zero pgrep status is reported instead of aborting silently under `set -e`.
if ! APP_PIDS=$(pgrep -d, -f "myapp"); then
    echo "Application process not running"
    exit 1
fi

# Check memory usage.
# `-o %mem=` prints only the memory column with no header, so the value cannot
# be confused with a word of the command line. The shares of all matching
# processes are summed, and awk does the floating-point comparison so `bc` is
# not required in the image.
MEM_USAGE=$(ps -o %mem= -p "$APP_PIDS" | awk '{ total += $1 } END { printf "%.1f", total }')
if awk -v usage="$MEM_USAGE" 'BEGIN { exit !(usage > 80) }'; then
    echo "Memory usage too high: $MEM_USAGE%"
    exit 1
fi

# Check disk space.
# `df -P` keeps every filesystem on a single line, so the fifth field of the
# last line is always the usage percentage.
DISK_USAGE=$(df -P /app | tail -1 | awk '{print $5}' | sed 's/%//')
if [ "$DISK_USAGE" -gt 90 ]; then
    echo "Disk usage too high: $DISK_USAGE%"
    exit 1
fi

# Check the HTTP endpoint
if ! curl -f -s http://localhost:3000/health > /dev/null; then
    echo "Health endpoint not responding"
    exit 1
fi

# Check the database connection through the application's status endpoint
if ! curl -f -s http://localhost:3000/api/db-status > /dev/null; then
    echo "Database connection failed"
    exit 1
fi

echo "All health checks passed"
exit 0

3. 条件依赖启动

version: "3.8"

services:
  # Base services
  database:
    image: postgres:13-alpine
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
    healthcheck:
      test:
        - "CMD-SHELL"
        - "pg_isready -U user -d myapp"
      interval: 5s
      timeout: 3s
      retries: 5
      start_period: 30s

  cache:
    image: redis:6-alpine
    healthcheck:
      test:
        - "CMD"
        - "redis-cli"
        - "ping"
      interval: 5s
      timeout: 3s
      retries: 3
      start_period: 10s

  # Application service - waits for the base services to be healthy
  backend:
    build: ./backend
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8000/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Frontend service - waits for the backend to be healthy
  frontend:
    build: ./frontend
    ports:
      - "80:80"
    depends_on:
      backend:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  # Worker process - waits for all services to be healthy
  worker:
    build: ./worker
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
      backend:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "python"
        - "worker_health.py"
      interval: 60s
      timeout: 30s
      retries: 3
      start_period: 120s

健康检查监控

1. 健康状态日志

services:
  app:
    image: myapp
    healthcheck:
      # Probe the endpoint and append the outcome, with a timestamp, to
      # /var/log/health.log.
      # - The echo arguments use double quotes so the shell expands the
      #   command substitution; inside single quotes the text would be logged
      #   literally.
      # - "$$" is the Compose escape for a literal "$", which keeps Compose
      #   from treating the substitution as its own variable interpolation.
      test:
        - CMD-SHELL
        - >-
          curl -f http://localhost:3000/health
          && echo "Health check passed at $$(date)" >> /var/log/health.log
          || (echo "Health check failed at $$(date)" >> /var/log/health.log && exit 1)
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # Persist the log directory in a named volume
    volumes:
      - health_logs:/var/log

volumes:
  health_logs: {}

2. 健康检查指标收集

version: '3.8'

services:
  app:
    image: myapp
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    labels:
      - "prometheus.io/scrape=true"
      - "prometheus.io/port=3000"
      - "prometheus.io/path=/metrics"

  # Sidecar that polls the Docker daemon every 30 seconds and pushes one
  # gauge per container (1 = healthy, 0 = anything else).
  health_monitor:
    image: alpine
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    # List form passes the script to `sh -c` as a single argument.
    # Every "$" meant for the shell is written "$$" so that Compose does not
    # consume it during variable interpolation.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache docker-cli curl
        while true; do
          # Print the status line of every container that reports a health state
          docker ps --format "table {{.Names}}\t{{.Status}}" | grep -E "(healthy|unhealthy)"

          # Send the health state to the monitoring system
          for container in $$(docker ps --format "{{.Names}}"); do
            health_status=$$(docker inspect --format="{{.State.Health.Status}}" "$$container" 2>/dev/null || echo "none")
            echo "Container: $$container, Health: $$health_status"

            # Push to the Prometheus Pushgateway. The container name is part of
            # the grouping key in the URL: pushing every container to the same
            # group would replace the previous container's sample, because a
            # POST replaces all metrics of the same name within a group.
            echo "container_health_status $$([ "$$health_status" = "healthy" ] && echo 1 || echo 0)" | \
              curl -X POST --data-binary @- "http://pushgateway:9091/metrics/job/health_monitor/container/$$container"
          done

          sleep 30
        done
    depends_on:
      - app

3. 健康检查告警

services:
  alertmanager:
    image: prom/alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'

  # Sidecar that polls the Docker daemon every 60 seconds and posts one alert
  # per unhealthy container to Alertmanager.
  health_alerter:
    image: alpine
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    # List form passes the script to `sh -c` as a single argument.
    # Every "$" meant for the shell is written "$$" so that Compose does not
    # consume it during variable interpolation.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache docker-cli curl jq
        while true; do
          # Find unhealthy containers
          unhealthy_containers=$$(docker ps --filter "health=unhealthy" --format "{{.Names}}")

          if [ -n "$$unhealthy_containers" ]; then
            for container in $$unhealthy_containers; do
              # Send the alert through the v2 API; the v1 endpoints are no
              # longer served by current Alertmanager releases.
              curl -X POST http://alertmanager:9093/api/v2/alerts \
                -H "Content-Type: application/json" \
                -d "[{
                  \"labels\": {
                    \"alertname\": \"ContainerUnhealthy\",
                    \"container\": \"$$container\",
                    \"severity\": \"critical\"
                  },
                  \"annotations\": {
                    \"summary\": \"Container $$container is unhealthy\",
                    \"description\": \"Container $$container has failed health checks\"
                  }
                }]"
            done
          fi

          sleep 60
        done

应用程序健康检查实现

1. Node.js 健康检查

// healthcheck.js
const http = require('http');
const { promisify } = require('util');

/**
 * Issue a GET request and resolve with the response object as soon as the
 * response headers arrive. Rejects on a request error or when the socket has
 * been idle for 5 seconds.
 *
 * @param {string} url absolute URL to request
 * @returns {Promise<http.IncomingMessage>}
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    const request = http.get(url, resolve);
    request.on('error', reject);
    request.setTimeout(5000, () => {
      request.destroy();
      reject(new Error('Request timeout'));
    });
  });
}

/**
 * Run all probes in order and terminate the process: exit code 0 when every
 * probe passes, 1 on the first failure (the reason is written to stderr).
 */
async function healthCheck() {
  try {
    // Probe the HTTP service
    const { statusCode: apiStatus } = await makeRequest('http://localhost:3000/api/health');
    if (apiStatus !== 200) {
      throw new Error(`HTTP health check failed: ${apiStatus}`);
    }

    // Probe the database through the application's status endpoint
    const { statusCode: dbStatus } = await makeRequest('http://localhost:3000/api/db-health');
    if (dbStatus !== 200) {
      throw new Error('Database health check failed');
    }

    // Heap usage of this process, as a percentage of its allocated heap
    const { heapUsed, heapTotal } = process.memoryUsage();
    const heapPercent = (heapUsed / heapTotal) * 100;
    if (heapPercent > 90) {
      throw new Error(`Memory usage too high: ${heapPercent.toFixed(2)}%`);
    }

    console.log('Health check passed');
    process.exit(0);
  } catch (failure) {
    console.error('Health check failed:', failure.message);
    process.exit(1);
  }
}

healthCheck();

2. Python 健康检查

#!/usr/bin/env python3
# health_check.py

import sys
import requests
import psutil
import time
from urllib.parse import urljoin

def check_http_endpoint(url, timeout=5):
    """Return True when a GET on ``url`` succeeds with a non-error status.

    Any ``requests.RequestException`` (connection failure, timeout, or an
    error status raised by ``raise_for_status``) is printed and reported
    as False.
    """
    try:
        requests.get(url, timeout=timeout).raise_for_status()
    except requests.RequestException as exc:
        print(f"HTTP check failed: {exc}")
        return False
    return True

def check_database_connection():
    """Return True when the application's database status endpoint answers OK.

    The endpoint is requested with a 5 second timeout; any
    ``requests.RequestException`` is printed and reported as False.
    """
    db_health_url = 'http://localhost:5000/api/db-health'
    try:
        requests.get(db_health_url, timeout=5).raise_for_status()
    except requests.RequestException as exc:
        print(f"Database check failed: {exc}")
        return False
    return True

def check_system_resources():
    """Return True when memory and root-disk usage are both at or below 90%.

    The first limit that is exceeded is printed and the function returns
    False without evaluating the remaining checks.
    """
    # Memory usage as reported by psutil
    mem_percent = psutil.virtual_memory().percent
    if mem_percent > 90:
        print(f"Memory usage too high: {mem_percent}%")
        return False

    # Disk usage of the root filesystem, computed from used/total bytes
    root_usage = psutil.disk_usage('/')
    disk_percent = (root_usage.used / root_usage.total) * 100
    if disk_percent > 90:
        print(f"Disk usage too high: {disk_percent:.2f}%")
        return False

    return True

def check_process_running(process_name):
    """Return True when ``process_name`` occurs in any process command line.

    The command line is the space-joined ``cmdline`` of each process;
    processes that vanish or deny access while being inspected are skipped.
    """
    for candidate in psutil.process_iter(['pid', 'name', 'cmdline']):
        try:
            command_line = ' '.join(candidate.info['cmdline'] or [])
            if process_name in command_line:
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return False

def main():
    """Run every health check, print one line per check, and exit.

    All checks are always executed. The process exits with status 0 when
    every check returned a truthy value and 1 when any check returned a
    falsy value or raised an exception.
    """
    probes = (
        ('HTTP endpoint', lambda: check_http_endpoint('http://localhost:5000/health')),
        ('Database connection', check_database_connection),
        ('System resources', check_system_resources),
        ('Application process', lambda: check_process_running('python app.py')),
    )

    failed = False

    for label, probe in probes:
        try:
            if not probe():
                print(f"✗ {label} failed")
                failed = True
            else:
                print(f"✓ {label} passed")
        except Exception as exc:
            print(f"✗ {label} error: {exc}")
            failed = True

    if failed:
        print("Some health checks failed")
        sys.exit(1)

    print("All health checks passed")
    sys.exit(0)

if __name__ == '__main__':
    main()

3. Java 健康检查

// HealthCheck.java
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.management.MemoryUsage;

/**
 * Stand-alone health probe. Runs the HTTP, memory and thread checks in that
 * order and terminates the JVM with exit code 0 when all pass, or 1 on the
 * first failing check or on an unexpected exception.
 */
public class HealthCheck {
    private static final String HEALTH_ENDPOINT = "http://localhost:8080/actuator/health";
    private static final int TIMEOUT = 5000;
    private static final double MAX_MEMORY_USAGE = 0.9;

    public static void main(String[] args) {
        try {
            // HTTP endpoint
            if (!checkHttpEndpoint()) {
                abort("HTTP health check failed");
            }

            // Heap usage
            if (!checkMemoryUsage()) {
                abort("Memory usage check failed");
            }

            // Thread count
            if (!checkThreads()) {
                abort("Thread check failed");
            }

            System.out.println("All health checks passed");
            System.exit(0);

        } catch (Exception e) {
            System.err.println("Health check error: " + e.getMessage());
            System.exit(1);
        }
    }

    /** Prints the reason to stderr and terminates the JVM with exit code 1. */
    private static void abort(String reason) {
        System.err.println(reason);
        System.exit(1);
    }

    /**
     * Sends a GET request to the health endpoint, using TIMEOUT for both the
     * connect and the read timeout.
     *
     * @return true only when the response code is 200; false on any other
     *         code or on an IOException (which is reported on stderr)
     */
    private static boolean checkHttpEndpoint() {
        try {
            HttpURLConnection connection =
                    (HttpURLConnection) new URL(HEALTH_ENDPOINT).openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);

            return connection.getResponseCode() == HttpURLConnection.HTTP_OK;

        } catch (IOException e) {
            System.err.println("HTTP check failed: " + e.getMessage());
            return false;
        }
    }

    /**
     * Compares used heap to maximum heap of this JVM.
     *
     * @return false (after reporting on stderr) when the used/max ratio
     *         exceeds MAX_MEMORY_USAGE, true otherwise
     */
    private static boolean checkMemoryUsage() {
        MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();
        MemoryUsage heap = memoryBean.getHeapMemoryUsage();

        double ratio = (double) heap.getUsed() / heap.getMax();
        boolean tooHigh = ratio > MAX_MEMORY_USAGE;

        if (!tooHigh) {
            return true;
        }

        System.err.printf("Memory usage too high: %.2f%%%n", ratio * 100);
        return false;
    }

    /**
     * Compares the active thread count reported by Thread.activeCount()
     * to a fixed ceiling.
     *
     * @return false (after reporting on stderr) when the count exceeds the
     *         ceiling, true otherwise
     */
    private static boolean checkThreads() {
        final int maxThreads = 1000; // adjust to the application
        int activeThreads = Thread.activeCount();

        if (activeThreads <= maxThreads) {
            return true;
        }

        System.err.printf("Too many active threads: %d%n", activeThreads);
        return false;
    }
}

故障恢复策略

1. 自动重启配置

version: '3.8'

services:
  web:
    image: myapp
    # NOTE: a restart policy reacts to the container process exiting; outside
    # Swarm mode an "unhealthy" status on its own does not trigger a restart.
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s

  db:
    image: postgres:13-alpine
    restart: unless-stopped
    healthcheck:
      # Probe with the user and database configured under `environment`
      # below, matching the other PostgreSQL probes in this guide, instead of
      # relying on pg_isready's default connection parameters.
      test: ["CMD-SHELL", "pg_isready -U user -d myapp"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=password

2. 故障转移配置

version: "3.8"

services:
  # Primary application instance
  app_primary:
    image: myapp
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:3000/health"
      interval: 10s
      timeout: 5s
      retries: 2
      start_period: 30s
    labels:
      traefik.enable: "true"
      traefik.http.routers.app.rule: "Host(`app.local`)"
      traefik.http.services.app.loadbalancer.server.port: "3000"

  # Backup application instance (only started with the "backup" profile)
  app_backup:
    image: myapp
    ports:
      - "3001:3000"
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:3000/health"
      interval: 10s
      timeout: 5s
      retries: 2
      start_period: 30s
    labels:
      traefik.enable: "true"
      traefik.http.routers.app-backup.rule: "Host(`app.local`)"
      traefik.http.routers.app-backup.priority: "1"
      traefik.http.services.app-backup.loadbalancer.server.port: "3000"
    profiles:
      - backup

  # Load balancer
  traefik:
    image: traefik:v2.5
    ports:
      - "80:80"
      - "8080:8080"
    volumes:
      - "/var/run/docker.sock:/var/run/docker.sock:ro"
    command:
      - "--api.insecure=true"
      - "--providers.docker=true"
      - "--providers.docker.exposedbydefault=false"
      - "--entrypoints.web.address=:80"

3. 健康检查恢复脚本

#!/bin/bash
# recovery.sh
#
# Usage: recovery.sh [service] [max_retries] [retry_interval_seconds]
#
# Polls the health status of a Compose service and restarts it while it is
# unhealthy. Exits 0 as soon as the service reports healthy, 1 when it is
# still not healthy after the last attempt.

SERVICE_NAME=${1:-"app"}
MAX_RETRIES=${2:-3}
RETRY_INTERVAL=${3:-30}

# Print the health status of the service's first container.
# Prints nothing when the service has no container, so `docker inspect` is
# never invoked without an argument.
get_health_status() {
    local container_id
    container_id=$(docker-compose ps -q "$SERVICE_NAME" | head -n 1)
    if [ -n "$container_id" ]; then
        docker inspect --format='{{.State.Health.Status}}' "$container_id" 2>/dev/null
    fi
}

for i in $(seq 1 "$MAX_RETRIES"); do
    echo "Checking health of $SERVICE_NAME (attempt $i/$MAX_RETRIES)..."

    # Check the container health status
    HEALTH_STATUS=$(get_health_status)

    if [ "$HEALTH_STATUS" = "healthy" ]; then
        echo "$SERVICE_NAME is healthy"
        exit 0
    elif [ "$HEALTH_STATUS" = "unhealthy" ]; then
        echo "$SERVICE_NAME is unhealthy, attempting recovery..."

        # Try restarting the service
        docker-compose restart "$SERVICE_NAME"

        # Wait for the service to start
        sleep "$RETRY_INTERVAL"
    else
        echo "$SERVICE_NAME health status unknown: $HEALTH_STATUS"
        sleep "$RETRY_INTERVAL"
    fi
done

# The last iteration may have restarted the service; check the outcome once
# more so a successful final restart is not reported as a failure.
if [ "$(get_health_status)" = "healthy" ]; then
    echo "$SERVICE_NAME is healthy"
    exit 0
fi

echo "Failed to recover $SERVICE_NAME after $MAX_RETRIES attempts"
exit 1

最佳实践

1. 健康检查设计原则

services:
  app:
    image: myapp
    healthcheck:
      # Keep the probe lightweight so it does not affect performance
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:3000/health"

      # Choose sensible timing values
      interval: 30s        # not too frequent
      timeout: 10s         # leave enough time to respond
      retries: 3           # tolerate occasional failures
      start_period: 60s    # give the application enough time to start
      start_interval: 5s   # check more often during startup

2. 健康检查端点实现

// Express.js health check endpoint.
// Responds with a JSON report; the status is 200 when every check is
// 'healthy' and 503 otherwise. `message` holds the reason of the last
// failing check, or 'OK'.
app.get('/health', async (req, res) => {
  const report = {
    uptime: process.uptime(),
    message: 'OK',
    timestamp: Date.now(),
    checks: {
      database: 'unknown',
      redis: 'unknown',
      memory: 'unknown'
    }
  };

  // Connectivity probes, executed sequentially in this order
  const probes = [
    ['database', () => db.query('SELECT 1'), 'Database connection failed'],
    ['redis', () => redis.ping(), 'Redis connection failed']
  ];

  for (const [name, probe, failureMessage] of probes) {
    try {
      await probe();
      report.checks[name] = 'healthy';
    } catch (error) {
      report.checks[name] = 'unhealthy';
      report.message = failureMessage;
    }
  }

  // Heap usage of this process, as a percentage of its allocated heap
  const { heapUsed, heapTotal } = process.memoryUsage();
  const heapPercent = (heapUsed / heapTotal) * 100;
  if (heapPercent < 90) {
    report.checks.memory = 'healthy';
  } else {
    report.checks.memory = 'unhealthy';
  }

  if (heapPercent >= 90) {
    report.message = 'High memory usage';
  }

  // Any check that is not 'healthy' turns the response into a 503
  const anyFailing = Object.values(report.checks).some(state => state !== 'healthy');

  res.status(anyFailing ? 503 : 200).json(report);
});

3. 监控集成

version: "3.8"

services:
  # Application exposing /health for the probe and /metrics for scraping
  app:
    image: myapp
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:3000/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    labels:
      prometheus.io/scrape: "true"
      prometheus.io/port: "3000"
      prometheus.io/path: "/metrics"
      health.check.endpoint: "/health"
      health.check.interval: "30s"

  # Prometheus server using the configuration file mounted from the project
  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - "./prometheus.yml:/etc/prometheus/prometheus.yml:ro"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/etc/prometheus/console_libraries"
      - "--web.console.templates=/etc/prometheus/consoles"

总结

Docker Compose 健康检查是确保容器化应用稳定运行的关键机制。通过合理配置健康检查,可以实现:

  1. 自动故障检测: 及时发现服务异常
  2. 依赖管理: 确保服务按正确顺序启动
  3. 自动恢复: 配合重启策略、恢复脚本或编排器实现故障自愈(重启策略本身只在容器退出时触发,不会仅因 unhealthy 状态而重启容器)
  4. 监控集成: 与监控系统集成实现全面监控
  5. 负载均衡: 配合负载均衡器实现故障转移

关键要点:

  • 设计轻量级、快速的健康检查
  • 合理设置检查参数(间隔、超时、重试)
  • 实现多层次的健康检查(进程、网络、业务逻辑)
  • 建立完善的监控和告警机制
  • 制定故障恢复和转移策略

正确的健康检查配置是构建高可用容器化应用的基础。

powered by Gitbook© 2025 编外计划 | 最后修改: 2025-08-29 15:40:15

results matching ""

    No results matching ""