Docker Compose 健康检查

概述

健康检查是确保容器化应用稳定运行的重要机制。Docker Compose 提供了丰富的健康检查功能，包括容器健康状态监控、服务依赖管理、自动重启策略等。本文将详细介绍如何在 Docker Compose 中配置和使用健康检查。

健康检查基础

1. 健康检查状态

starting: 容器处于启动阶段，健康检查已在执行但尚未得到首次成功结果（启动宽限期内的失败不计入重试次数）
healthy: 健康检查通过
unhealthy: 健康检查失败
none: 未配置健康检查

2. 健康检查配置参数

# Options accepted by a service-level healthcheck.
healthcheck:
  # Command run inside the container to probe it.
  test:
    - CMD
    - curl
    - "-f"
    - "http://localhost/health"
  interval: 30s       # check interval
  timeout: 10s        # timeout for a single check
  retries: 3          # retry count
  start_period: 40s   # startup grace period
  start_interval: 5s  # check interval during the start period

基本健康检查配置

1. HTTP 健康检查

# Three services, each probing its own HTTP endpoint from inside the container.
version: '3.8'

services:
  # nginx container probed with curl in exec form.
  web:
    image: nginx:alpine
    ports:
      - "80:80"
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Locally built API probed with wget (--spider: no body is downloaded).
  api:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - CMD
        - wget
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:3000/api/health"
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s

  # Shell form: the command string is run through the container's shell.
  app_with_custom_check:
    image: myapp
    healthcheck:
      test:
        - CMD-SHELL
        - "curl -f http://localhost:8080/health || exit 1"
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 2m

2. 数据库健康检查

# Health checks that use each data store's own client tool.
services:
  # PostgreSQL: pg_isready against the configured user and database.
  postgres:
    image: postgres:13-alpine
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
    healthcheck:
      test:
        - CMD-SHELL
        - "pg_isready -U user -d myapp"
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # MySQL: mysqladmin ping against the local server.
  mysql:
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: rootpassword
      MYSQL_DATABASE: myapp
    healthcheck:
      test:
        - CMD
        - mysqladmin
        - ping
        - "-h"
        - localhost
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # MongoDB: ping admin command evaluated by the mongo shell.
  mongodb:
    image: mongo:4.4
    healthcheck:
      test:
        - CMD
        - mongo
        - "--eval"
        - "db.adminCommand('ping')"
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # Redis: redis-cli ping.
  redis:
    image: redis:6-alpine
    healthcheck:
      test:
        - CMD
        - redis-cli
        - ping
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 30s

3. 应用程序健康检查

# Health checks implemented as small programs shipped with each application.
services:
  # Node.js: runs healthcheck.js with the node binary.
  node_app:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - CMD
        - node
        - healthcheck.js
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Python: runs health_check.py with the python interpreter.
  python_app:
    build: .
    ports:
      - "5000:5000"
    healthcheck:
      test:
        - CMD
        - python
        - health_check.py
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Java: runs the HealthCheck class from the /app classpath.
  java_app:
    image: openjdk:11-jre
    ports:
      - "8080:8080"
    healthcheck:
      test:
        - CMD
        - java
        - "-cp"
        - /app
        - HealthCheck
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 120s

高级健康检查

1. 多层健康检查

version: '3.8'

services:
  # Application container checked at four levels: process, port, HTTP, database.
  web_app:
    build: .
    ports:
      - "3000:3000"
    healthcheck:
      # List form: the first item selects the mode (CMD-SHELL = run through the
      # container's shell) and the second item is the whole script. A plain
      # string value is already treated as CMD-SHELL, so putting the literal
      # word "CMD-SHELL" inside such a string makes the shell look for a
      # command with that name and every check fails.
      test:
        - CMD-SHELL
        - |
          # Process check. The [n] bracket stops pgrep -f from matching the
          # health-check shell itself, whose command line contains this script.
          pgrep -f "[n]ode server.js" > /dev/null || exit 1

          # Listening-port check
          netstat -ln | grep ":3000" > /dev/null || exit 1

          # HTTP response check
          curl -f http://localhost:3000/health || exit 1

          # Database connectivity check (through the application)
          curl -f http://localhost:3000/api/db-health || exit 1
      interval: 30s
      timeout: 15s
      retries: 3
      start_period: 60s
    # Start only after both backing services report healthy.
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy

  db:
    image: postgres:13-alpine
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=password
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U user -d myapp"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  redis:
    image: redis:6-alpine
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 30s

2. 自定义健康检查脚本

# Runs an external script as the health check.
services:
  app:
    build: .
    volumes:
      # Host script mounted read-only at /health-check.sh inside the container.
      - ./health-check.sh:/health-check.sh:ro
    healthcheck:
      test:
        - CMD
        - "/health-check.sh"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

#!/bin/bash
# health-check.sh
#
# Container health probe. Runs five checks in order (process, memory, disk,
# HTTP endpoint, database endpoint), prints the reason for the first failure
# and exits 1; exits 0 only when every check passes.

set -e

# Application process check. The matching PIDs are collected once,
# comma-separated (the list form `ps -p` accepts), and reused below.
if ! APP_PIDS=$(pgrep -d, -f "myapp"); then
    echo "Application process not running"
    exit 1
fi

# Memory check. Only the %mem column is requested (the trailing "=" drops the
# header), so a command line containing spaces cannot shift the field that is
# read. Values of all matching processes are summed into one number, and awk
# does the floating-point comparison so `bc` is not required in the image.
MEM_USAGE=$(ps -o %mem= -p "$APP_PIDS" | awk '{ total += $1 } END { printf "%.1f", total }')
if awk -v mem="$MEM_USAGE" 'BEGIN { exit !(mem > 80) }'; then
    echo "Memory usage too high: $MEM_USAGE%"
    exit 1
fi

# Disk check. -P keeps each filesystem on a single output line, so row 2,
# column 5 is always the use percentage.
DISK_USAGE=$(df -P /app | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
if [ "$DISK_USAGE" -gt 90 ]; then
    echo "Disk usage too high: $DISK_USAGE%"
    exit 1
fi

# HTTP endpoint check
if ! curl -f -s http://localhost:3000/health > /dev/null; then
    echo "Health endpoint not responding"
    exit 1
fi

# Database connectivity check (through the application's status endpoint)
if ! curl -f -s http://localhost:3000/api/db-status > /dev/null; then
    echo "Database connection failed"
    exit 1
fi

echo "All health checks passed"
exit 0

3. 条件依赖启动

# Startup ordering driven by health: each tier waits until the tier it
# depends on reports healthy.
version: '3.8'

services:
  # --- Base services ---
  database:
    image: postgres:13-alpine
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
    healthcheck:
      test:
        - CMD-SHELL
        - "pg_isready -U user -d myapp"
      interval: 5s
      timeout: 3s
      retries: 5
      start_period: 30s

  cache:
    image: redis:6-alpine
    healthcheck:
      test:
        - CMD
        - redis-cli
        - ping
      interval: 5s
      timeout: 3s
      retries: 3
      start_period: 10s

  # --- Application service: waits for the base services to be healthy ---
  backend:
    build: ./backend
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:8000/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # --- Frontend: waits for the backend to be healthy ---
  frontend:
    build: ./frontend
    ports:
      - "80:80"
    depends_on:
      backend:
        condition: service_healthy
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  # --- Worker: waits for database, cache and backend to be healthy ---
  worker:
    build: ./worker
    depends_on:
      database:
        condition: service_healthy
      cache:
        condition: service_healthy
      backend:
        condition: service_healthy
    healthcheck:
      test:
        - CMD
        - python
        - worker_health.py
      interval: 60s
      timeout: 30s
      retries: 3
      start_period: 120s

健康检查监控

1. 健康状态日志

services:
  # Appends the outcome of every health check, with a timestamp, to
  # /var/log/health.log on the health_logs volume.
  app:
    image: myapp
    healthcheck:
      # "$$" is Compose's escape for a literal "$": without it Compose would
      # try to interpolate the expression when loading the file. The echo
      # strings use double quotes so the shell actually expands $(date);
      # inside single quotes the text "$(date)" would be logged verbatim.
      test:
        - CMD-SHELL
        - |
          if curl -f http://localhost:3000/health; then
            echo "Health check passed at $$(date)" >> /var/log/health.log
          else
            echo "Health check failed at $$(date)" >> /var/log/health.log
            exit 1
          fi
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    volumes:
      - health_logs:/var/log

volumes:
  health_logs:

2. 健康检查指标收集

version: '3.8'

services:
  # Application whose health and metrics are being observed.
  app:
    image: myapp
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    labels:
      - "prometheus.io/scrape=true"
      - "prometheus.io/port=3000"
      - "prometheus.io/path=/metrics"

  # Sidecar that polls the Docker daemon every 30 seconds and pushes one
  # health gauge per container to a Prometheus Pushgateway.
  health_monitor:
    image: alpine
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    # Exec-form command: the script is handed to `sh -c` as a single argument.
    # Every shell "$" is written "$$" because Compose interpolates "$name"
    # itself when loading the file; unescaped, $container and $health_status
    # would be replaced by (normally empty) host environment values before
    # the shell ever runs.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache docker-cli curl
        while true; do
          # Print the containers that currently report a health state
          docker ps --format "table {{.Names}}\t{{.Status}}" | grep -E "(healthy|unhealthy)"

          # Report the health state of every running container
          for container in $$(docker ps --format "{{.Names}}"); do
            health_status=$$(docker inspect --format="{{.State.Health.Status}}" "$$container" 2>/dev/null || echo "none")
            echo "Container: $$container, Health: $$health_status"

            # Push to the Prometheus Pushgateway. The container name is part
            # of the grouping key in the URL, so each container gets its own
            # metric group; pushing every container to the same group would
            # overwrite the previous container's sample.
            echo "container_health_status $$([ "$$health_status" = "healthy" ] && echo 1 || echo 0)" | \
              curl -X POST --data-binary @- "http://pushgateway:9091/metrics/job/health_monitor/container/$$container"
          done

          sleep 30
        done
    depends_on:
      - app

3. 健康检查告警

services:
  # Alertmanager instance that receives the alerts posted below.
  alertmanager:
    image: prom/alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'

  # Sidecar that looks for unhealthy containers every 60 seconds and posts
  # one critical alert per container to Alertmanager.
  health_alerter:
    image: alpine
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    # Exec-form command: the script is handed to `sh -c` as a single argument.
    # Every shell "$" is written "$$" because Compose interpolates "$name"
    # itself when loading the file; unescaped, $container would be replaced
    # by a (normally empty) host environment value.
    command:
      - sh
      - "-c"
      - |
        apk add --no-cache docker-cli curl jq
        while true; do
          # Names of the containers currently reported as unhealthy
          unhealthy_containers=$$(docker ps --filter "health=unhealthy" --format "{{.Names}}")

          if [ -n "$$unhealthy_containers" ]; then
            for container in $$unhealthy_containers; do
              # Post the alert through the v2 API; the v1 endpoint
              # (/api/v1/alerts) is no longer served by current releases.
              curl -X POST http://alertmanager:9093/api/v2/alerts \
                -H "Content-Type: application/json" \
                -d "[{
                  \"labels\": {
                    \"alertname\": \"ContainerUnhealthy\",
                    \"container\": \"$$container\",
                    \"severity\": \"critical\"
                  },
                  \"annotations\": {
                    \"summary\": \"Container $$container is unhealthy\",
                    \"description\": \"Container $$container has failed health checks\"
                  }
                }]"
            done
          fi

          sleep 60
        done

应用程序健康检查实现

1. Node.js 健康检查

// healthcheck.js
//
// Stand-alone probe run by the container health check. Exits with status 0
// when every check passes and with status 1 (after logging the reason) on the
// first failure.
const http = require('http');

/**
 * Runs the checks in order: HTTP health endpoint, database health endpoint,
 * heap usage. Terminates the process with the resulting exit code.
 */
async function healthCheck() {
  try {
    // HTTP service check
    const response = await makeRequest('http://localhost:3000/api/health');
    if (response.statusCode !== 200) {
      throw new Error(`HTTP health check failed: ${response.statusCode}`);
    }

    // Database connectivity check (through the application)
    const dbResponse = await makeRequest('http://localhost:3000/api/db-health');
    if (dbResponse.statusCode !== 200) {
      throw new Error('Database health check failed');
    }

    // Heap usage check.
    // NOTE(review): process.memoryUsage() describes this probe process, not
    // the server being probed - confirm this is the intended signal.
    const memUsage = process.memoryUsage();
    const memUsagePercent = (memUsage.heapUsed / memUsage.heapTotal) * 100;
    if (memUsagePercent > 90) {
      throw new Error(`Memory usage too high: ${memUsagePercent.toFixed(2)}%`);
    }

    console.log('Health check passed');
    process.exit(0);
  } catch (error) {
    console.error('Health check failed:', error.message);
    process.exit(1);
  }
}

/**
 * Issues an HTTP GET request.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<http.IncomingMessage>} Resolves with the response as soon
 *   as its headers arrive; rejects on a connection error or when the socket
 *   stays idle for 5 seconds.
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    const req = http.get(url, (res) => {
      // Only the status code is needed; drain the body so the socket is released.
      res.resume();
      resolve(res);
    });
    req.on('error', reject);
    req.setTimeout(5000, () => {
      // destroy(err) emits 'error' with this error, so the promise is settled
      // exactly once, through the handler registered above.
      req.destroy(new Error('Request timeout'));
    });
  });
}

healthCheck();

2. Python 健康检查

#!/usr/bin/env python3
# health_check.py

import sys
import requests
import psutil
import time
from urllib.parse import urljoin

def check_http_endpoint(url, timeout=5):
    """Probe an HTTP endpoint with a GET request.

    Args:
        url: Address to request.
        timeout: Value passed as the ``timeout`` argument of ``requests.get``.

    Returns:
        True when the request completes and ``raise_for_status()`` does not
        raise; False, after printing the reason, on any
        ``requests.RequestException``.
    """
    try:
        requests.get(url, timeout=timeout).raise_for_status()
    except requests.RequestException as exc:
        print(f"HTTP check failed: {exc}")
        return False
    return True

def check_database_connection():
    """Check database connectivity through the application's db-health endpoint.

    Returns:
        True when the endpoint answers without an HTTP error status; False,
        after printing the reason, on any ``requests.RequestException``.
    """
    db_health_url = 'http://localhost:5000/api/db-health'
    try:
        requests.get(db_health_url, timeout=5).raise_for_status()
    except requests.RequestException as exc:
        print(f"Database check failed: {exc}")
        return False
    return True

def check_system_resources():
    """Check memory and root-filesystem usage against a 90 % limit.

    Returns:
        False, after printing which resource is over the limit, when memory
        usage or disk usage of ``/`` exceeds 90 %; True otherwise.
    """
    # Memory usage as reported by psutil
    mem_percent = psutil.virtual_memory().percent
    if mem_percent > 90:
        print(f"Memory usage too high: {mem_percent}%")
        return False

    # Disk usage of the root filesystem, computed from used / total
    root_usage = psutil.disk_usage('/')
    disk_percent = (root_usage.used / root_usage.total) * 100
    if disk_percent > 90:
        print(f"Disk usage too high: {disk_percent:.2f}%")
        return False

    return True

def check_process_running(process_name):
    """Report whether any process's command line contains ``process_name``.

    Args:
        process_name: Substring searched for in the space-joined command line
            of each process.

    Returns:
        True as soon as one matching process is found, otherwise False.
    """
    for candidate in psutil.process_iter(['pid', 'name', 'cmdline']):
        try:
            # A missing command line is treated as an empty one.
            argv = candidate.info['cmdline'] or []
            if process_name in ' '.join(argv):
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Skip processes that cannot be inspected.
            continue
    return False

def main():
    """Run every health check, print one result line each, and exit.

    Exits with status 0 when all checks pass and with status 1 when at least
    one check returns a falsy value or raises.
    """
    # (label, zero-argument callable) pairs, executed in this order.
    checks = (
        ('HTTP endpoint', lambda: check_http_endpoint('http://localhost:5000/health')),
        ('Database connection', check_database_connection),
        ('System resources', check_system_resources),
        ('Application process', lambda: check_process_running('python app.py')),
    )

    failures = 0

    for label, probe in checks:
        try:
            if not probe():
                print(f"✗ {label} failed")
                failures += 1
            else:
                print(f"✓ {label} passed")
        except Exception as exc:
            # An exception inside a check counts as a failure of that check.
            print(f"✗ {label} error: {exc}")
            failures += 1

    if failures:
        print("Some health checks failed")
        sys.exit(1)

    print("All health checks passed")
    sys.exit(0)


if __name__ == '__main__':
    main()

3. Java 健康检查

// HealthCheck.java
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.management.MemoryUsage;

/**
 * Stand-alone probe run by the container health check.
 *
 * <p>Runs three checks in order (HTTP endpoint, heap usage, thread count) and
 * terminates the JVM with status 0 when all pass, or with status 1 after
 * printing the reason for the first failure.
 */
public class HealthCheck {
    private static final String HEALTH_ENDPOINT = "http://localhost:8080/actuator/health";
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT = 5000;
    /** Highest accepted used/max heap ratio. */
    private static final double MAX_MEMORY_USAGE = 0.9;

    public static void main(String[] args) {
        try {
            // HTTP endpoint check
            if (!checkHttpEndpoint()) {
                System.err.println("HTTP health check failed");
                System.exit(1);
            }

            // Heap usage check
            if (!checkMemoryUsage()) {
                System.err.println("Memory usage check failed");
                System.exit(1);
            }

            // Thread count check
            if (!checkThreads()) {
                System.err.println("Thread check failed");
                System.exit(1);
            }

            System.out.println("All health checks passed");
            System.exit(0);

        } catch (Exception e) {
            System.err.println("Health check error: " + e.getMessage());
            System.exit(1);
        }
    }

    /**
     * Sends a GET request to {@link #HEALTH_ENDPOINT}.
     *
     * @return true only when the response code is 200; false on any other
     *         code or on an I/O error (the error is printed to stderr)
     */
    private static boolean checkHttpEndpoint() {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(HEALTH_ENDPOINT);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);

            return connection.getResponseCode() == 200;

        } catch (IOException e) {
            System.err.println("HTTP check failed: " + e.getMessage());
            return false;
        } finally {
            // Release the connection on every path.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Compares heap usage with {@link #MAX_MEMORY_USAGE}.
     *
     * <p>NOTE(review): the MemoryMXBean describes the JVM running this probe,
     * not the application JVM - confirm this is the intended signal.
     *
     * @return false when used/max exceeds the limit, true otherwise
     */
    private static boolean checkMemoryUsage() {
        MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();
        MemoryUsage heapUsage = memoryBean.getHeapMemoryUsage();

        // getMax() returns -1 when the maximum is undefined; no ratio can be
        // computed then, so the check passes explicitly instead of relying on
        // a negative quotient.
        long maxHeap = heapUsage.getMax();
        if (maxHeap <= 0) {
            return true;
        }

        double usageRatio = (double) heapUsage.getUsed() / maxHeap;

        if (usageRatio > MAX_MEMORY_USAGE) {
            System.err.println(String.format("Memory usage too high: %.2f%%", usageRatio * 100));
            return false;
        }

        return true;
    }

    /**
     * Compares {@code Thread.activeCount()} with a fixed limit.
     *
     * @return false when more than 1000 threads are active, true otherwise
     */
    private static boolean checkThreads() {
        int activeThreads = Thread.activeCount();
        int maxThreads = 1000; // adjust to the application

        if (activeThreads > maxThreads) {
            System.err.println(String.format("Too many active threads: %d", activeThreads));
            return false;
        }

        return true;
    }
}

故障恢复策略

1. 自动重启配置

version: '3.8'

services:
  # NOTE: a restart policy reacts to the container's process exiting. An
  # unhealthy status alone does not stop or restart a container under plain
  # Docker, so the application (or an external watcher) must exit or restart
  # the container for these policies to take effect.
  web:
    image: myapp
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s

  db:
    image: postgres:13-alpine
    restart: unless-stopped
    healthcheck:
      # Probe with the user and database configured below. A bare pg_isready
      # connects with the default role and database name instead, which is
      # inconsistent with the other PostgreSQL checks in this guide.
      test: ["CMD-SHELL", "pg_isready -U user -d myapp"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    environment:
      - POSTGRES_DB=myapp
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=password

2. 故障转移配置

# Primary/backup application instances behind a Traefik load balancer.
version: '3.8'

services:
  # --- Primary application instance ---
  app_primary:
    image: myapp
    ports:
      - "3000:3000"
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:3000/health"
      interval: 10s
      timeout: 5s
      retries: 2
      start_period: 30s
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.app.rule=Host(`app.local`)"
      - "traefik.http.services.app.loadbalancer.server.port=3000"

  # --- Backup application instance (only started with the "backup" profile) ---
  app_backup:
    image: myapp
    ports:
      - "3001:3000"
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:3000/health"
      interval: 10s
      timeout: 5s
      retries: 2
      start_period: 30s
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.app-backup.rule=Host(`app.local`)"
      - "traefik.http.routers.app-backup.priority=1"
      - "traefik.http.services.app-backup.loadbalancer.server.port=3000"
    profiles:
      - backup

  # --- Load balancer ---
  traefik:
    image: traefik:v2.5
    ports:
      - "80:80"
      - "8080:8080"
    volumes:
      # Read-only access to the Docker socket for the Docker provider.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command:
      - --api.insecure=true
      - --providers.docker=true
      - --providers.docker.exposedbydefault=false
      - --entrypoints.web.address=:80

3. 健康检查恢复脚本

#!/bin/bash
# recovery.sh
#
# Usage: recovery.sh [service] [max_retries] [retry_interval_seconds]
#
# Polls the health status of a Compose service and restarts it while it is
# unhealthy. Exits 0 as soon as the service reports healthy, 1 when it is
# still not healthy after the configured number of attempts.

SERVICE_NAME=${1:-"app"}
MAX_RETRIES=${2:-3}
RETRY_INTERVAL=${3:-30}

# Prints the health status of the service's container. Prints nothing when the
# service has no container or the status cannot be read.
get_health_status() {
    local container_ids
    container_ids=$(docker-compose ps -q "$SERVICE_NAME" 2>/dev/null)

    # Without this guard `docker inspect` would be invoked with no arguments.
    if [ -z "$container_ids" ]; then
        return 0
    fi

    # Left unquoted on purpose: one ID per word when the service is scaled.
    docker inspect --format='{{.State.Health.Status}}' $container_ids 2>/dev/null
}

for i in $(seq 1 "$MAX_RETRIES"); do
    echo "Checking health of $SERVICE_NAME (attempt $i/$MAX_RETRIES)..."

    # Current container health status
    HEALTH_STATUS=$(get_health_status)

    if [ "$HEALTH_STATUS" = "healthy" ]; then
        echo "$SERVICE_NAME is healthy"
        exit 0
    elif [ "$HEALTH_STATUS" = "unhealthy" ]; then
        echo "$SERVICE_NAME is unhealthy, attempting recovery..."

        # Try to recover by restarting the service
        docker-compose restart "$SERVICE_NAME"
    else
        echo "$SERVICE_NAME health status unknown: $HEALTH_STATUS"
    fi

    # Give the service time to start before the next look
    sleep "$RETRY_INTERVAL"
done

# The last attempt may have restarted the service successfully; check once
# more so that a recovery on the final attempt is not reported as a failure.
HEALTH_STATUS=$(get_health_status)
if [ "$HEALTH_STATUS" = "healthy" ]; then
    echo "$SERVICE_NAME is healthy"
    exit 0
fi

echo "Failed to recover $SERVICE_NAME after $MAX_RETRIES attempts"
exit 1

最佳实践

1. 健康检查设计原则

# Annotated example of the design guidelines for a health check.
services:
  app:
    image: myapp
    healthcheck:
      # Keep the probe lightweight so it does not affect performance.
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:3000/health"

      # Choose sensible timing values.
      interval: 30s        # not too frequent
      timeout: 10s         # leave enough time to respond
      retries: 3           # tolerate an occasional failure
      start_period: 60s    # give the application enough time to start
      start_interval: 5s   # check more often during startup

2. 健康检查端点实现

// Express.js health check endpoint.
// Responds with a JSON report; the status is 200 when every entry under
// `checks` is 'healthy' and 503 otherwise.
app.get('/health', async (req, res) => {
  const report = {
    uptime: process.uptime(),
    message: 'OK',
    timestamp: Date.now(),
    checks: {
      database: 'unknown',
      redis: 'unknown',
      memory: 'unknown'
    }
  };

  // Runs one asynchronous probe and records its outcome in the report.
  const runProbe = async (name, probe, failureMessage) => {
    try {
      await probe();
      report.checks[name] = 'healthy';
    } catch (error) {
      report.checks[name] = 'unhealthy';
      report.message = failureMessage;
    }
  };

  // Database probe, then Redis probe (sequential, in this order).
  await runProbe('database', () => db.query('SELECT 1'), 'Database connection failed');
  await runProbe('redis', () => redis.ping(), 'Redis connection failed');

  // Heap usage probe: unhealthy from 90 % of the current heap total upward.
  const { heapUsed, heapTotal } = process.memoryUsage();
  const heapPercent = (heapUsed / heapTotal) * 100;
  if (heapPercent < 90) {
    report.checks.memory = 'healthy';
  } else {
    report.checks.memory = 'unhealthy';
  }

  if (heapPercent >= 90) {
    report.message = 'High memory usage';
  }

  // Any failed probe turns the response into a 503.
  const allHealthy = Object.values(report.checks).every((state) => state === 'healthy');

  res.status(allHealthy ? 200 : 503).json(report);
});

3. 监控集成

# Application with a health check plus a Prometheus server.
version: '3.8'

services:
  app:
    image: myapp
    healthcheck:
      test:
        - CMD
        - curl
        - "-f"
        - "http://localhost:3000/health"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # Labels describing the metrics and health endpoints of this service.
    labels:
      - "prometheus.io/scrape=true"
      - "prometheus.io/port=3000"
      - "prometheus.io/path=/metrics"
      - "health.check.endpoint=/health"
      - "health.check.interval=30s"

  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      # Configuration file from the project directory, mounted read-only.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'

总结

Docker Compose 健康检查是确保容器化应用稳定运行的关键机制。通过合理配置健康检查，可以实现：

自动故障检测: 及时发现服务异常
依赖管理: 确保服务按正确顺序启动
自动恢复: 配合重启策略或恢复脚本实现故障自愈（注意：单机 Docker 不会自动重启仅处于 unhealthy 状态的容器，重启策略只在容器进程退出时生效）
监控集成: 与监控系统集成实现全面监控
负载均衡: 配合负载均衡器实现故障转移

关键要点：

设计轻量级、快速的健康检查
合理设置检查参数（间隔、超时、重试）
实现多层次的健康检查（进程、网络、业务逻辑）
建立完善的监控和告警机制
制定故障恢复和转移策略

正确的健康检查配置是构建高可用容器化应用的基础。

健康检查

Docker Compose 健康检查

概述

健康检查基础

1. 健康检查状态

2. 健康检查配置参数

基本健康检查配置

1. HTTP 健康检查

2. 数据库健康检查

3. 应用程序健康检查

高级健康检查

1. 多层健康检查

2. 自定义健康检查脚本

3. 条件依赖启动

健康检查监控

1. 健康状态日志

2. 健康检查指标收集

3. 健康检查告警

应用程序健康检查实现

1. Node.js 健康检查

2. Python 健康检查

3. Java 健康检查

故障恢复策略

1. 自动重启配置

2. 故障转移配置

3. 健康检查恢复脚本

最佳实践

1. 健康检查设计原则

2. 健康检查端点实现

3. 监控集成

总结

results matching ""

No results matching ""