Nginx 上游服务器健康检查

Upstream Server Health Checks

概述

健康检查是确保负载均衡系统稳定性的关键机制。Nginx 支持被动和主动两种健康检查方式:被动检查内置于开源版 Nginx,根据真实请求的失败情况判断后端状态;主动检查则需要 NGINX Plus 或第三方模块,通过定期发送探测请求来发现故障。两种方式都能自动检测后端服务器的状态,并将故障服务器暂时移出负载均衡池。本文将详细介绍各种健康检查的配置方法和最佳实践。

1. 健康检查基础概念

1.1 健康检查类型

健康检查类型:
├── 被动健康检查 (Passive Health Checks)
│   ├── 基于真实请求的错误监测
│   ├── 失败计数和超时检测
│   └── 自动故障转移
└── 主动健康检查 (Active Health Checks)
    ├── 定期发送探测请求
    ├── 自定义检查端点
    └── 更快的故障检测

1.2 健康状态管理

服务器状态:
├── 健康 (Healthy) - 正常处理请求
├── 不健康 (Unhealthy) - 暂时移除
├── 备份 (Backup) - 仅在主服务器都故障时使用
└── 下线 (Down) - 管理员手动下线

2. 被动健康检查

2.1 基本配置

# Passive health checks: a server that fails max_fails times within
# fail_timeout is taken out of rotation for fail_timeout.
upstream backend {
    server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=5 fail_timeout=60s weight=2;
    # Only receives traffic when every primary server is unavailable.
    server 192.168.1.13:8080 backup;
}

server {
    listen 80;
    server_name example.com;

    location / {
        proxy_pass http://backend;

        # Conditions under which the request is handed to the next server.
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
        # Upper bound on attempts; the first attempt is included in the count.
        proxy_next_upstream_tries 3;
        # Overall time budget measured from the start of the first attempt.
        proxy_next_upstream_timeout 10s;

        # Must stay below proxy_next_upstream_timeout. With the 60s default a
        # connect timeout would exhaust the 10s budget above and the request
        # would fail without ever being retried on another server.
        proxy_connect_timeout 3s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

2.2 详细失败条件配置

# Per-server passive health check thresholds.
upstream detailed_backend {
    server 192.168.1.10:8080 max_fails=2 fail_timeout=20s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=1 fail_timeout=10s;
}

server {
    listen 80;
    server_name detailed.example.com;

    location / {
        proxy_pass http://detailed_backend;

        # Failover conditions. error/timeout/invalid_header always count as
        # failed attempts; http_500/502/503/504 count because they are listed
        # here; http_403/http_404 trigger a retry on the next server but are
        # never counted toward max_fails.
        proxy_next_upstream
            error           # connection-level error
            timeout         # connect/send/read timeout
            invalid_header  # empty or malformed response header
            http_500        # internal server error
            http_502        # bad gateway
            http_503        # service unavailable
            http_504        # gateway timeout
            http_403        # forbidden
            http_404;       # not found (optional)

        # Retry limits. The time budget is measured from the start of the
        # first attempt, so it has to exceed proxy_connect_timeout; otherwise
        # a connect timeout leaves no time for the second attempt.
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 10s;

        # Per-attempt timeouts. A read timeout (10s) consumes the whole
        # failover budget, so slow responses are deliberately not retried.
        proxy_connect_timeout 5s;
        proxy_send_timeout 10s;
        proxy_read_timeout 10s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

2.3 不同服务的健康检查策略

# API tier: a single failure ejects the server, and it is retried after 10s.
upstream api_backend {
    server 192.168.1.20:8080 max_fails=1 fail_timeout=10s;
    server 192.168.1.21:8080 max_fails=1 fail_timeout=10s;
    server 192.168.1.22:8080 max_fails=1 fail_timeout=10s;
}

# File tier: tolerate up to five failures per minute before ejecting.
upstream file_backend {
    server 192.168.1.30:8080 max_fails=5 fail_timeout=60s;
    server 192.168.1.31:8080 max_fails=5 fail_timeout=60s;
}

# Database proxy tier: one failure keeps the server out for five minutes.
# NOTE(review): port 3306 suggests MySQL; if so this upstream belongs in a
# stream {} block rather than http {} - confirm the intended protocol.
upstream db_proxy {
    server 192.168.1.40:3306 max_fails=1 fail_timeout=300s;
    server 192.168.1.41:3306 max_fails=1 fail_timeout=300s backup;
}

server {
    listen 80;
    server_name services.example.com;

    # API traffic: short timeouts, quick failover.
    location /api/ {
        proxy_pass http://api_backend;

        proxy_connect_timeout 2s;
        proxy_send_timeout 5s;
        proxy_read_timeout 10s;

        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 3;
        proxy_next_upstream_timeout 3s;
    }

    # File downloads: long transfers are expected, so timeouts are generous.
    location /files/ {
        proxy_pass http://file_backend;

        proxy_connect_timeout 5s;
        proxy_send_timeout 60s;
        proxy_read_timeout 300s;

        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 30s;
    }
}

3. 主动健康检查 (Nginx Plus)

3.1 基本主动健康检查

upstream active_backend {
    # Shared memory zone so all workers see the same health state.
    zone backend 64k;

    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

server {
    listen 80;
    server_name active.example.com;

    location / {
        proxy_pass http://active_backend;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Active probe: request /health every 5s; mark the server unhealthy
        # after 3 consecutive failures and healthy again after 2 consecutive
        # passes. A probe passes only if it satisfies the server_ok match.
        health_check interval=5s fails=3 passes=2 uri=/health match=server_ok;
    }
}

# Criteria a probe response must meet to count as a pass.
match server_ok {
    status 200;
    header Content-Type ~ "application/json";
    body ~ "\"status\":\"ok\"";
}

3.2 高级主动健康检查

# One upstream group per service type, each with its own shared memory zone.
upstream web_servers {
    zone web_servers 64k;
    server 192.168.1.10:80;
    server 192.168.1.11:80;
}

upstream api_servers {
    zone api_servers 64k;
    server 192.168.1.20:8080;
    server 192.168.1.21:8080;
}

# NOTE(review): port 3306 suggests MySQL, which does not speak HTTP; the HTTP
# health check below would only work against an HTTP status agent - confirm.
upstream database_servers {
    zone database_servers 64k;
    server 192.168.1.30:3306;
    server 192.168.1.31:3306;
}

# Web probe passes on any 2xx/3xx HTML page whose body has no error marker.
match web_ok {
    status 200-399;
    header Content-Type ~ "text/html";
    body !~ "error|Error|ERROR";
}

# API probe passes on a 200 JSON response reporting a healthy status and a
# numeric timestamp.
match api_ok {
    status 200;
    header Content-Type = "application/json";
    body ~ "\"status\":\"healthy\"";
    body ~ "\"timestamp\":[0-9]+";
}

# Database probe passes on a 200 response whose body reports the database ok.
match db_ok {
    status 200;
    body ~ "database.*ok";
}

server {
    listen 80;
    server_name comprehensive.example.com;

    # Web tier: probe the front page every 10s.
    location / {
        proxy_pass http://web_servers;
        health_check interval=10s
                     fails=2
                     passes=1
                     uri=/
                     match=web_ok;
    }

    # API tier: probe /health every 3s; one failure ejects the server.
    location /api/ {
        proxy_pass http://api_servers;
        health_check interval=3s
                     fails=1
                     passes=2
                     uri=/health
                     match=api_ok;
    }

    # Database tier: probe /ping every 30s; reachable from the LAN only.
    location /db-status {
        allow 192.168.1.0/24;
        deny all;

        proxy_pass http://database_servers;
        health_check interval=30s
                     fails=1
                     passes=1
                     uri=/ping
                     match=db_ok;
    }
}

3.3 自定义健康检查端点

# Dedicated server exposing health check endpoints.
server {
    listen 8081;
    server_name localhost;

    # Lightweight liveness endpoint answered by nginx itself.
    location /health {
        access_log off;
        # Set the type with default_type rather than add_header: add_header
        # would append a second Content-Type next to the default one.
        default_type application/json;
        return 200 '{"status":"healthy","timestamp":$msec,"server":"$hostname"}';
    }

    # Detailed endpoint that delegates to the application's own health check.
    location /health/detailed {
        access_log off;

        # Ask the application to verify its dependencies.
        proxy_pass http://127.0.0.1:3000/internal/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout 2s;

        # When the application cannot be reached, answer with the JSON error
        # body from @unhealthy instead of the stock nginx error page.
        error_page 502 503 504 = @unhealthy;
    }

    location @unhealthy {
        # default_type also applies to error responses; add_header without
        # "always" is skipped for a 503 and would leave the type unset.
        default_type application/json;
        return 503 '{"status":"unhealthy","error":"backend unavailable"}';
    }

    # Database health endpoint.
    location /health/database {
        access_log off;

        # Forwarded to an external database check script/service.
        proxy_pass http://127.0.0.1:3001/db-check;
        proxy_connect_timeout 2s;
        proxy_read_timeout 5s;
    }
}

4. 开源Nginx的健康检查方案

4.1 使用第三方模块

# Using the nginx-upstream-fair module
upstream fair_backend {
    fair;  # enable the fair balancing method; NOTE(review): fair is a load-balancing algorithm, not an active health checker - confirm
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

# Active health checks via a third-party module.
# NOTE(review): the check / check_http_send / check_http_expect_alive /
# check_status directives match nginx_upstream_check_module (also bundled
# with Tengine) - confirm which module is actually compiled in.
upstream healthcheck_backend {
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;

    # Probe every 3000 ms with a 1000 ms timeout; 2 passes mark a server up,
    # 5 failures mark it down.
    check interval=3000 rise=2 fall=5 timeout=1000 type=http;
    # Raw HTTP request sent as the probe.
    check_http_send "HEAD /health HTTP/1.0\r\n\r\n";
    # Response classes that count as alive.
    check_http_expect_alive http_2xx http_3xx;
}

server {
    listen 80;
    server_name healthcheck.example.com;

    location / {
        proxy_pass http://healthcheck_backend;
    }

    # Health check status page, restricted to the internal network
    location /status {
        check_status;
        access_log off;
        allow 192.168.1.0/24;
        deny all;
    }
}

4.2 外部健康检查脚本

#!/bin/bash
# nginx-health-monitor.sh
#
# Polls the health endpoint of every upstream server and regenerates the nginx
# upstream config so that only healthy servers remain in the pool. The new
# config is validated with "nginx -t" and rolled back if validation or the
# reload fails, so a bad config is never left in place.

UPSTREAM_SERVERS=(
    "192.168.1.10:8080"
    "192.168.1.11:8080"
    "192.168.1.12:8080"
)

HEALTH_CHECK_URL="/health"
NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/upstream.conf"

# Probe one server.
#   $1 - host:port of the server
# Returns 0 when the health endpoint answers successfully within 5 seconds,
# 1 otherwise (curl -f turns HTTP error statuses into failures).
check_server_health() {
    local server=$1
    local url="http://${server}${HEALTH_CHECK_URL}"

    if curl -f -s --max-time 5 "$url" > /dev/null 2>&1; then
        return 0  # healthy
    else
        return 1  # unhealthy
    fi
}

# Rebuild the upstream config from the current probe results and reload nginx
# when the result differs from the config on disk.
# Returns 0 when nothing changed or the reload succeeded, 1 otherwise.
update_upstream_config() {
    local server tmp_conf backup_conf
    local healthy_count=0
    local had_previous=0

    # Create the temp file next to the target: the final mv is then an atomic
    # rename on the same filesystem, the name is unpredictable, and the suffix
    # keeps it from matching an "include conf.d/*.conf" glob.
    tmp_conf=$(mktemp "${NGINX_UPSTREAM_CONF}.XXXXXX") || {
        echo "Cannot create temporary config file" >&2
        return 1
    }
    backup_conf="${NGINX_UPSTREAM_CONF}.bak"

    echo "upstream backend {" > "$tmp_conf"

    for server in "${UPSTREAM_SERVERS[@]}"; do
        if check_server_health "$server"; then
            echo "    server $server;" >> "$tmp_conf"
            echo "Server $server is healthy"
            healthy_count=$((healthy_count + 1))
        else
            echo "    # server $server;  # unhealthy" >> "$tmp_conf"
            echo "Server $server is unhealthy - removed from pool"
        fi
    done

    echo "}" >> "$tmp_conf"

    # nginx rejects an upstream block without servers, so keep the current
    # config rather than installing one that cannot be loaded.
    if [ "$healthy_count" -eq 0 ]; then
        echo "All upstream servers are unhealthy - keeping current configuration" >&2
        rm -f "$tmp_conf"
        return 1
    fi

    # Nothing to do when the generated config matches the one on disk.
    if cmp -s "$NGINX_UPSTREAM_CONF" "$tmp_conf"; then
        rm -f "$tmp_conf"
        return 0
    fi

    # Keep a copy of the current config so it can be restored on failure.
    if [ -f "$NGINX_UPSTREAM_CONF" ]; then
        cp -p "$NGINX_UPSTREAM_CONF" "$backup_conf" || {
            rm -f "$tmp_conf"
            return 1
        }
        had_previous=1
    fi

    # mktemp creates the file with mode 0600; restore a normal config mode.
    chmod 644 "$tmp_conf"
    mv "$tmp_conf" "$NGINX_UPSTREAM_CONF"

    if nginx -t && nginx -s reload; then
        echo "Nginx configuration updated and reloaded"
        return 0
    fi

    echo "Nginx rejected the new configuration - rolling back" >&2
    if [ "$had_previous" -eq 1 ]; then
        mv "$backup_conf" "$NGINX_UPSTREAM_CONF"
    else
        rm -f "$NGINX_UPSTREAM_CONF"
    fi
    return 1
}

# Main loop: re-evaluate the pool every 30 seconds.
while true; do
    update_upstream_config
    sleep 30
done

5. 健康检查监控和告警

5.1 健康状态日志记录

upstream monitored_backend {
    server 192.168.1.10:8080 max_fails=2 fail_timeout=30s;
    server 192.168.1.11:8080 max_fails=2 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=2 fail_timeout=30s;
}

# Access log format that records which upstream served each request, the
# status it returned and how long it took.
log_format health_check '$remote_addr - [$time_local] "$request" '
                       '$status $body_bytes_sent '
                       '"$upstream_addr" "$upstream_status" '
                       '$upstream_response_time "$upstream_cache_status"';

server {
    listen 80;
    server_name monitored.example.com;

    # Write every request with the upstream details defined above.
    access_log /var/log/nginx/health_check.log health_check;

    location / {
        proxy_pass http://monitored_backend;

        # Diagnostic response headers. "always" is required so they are also
        # present on 4xx/5xx responses, which is when they are needed most;
        # without it add_header only applies to 2xx/3xx responses.
        # These headers reveal internal addresses - strip or restrict them on
        # public-facing servers.
        add_header X-Upstream-Server $upstream_addr always;
        add_header X-Upstream-Status $upstream_status always;
        add_header X-Upstream-Response-Time $upstream_response_time always;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

5.2 健康检查告警脚本

#!/bin/bash
# health-check-alert.sh
#
# Watches the nginx health_check access log and the upstream health endpoints
# and sends e-mail alerts. Expects log lines in the form
#   $remote_addr - [$time_local] "$request" $status $body_bytes_sent ...
# and GNU date (for "date -d").

LOG_FILE="/var/log/nginx/health_check.log"
ALERT_EMAIL="admin@example.com"
ALERT_THRESHOLD=5  # alert when more than this many failures occur within 5 minutes

# Count 5xx responses logged during the last 5 minutes and send a warning
# e-mail when the count exceeds ALERT_THRESHOLD.
check_upstream_health() {
    local window_start error_count recent_errors message

    # Nothing to analyse until nginx has created the log file.
    [ -r "$LOG_FILE" ] || return 0

    window_start=$(( $(date +%s) - 300 ))

    error_count=$(awk -v start_time="$window_start" '
    {
        # Split on double quotes: part[1] holds the address and timestamp,
        # part[2] the request line, part[3] starts with the status code.
        # Positional whitespace fields are not used because the request line
        # itself contains spaces.
        if (split($0, part, "\"") < 3) next
        if (split(part[3], rest, " ") < 1) next
        if (rest[1] !~ /^5[0-9][0-9]$/) next

        # Extract the text between the square brackets.
        lb = index(part[1], "[")
        rb = index(part[1], "]")
        if (lb == 0 || rb <= lb) next
        stamp = substr(part[1], lb + 1, rb - lb - 1)

        # Turn 29/Aug/2025:15:40:15 +0800 into 29 Aug 2025 15:40:15 +0800,
        # a layout that date -d understands.
        sub(/:/, " ", stamp)
        gsub(/\//, " ", stamp)

        # Only hand well-formed timestamps to the shell command below.
        if (stamp !~ /^[0-9]+ [A-Za-z]+ [0-9]+ [0-9:]+ [+-][0-9]+$/) next

        # Convert each distinct timestamp once; many lines share a second.
        if (!(stamp in epoch)) {
            epoch[stamp] = -1
            cmd = "date -d \"" stamp "\" +%s 2>/dev/null"
            if ((cmd | getline parsed) > 0) epoch[stamp] = parsed + 0
            close(cmd)
        }

        if (epoch[stamp] >= start_time) count++
    }
    END { print count + 0 }' "$LOG_FILE")

    if [ "${error_count:-0}" -gt "$ALERT_THRESHOLD" ]; then
        # Anchor on the closing quote of the request so that a body size such
        # as 512 is not mistaken for a 5xx status.
        recent_errors=$(tail -20 "$LOG_FILE" | grep -E '" 5[0-9][0-9] ')

        message="WARNING: Upstream health check failures detected!

Failures in last 5 minutes: $error_count
Time: $(date)
Log file: $LOG_FILE

Recent errors:
$recent_errors"

        echo "$message" | mail -s "Nginx Upstream Health Alert" "$ALERT_EMAIL"

        # Keep a local record of every alert that was sent.
        echo "$(date): Health check alert sent - $error_count failures" >> /var/log/nginx/health_alerts.log
    fi
}

# Probe every upstream server directly and send a critical e-mail listing the
# servers whose health endpoint did not answer within 3 seconds.
check_server_availability() {
    local servers=(
        "192.168.1.10:8080"
        "192.168.1.11:8080"
        "192.168.1.12:8080"
    )

    local failed_servers=()
    local server

    for server in "${servers[@]}"; do
        if ! curl -f -s --max-time 3 "http://$server/health" > /dev/null; then
            failed_servers+=("$server")
        fi
    done

    if [ ${#failed_servers[@]} -gt 0 ]; then
        local message="CRITICAL: Upstream servers are down!

Failed servers: ${failed_servers[*]}
Time: $(date)
Total failed: ${#failed_servers[@]}/${#servers[@]}"

        echo "$message" | mail -s "CRITICAL: Nginx Upstream Servers Down" "$ALERT_EMAIL"
    fi
}

# Main loop: run both checks once a minute.
while true; do
    check_upstream_health
    check_server_availability
    sleep 60
done

6. 故障恢复和自动化

6.1 自动故障恢复

upstream auto_recovery_backend {
    # A short fail_timeout puts an ejected server back into rotation quickly.
    server 192.168.1.10:8080 max_fails=2 fail_timeout=10s;
    server 192.168.1.11:8080 max_fails=2 fail_timeout=10s;
    server 192.168.1.12:8080 max_fails=2 fail_timeout=10s;

    # Backup server, used only when all primaries are unavailable.
    server 192.168.1.20:8080 backup;
}

server {
    listen 80;
    server_name recovery.example.com;

    location / {
        proxy_pass http://auto_recovery_backend;

        # Aggressive retry policy: at most two attempts within five seconds.
        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;

        # Must stay below proxy_next_upstream_timeout. With the 60s default a
        # connect timeout would exhaust the 5s budget above and the second
        # attempt would never happen.
        proxy_connect_timeout 2s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

6.2 集成监控系统

# prometheus.yml - Prometheus scrape configuration
scrape_configs:
  # NOTE(review): port 9113 is presumably the nginx-prometheus-exporter
  # listener - confirm.
  - job_name: 'nginx'
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'localhost:9113'

  # Scrapes the /metrics endpoint served by nginx on port 8080.
  - job_name: 'nginx-upstream'
    scrape_interval: 10s
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - 'localhost:8080'
# Nginx configuration - expose metrics
server {
    listen 8080;
    server_name localhost;

    # Prometheus-format metrics. stub_status itself emits plain text that
    # Prometheus cannot parse, so this location forwards to
    # nginx-prometheus-exporter, which converts the /nginx_status output below.
    # Run the exporter with its scrape URI pointing at
    # http://127.0.0.1:8080/nginx_status (it listens on port 9113 by default).
    location /metrics {
        proxy_pass http://127.0.0.1:9113/metrics;
        access_log off;
        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny all;
    }

    # Raw connection counters consumed by the exporter.
    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny all;
    }
}

7. 性能优化

7.1 健康检查性能优化

# Health check configuration tuned for performance
upstream optimized_backend {
    # Moderate passive check thresholds.
    server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=3 fail_timeout=30s;

    # Connection pool: idle keepalive connections cached per worker.
    keepalive 16;
    keepalive_requests 100;
    keepalive_timeout 60s;
}

server {
    listen 80;
    server_name optimized.example.com;

    location / {
        proxy_pass http://optimized_backend;

        # Upstream keepalive requires HTTP/1.1 and an empty Connection header.
        proxy_http_version 1.1;
        proxy_set_header Connection "";

        # Per-attempt timeouts.
        proxy_connect_timeout 3s;
        proxy_send_timeout 10s;
        proxy_read_timeout 30s;

        # Limited retry: one failover at most. The tries value counts the
        # first attempt too, so 1 would disable failover entirely, and the
        # time budget has to exceed proxy_connect_timeout or a connect timeout
        # leaves no room for the retry.
        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    # Health endpoint reachable only through internal redirects/subrequests.
    location /internal/health {
        internal;
        proxy_pass http://optimized_backend/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout 2s;
    }
}

8. 最佳实践

8.1 健康检查最佳实践

# Example health check configuration for production
upstream production_backend {
    # Weights and failure thresholds follow each server's capacity.
    server 192.168.1.10:8080 weight=3 max_fails=2 fail_timeout=20s;  # high-capacity server
    server 192.168.1.11:8080 weight=2 max_fails=3 fail_timeout=30s;  # mid-capacity server
    server 192.168.1.12:8080 weight=1 max_fails=5 fail_timeout=60s;  # low-capacity server

    # Local backup, used only when all primaries are unavailable.
    server 127.0.0.1:8080 backup max_fails=1 fail_timeout=10s;

    # Connection pool settings.
    keepalive 32;
    keepalive_requests 1000;
    keepalive_timeout 60s;
}

server {
    listen 80;
    server_name production.example.com;

    location / {
        proxy_pass http://production_backend;

        # Failover policy. The time budget is measured from the start of the
        # first attempt and must exceed proxy_connect_timeout; with both at 5s
        # a connect timeout would end the request without trying another
        # server.
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 10s;

        # Per-attempt timeouts.
        proxy_connect_timeout 5s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;

        # Headers the backend needs to see the original request.
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Diagnostic headers, emitted on error responses too ("always").
        add_header X-Upstream-Server $upstream_addr always;
        add_header X-Upstream-Response-Time $upstream_response_time always;
    }
}

小结

通过本文的学习,你应该掌握:

  1. 被动和主动健康检查的区别和配置方法
  2. 不同场景下的健康检查策略选择
  3. 健康检查的监控和告警机制
  4. 故障恢复和自动化处理
  5. 健康检查的性能优化技巧
  6. 生产环境的最佳实践

下一篇文章将介绍Session保持与粘性会话的配置方法。

powered by Gitbook© 2025 编外计划 | 最后修改: 2025-08-29 15:40:15

results matching ""

    No results matching ""