Nginx 上游服务器健康检查

Upstream Server Health Checks

概述

健康检查是确保负载均衡系统稳定性的关键机制。Nginx 支持被动和主动两种健康检查方式：被动健康检查由开源版内置提供；主动健康检查（health_check 指令）仅在 NGINX Plus 中提供，开源版需要借助第三方模块或外部脚本实现。两种方式都能自动检测后端服务器的状态，并将故障服务器暂时从负载均衡池中移除。本文将详细介绍各种健康检查的配置方法和最佳实践。

1. 健康检查基础概念

1.1 健康检查类型

健康检查类型：
├── 被动健康检查 (Passive Health Checks)
│   ├── 基于真实请求的错误监测
│   ├── 失败计数和超时检测
│   └── 自动故障转移
└── 主动健康检查 (Active Health Checks)
    ├── 定期发送探测请求
    ├── 自定义检查端点
    └── 更快的故障检测

1.2 健康状态管理

服务器状态：
├── 健康 (Healthy) - 正常处理请求
├── 不健康 (Unhealthy) - 暂时移除
├── 备份 (Backup) - 仅在主服务器都故障时使用
└── 下线 (Down) - 管理员手动下线

2. 被动健康检查

2.1 基本配置

upstream backend {
    # Passive health-check parameters: max_fails failed attempts within
    # fail_timeout take a server out of rotation for fail_timeout.
    server 192.168.1.10:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.12:8080 weight=2 fail_timeout=60s max_fails=5;

    # Standby server, used only when the primary servers are unavailable.
    server 192.168.1.13:8080 backup;
}

server {
    server_name example.com;
    listen 80;

    location / {
        # Pass the original host and client address to the upstream.
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # What counts as a failed attempt, and how far a request may be retried:
        # at most 3 tries within 10 seconds.
        proxy_next_upstream_timeout 10s;
        proxy_next_upstream_tries 3;
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;

        proxy_pass http://backend;
    }
}

2.2 详细失败条件配置

upstream detailed_backend {
    # Each server carries its own passive-check tolerance.
    server 192.168.1.10:8080 fail_timeout=20s max_fails=2;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.12:8080 fail_timeout=10s max_fails=1;
}

server {
    listen 80;
    server_name detailed.example.com;

    location / {
        proxy_pass http://detailed_backend;

        # Conditions under which the request is passed to the next upstream server.
        proxy_next_upstream
            error           # connection error
            timeout         # connect / send / read timeout
            invalid_header  # empty or invalid response header
            http_500        # internal server error
            http_502        # bad gateway
            http_503        # service unavailable
            http_504        # gateway timeout
            http_403        # forbidden
            http_404;       # not found (optional)

        # Retry limits: at most 2 attempts in total, all started within 5 seconds.
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;

        # Timeouts.
        # The connect timeout must stay below proxy_next_upstream_timeout:
        # once the 5s budget is used up nginx no longer tries another server,
        # so a 5s connect timeout could never fail over.
        # The 10s send/read timeouts still exceed the budget, so a request that
        # hangs after connecting is reported to the client instead of retried.
        proxy_connect_timeout 2s;
        proxy_send_timeout 10s;
        proxy_read_timeout 10s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

2.3 不同服务的健康检查策略

# API servers - fail fast, recover fast
upstream api_backend {
    server 192.168.1.20:8080 fail_timeout=10s max_fails=1;
    server 192.168.1.21:8080 fail_timeout=10s max_fails=1;
    server 192.168.1.22:8080 fail_timeout=10s max_fails=1;
}

# File servers - tolerate more failures before eviction
upstream file_backend {
    server 192.168.1.30:8080 fail_timeout=60s max_fails=5;
    server 192.168.1.31:8080 fail_timeout=60s max_fails=5;
}

# Database proxy - conservative health checking
# NOTE(review): 3306 is normally MySQL's native protocol port; an http-context
# upstream can only proxy HTTP, so this presumably belongs in a stream {}
# block or targets an HTTP gateway - confirm.
upstream db_proxy {
    server 192.168.1.40:3306 fail_timeout=300s max_fails=1;
    server 192.168.1.41:3306 backup fail_timeout=300s max_fails=1;
}

server {
    server_name services.example.com;
    listen 80;

    # API requests - quick failover
    location /api/ {
        proxy_connect_timeout 2s;
        proxy_send_timeout 5s;
        proxy_read_timeout 10s;

        proxy_next_upstream_timeout 3s;
        proxy_next_upstream_tries 3;
        proxy_next_upstream error timeout http_500 http_502 http_503;

        proxy_pass http://api_backend;
    }

    # File downloads - much longer timeouts allowed
    location /files/ {
        proxy_connect_timeout 5s;
        proxy_send_timeout 60s;
        proxy_read_timeout 300s;

        proxy_next_upstream_timeout 30s;
        proxy_next_upstream_tries 2;
        proxy_next_upstream error timeout http_500 http_502 http_503;

        proxy_pass http://file_backend;
    }
}

3. 主动健康检查 (Nginx Plus)

3.1 基本主动健康检查

upstream active_backend {
    # Shared memory zone named "backend" (64k).
    # NOTE(review): presumably required so that active health-check state is
    # shared between worker processes - confirm against the NGINX Plus docs.
    zone backend 64k;  # shared memory zone

    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

server {
    server_name active.example.com;
    listen 80;

    location / {
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        proxy_pass http://active_backend;

        # Active health check:
        #   interval=5s      probe period
        #   fails=3          consecutive failures before a server is unhealthy
        #   passes=2         consecutive successes before it is healthy again
        #   uri=/health      endpoint that is probed
        #   match=server_ok  response must satisfy the server_ok match block
        health_check interval=5s fails=3 passes=2 uri=/health match=server_ok;
    }
}

# Match conditions for the health check: a probe passes only if the response
# has status 200, a Content-Type matching application/json, and a body
# containing "status":"ok".
match server_ok {
    status 200;
    header Content-Type ~ "application/json";
    body ~ "\"status\":\"ok\"";
}

3.2 高级主动健康检查

# Health checks for different kinds of services.
# Each upstream declares its own shared memory zone.
upstream web_servers {
    zone web_servers 64k;
    server 192.168.1.10:80;
    server 192.168.1.11:80;
}

upstream api_servers {
    zone api_servers 64k;
    server 192.168.1.20:8080;
    server 192.168.1.21:8080;
}

# NOTE(review): 3306 is normally MySQL's native protocol port. The HTTP probe
# configured for this upstream (uri=/ping) can only succeed if an HTTP
# service listens on that port - confirm.
upstream database_servers {
    zone database_servers 64k;
    server 192.168.1.30:3306;
    server 192.168.1.31:3306;
}

# Web server probe: any 2xx/3xx status, an HTML content type, and no
# "error" / "Error" / "ERROR" text in the body.
match web_ok {
    status 200-399;
    header Content-Type ~ "text/html";
    body !~ "error|Error|ERROR";
}

# API server probe: status 200, Content-Type exactly application/json, and a
# body containing both "status":"healthy" and a numeric "timestamp".
# NOTE(review): the exact (=) comparison presumably rejects values such as
# "application/json; charset=utf-8" - confirm what the backends send.
match api_ok {
    status 200;
    header Content-Type = "application/json";
    body ~ "\"status\":\"healthy\"";
    body ~ "\"timestamp\":[0-9]+";
}

# Database probe: status 200 and a body matching database.*ok.
match db_ok {
    status 200;
    body ~ "database.*ok";
}

server {
    server_name comprehensive.example.com;
    listen 80;

    # Web tier: probe "/" every 10s, out after 2 failures, back after 1 success.
    location / {
        health_check interval=10s fails=2 passes=1 uri=/ match=web_ok;
        proxy_pass http://web_servers;
    }

    # API tier: probe "/health" every 3s, out after 1 failure, back after 2 successes.
    location /api/ {
        health_check interval=3s fails=1 passes=2 uri=/health match=api_ok;
        proxy_pass http://api_servers;
    }

    # Database proxy status: probe "/ping" every 30s.
    location /db-status {
        # Internal network only.
        allow 192.168.1.0/24;
        deny all;

        health_check interval=30s fails=1 passes=1 uri=/ping match=db_ok;
        proxy_pass http://database_servers;
    }
}

3.3 自定义健康检查端点

# Dedicated endpoints for health checks.
server {
    listen 8081;
    server_name localhost;

    # Simple health endpoint answered by nginx itself.
    location /health {
        access_log off;

        # Set the response type with default_type rather than
        # "add_header Content-Type": add_header appends a second Content-Type
        # header next to the one nginx already emits for "return".
        default_type application/json;
        return 200 '{"status":"healthy","timestamp":$msec,"server":"$hostname"}';
    }

    # Detailed health endpoint, delegated to the application.
    location /health/detailed {
        access_log off;

        # Ask the application to check its own dependencies.
        proxy_pass http://127.0.0.1:3000/internal/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout 2s;

        # If the backend is unreachable, answer with the JSON error below.
        error_page 502 503 504 = @unhealthy;
    }

    location @unhealthy {
        # default_type also applies to error statuses; add_header without
        # "always" would be skipped on this 503 response.
        default_type application/json;
        return 503 '{"status":"unhealthy","error":"backend unavailable"}';
    }

    # Database health endpoint.
    location /health/database {
        access_log off;

        # Forwarded to a separate database check service.
        proxy_pass http://127.0.0.1:3001/db-check;
        proxy_connect_timeout 2s;
        proxy_read_timeout 5s;
    }
}

4. 开源Nginx的健康检查方案

4.1 使用第三方模块

# Uses the third-party nginx-upstream-fair module.
# NOTE(review): upstream-fair is a load-balancing algorithm; it is not clear
# that it performs any health probing of its own - confirm before relying on
# it for failure detection.
upstream fair_backend {
    fair;  # enable the fair algorithm (described in the original as including health checks)
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}

# Active health checks through a third-party module (called
# "nginx-healthcheck-module" in the original).
# NOTE(review): the check / check_http_send / check_http_expect_alive
# directive names look like nginx_upstream_check_module (Tengine) - confirm
# which module the nginx binary is actually built with.
upstream healthcheck_backend {
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;

    # Probe settings: healthy after rise=2 successes, unhealthy after fall=5 failures.
    # NOTE(review): interval and timeout are presumably in milliseconds - confirm.
    check interval=3000 rise=2 fall=5 timeout=1000 type=http;
    # Raw HTTP request sent as the probe.
    check_http_send "HEAD /health HTTP/1.0\r\n\r\n";
    # 2xx and 3xx answers count as alive.
    check_http_expect_alive http_2xx http_3xx;
}

server {
    server_name healthcheck.example.com;
    listen 80;

    location / {
        proxy_pass http://healthcheck_backend;
    }

    # Health-check status page, restricted to the internal network.
    location /status {
        allow 192.168.1.0/24;
        deny all;
        access_log off;

        check_status;
    }
}

4.2 外部健康检查脚本

#!/bin/bash
# nginx-health-monitor.sh
#
# Polls the health endpoint of every upstream server every 30 seconds and
# regenerates the nginx upstream definition so that only healthy servers stay
# in the pool. A new configuration is installed only if nginx accepts it;
# otherwise the previous file is restored.

UPSTREAM_SERVERS=(
    "192.168.1.10:8080"
    "192.168.1.11:8080"
    "192.168.1.12:8080"
)

HEALTH_CHECK_URL="/health"
NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/upstream.conf"
# Kept for compatibility with existing references; the script itself now uses
# mktemp so the temporary file name is not predictable.
TEMP_CONF="/tmp/upstream.conf.tmp"

# Returns 0 if the server answers its health endpoint with a success status
# within 5 seconds, non-zero otherwise.
#   $1 - server as host:port
check_server_health() {
    local server=$1
    local url="http://${server}${HEALTH_CHECK_URL}"

    curl -f -s --max-time 5 "$url" > /dev/null 2>&1
}

# Rebuilds the upstream block from the servers that are currently healthy and
# reloads nginx when the result differs from the installed file.
# Returns non-zero when nothing could be installed.
update_upstream_config() {
    local server
    local tmp_conf backup_conf
    local healthy_count=0
    local had_previous=0

    tmp_conf=$(mktemp) || return 1
    # mktemp creates the file with mode 0600; restore a normal config mode.
    chmod 644 "$tmp_conf"

    echo "upstream backend {" > "$tmp_conf"

    for server in "${UPSTREAM_SERVERS[@]}"; do
        if check_server_health "$server"; then
            echo "    server $server;" >> "$tmp_conf"
            echo "Server $server is healthy"
            healthy_count=$((healthy_count + 1))
        else
            echo "    # server $server;  # unhealthy" >> "$tmp_conf"
            echo "Server $server is unhealthy - removed from pool"
        fi
    done

    echo "}" >> "$tmp_conf"

    # An upstream block without any server is rejected by nginx, and dropping
    # every backend is never useful: keep the current configuration instead.
    if [ "$healthy_count" -eq 0 ]; then
        echo "All upstream servers failed the health check - keeping current configuration" >&2
        rm -f "$tmp_conf"
        return 1
    fi

    # Nothing to do when the generated file matches the installed one.
    if cmp -s "$NGINX_UPSTREAM_CONF" "$tmp_conf"; then
        rm -f "$tmp_conf"
        return 0
    fi

    # Keep a copy of the installed file so a rejected change can be undone.
    backup_conf=$(mktemp) || { rm -f "$tmp_conf"; return 1; }
    if [ -f "$NGINX_UPSTREAM_CONF" ]; then
        cp -p "$NGINX_UPSTREAM_CONF" "$backup_conf"
        had_previous=1
    fi

    mv "$tmp_conf" "$NGINX_UPSTREAM_CONF"

    # nginx -t validates the whole configuration, including the file just
    # installed; reload only when it passes.
    if nginx -t && nginx -s reload; then
        echo "Nginx configuration updated and reloaded"
        rm -f "$backup_conf"
        return 0
    fi

    echo "nginx rejected the new upstream configuration - rolling back" >&2
    if [ "$had_previous" -eq 1 ]; then
        mv "$backup_conf" "$NGINX_UPSTREAM_CONF"
    else
        rm -f "$NGINX_UPSTREAM_CONF" "$backup_conf"
    fi
    return 1
}

# Main loop
while true; do
    update_upstream_config
    sleep 30
done

5. 健康检查监控和告警

5.1 健康状态日志记录

upstream monitored_backend {
    # Identical passive-check settings on every server.
    server 192.168.1.10:8080 fail_timeout=30s max_fails=2;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=2;
    server 192.168.1.12:8080 fail_timeout=30s max_fails=2;
}

# Log format for health monitoring. Fields, in order:
#   client address, "-", [local time], "request line", response status,
#   body bytes sent, "upstream address", "upstream status",
#   upstream response time, "upstream cache status".
# The local time contains a space (before the UTC offset), so
# whitespace-splitting tools see it as two fields.
log_format health_check '$remote_addr - [$time_local] "$request" '
                       '$status $body_bytes_sent '
                       '"$upstream_addr" "$upstream_status" '
                       '$upstream_response_time "$upstream_cache_status"';

server {
    listen 80;
    server_name monitored.example.com;

    # Record per-request upstream details using the health_check log format.
    access_log /var/log/nginx/health_check.log health_check;

    location / {
        proxy_pass http://monitored_backend;

        # Expose upstream details as response headers.
        # "always" is required: without it add_header is skipped on 4xx/5xx
        # responses, which are exactly the ones worth diagnosing.
        # These headers reveal internal addresses; restrict or remove them
        # on public-facing servers.
        add_header X-Upstream-Server $upstream_addr always;
        add_header X-Upstream-Status $upstream_status always;
        add_header X-Upstream-Response-Time $upstream_response_time always;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}

5.2 健康检查告警脚本

#!/bin/bash
# health-check-alert.sh
# Settings shared by the alerting functions in this script.

LOG_FILE="/var/log/nginx/health_check.log"
ALERT_EMAIL="admin@example.com"
ALERT_THRESHOLD=5  # alert when failures within 5 minutes exceed this count

# Counts 5xx responses logged during the last 5 minutes and mails an alert
# when the count exceeds ALERT_THRESHOLD.
#
# Reads:  LOG_FILE, ALERT_THRESHOLD, ALERT_EMAIL
#         ALERT_LOG (optional, defaults to /var/log/nginx/health_alerts.log)
# Expects log lines in the form
#   addr - [dd/Mon/yyyy:HH:MM:SS +zzzz] "request" status bytes ...
# Requires GNU date (for -d).
check_upstream_health() {
    local alert_log="${ALERT_LOG:-/var/log/nginx/health_alerts.log}"

    # A missing or unreadable log must not crash the monitoring loop.
    if [ ! -r "$LOG_FILE" ]; then
        echo "$(date): cannot read $LOG_FILE - skipping 5xx check" >&2
        return 0
    fi

    # Cut-off as a sortable local-time stamp. nginx writes $time_local in the
    # server's local time zone, so comparing local stamps avoids forking
    # "date" once per log line.
    local cutoff
    cutoff=$(date -d '5 minutes ago' '+%Y%m%d%H%M%S') || return 0

    # Count the 5xx responses from the last 5 minutes.
    local error_count
    error_count=$(awk -v cutoff="$cutoff" '
    BEGIN {
        n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
        for (i = 1; i <= n; i++) month[names[i]] = sprintf("%02d", i)
    }
    {
        # The timestamp is the text between the first "[" and "]".
        lb = index($0, "[")
        rb = index($0, "]")
        if (lb == 0 || rb <= lb) next

        # dd/Mon/yyyy:HH:MM:SS +zzzz -> t[1]=dd t[2]=Mon t[3]=yyyy t[4..6]=time
        if (split(substr($0, lb + 1, rb - lb - 1), t, "[/: ]") < 6) next
        if (!(t[2] in month)) next
        stamp = t[3] month[t[2]] t[1] t[4] t[5] t[6]

        # The status is the first word after the closing quote of the
        # request; the request itself may contain spaces, so fixed
        # whitespace field numbers cannot be used.
        if (split($0, q, "\"") < 3) next
        split(q[3], rest, " ")

        if (stamp >= cutoff && rest[1] ~ /^5[0-9][0-9]$/) {
            count++
        }
    }
    END { print count + 0 }' "$LOG_FILE")

    if [ "${error_count:-0}" -gt "$ALERT_THRESHOLD" ]; then
        # The grep anchors on the quote that ends the request so that a
        # body size such as 512 is not mistaken for a 5xx status.
        local message="WARNING: Upstream health check failures detected!

Failures in last 5 minutes: $error_count
Time: $(date)
Log file: $LOG_FILE

Recent errors:
$(tail -20 "$LOG_FILE" | grep -E '" 5[0-9][0-9] ')"

        echo "$message" | mail -s "Nginx Upstream Health Alert" "$ALERT_EMAIL"

        # Record that an alert was sent.
        echo "$(date): Health check alert sent - $error_count failures" >> "$alert_log"
    fi
}

# Probes the /health endpoint of every upstream server directly (3 second
# limit each) and mails a CRITICAL alert listing the servers that failed.
check_server_availability() {
    local servers=(
        "192.168.1.10:8080"
        "192.168.1.11:8080"
        "192.168.1.12:8080"
    )
    local failed_servers=()

    # Collect every server whose probe does not succeed.
    for server in "${servers[@]}"; do
        curl -f -s --max-time 3 "http://$server/health" > /dev/null \
            || failed_servers+=("$server")
    done

    # Nothing failed: no alert.
    if [ ${#failed_servers[@]} -eq 0 ]; then
        return 0
    fi

    local message="CRITICAL: Upstream servers are down!

Failed servers: ${failed_servers[*]}
Time: $(date)
Total failed: ${#failed_servers[@]}/${#servers[@]}"

    printf '%s\n' "$message" | mail -s "CRITICAL: Nginx Upstream Servers Down" "$ALERT_EMAIL"
}

# Main loop: run both checks once a minute, forever.
while :; do
    check_upstream_health
    check_server_availability
    sleep 60
done

6. 故障恢复和自动化

6.1 自动故障恢复

upstream auto_recovery_backend {
    # A short fail_timeout puts a failed server back into rotation quickly.
    server 192.168.1.10:8080 fail_timeout=10s max_fails=2;
    server 192.168.1.11:8080 fail_timeout=10s max_fails=2;
    server 192.168.1.12:8080 fail_timeout=10s max_fails=2;

    # Backup server
    server 192.168.1.20:8080 backup;
}

server {
    server_name recovery.example.com;
    listen 80;

    location / {
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Aggressive retry policy: up to 2 tries within 5 seconds.
        proxy_next_upstream_timeout 5s;
        proxy_next_upstream_tries 2;
        proxy_next_upstream error timeout http_500 http_502 http_503;

        proxy_pass http://auto_recovery_backend;
    }
}

6.2 集成监控系统

# prometheus.yml - Prometheus scrape configuration
scrape_configs:
  # Scraped every 5 seconds from port 9113.
  - job_name: "nginx"
    scrape_interval: 5s
    static_configs:
      - targets:
          - "localhost:9113"

  # Scraped every 10 seconds from /metrics on port 8080.
  - job_name: "nginx-upstream"
    scrape_interval: 10s
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "localhost:8080"

# Nginx configuration - expose metrics
server {
    listen 8080;
    server_name localhost;

    # Prometheus-format metrics.
    # stub_status emits a short plain-text report that Prometheus cannot
    # parse, so this location forwards to nginx-prometheus-exporter instead.
    # The exporter is assumed to listen on 127.0.0.1:9113 (the port targeted
    # by the 'nginx' scrape job) and to read /nginx_status below.
    location /metrics {
        proxy_pass http://127.0.0.1:9113/metrics;
        access_log off;
        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny all;
    }

    # Raw nginx connection counters, consumed by the exporter.
    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny all;
    }
}

7. 性能优化

7.1 健康检查性能优化

# Upstream tuned for health-check performance
upstream optimized_backend {
    # Moderate passive-check parameters
    server 192.168.1.10:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.12:8080 fail_timeout=30s max_fails=3;

    # Connection pool tuning
    # NOTE(review): keepalive_requests / keepalive_timeout inside upstream {}
    # presumably need a reasonably recent nginx - confirm the minimum version.
    keepalive 16;
    keepalive_timeout 60s;
    keepalive_requests 100;
}

server {
    listen 80;
    server_name optimized.example.com;

    location / {
        proxy_pass http://optimized_backend;

        # Connection reuse: upstream keepalive needs HTTP/1.1 and an empty
        # Connection header.
        proxy_http_version 1.1;
        proxy_set_header Connection "";

        # Tuned timeouts
        proxy_connect_timeout 3s;
        proxy_send_timeout 10s;
        proxy_read_timeout 30s;

        # Bounded retry.
        # proxy_next_upstream_tries counts the first attempt too, so a value
        # of 1 would disable failover completely; 2 allows exactly one retry.
        # The time budget is kept above the 3s connect timeout so that a
        # connect timeout can still be retried on another server.
        proxy_next_upstream error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;

        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    # Dedicated health endpoint, reachable only through internal redirects.
    location /internal/health {
        internal;
        proxy_pass http://optimized_backend/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout 2s;
    }
}

8. 最佳实践

8.1 健康检查最佳实践

# Example production health-check configuration
upstream production_backend {
    # Weight and failure tolerance chosen per server class.
    server 192.168.1.10:8080 weight=3 fail_timeout=20s max_fails=2;  # high-performance server
    server 192.168.1.11:8080 weight=2 fail_timeout=30s max_fails=3;  # mid-range server
    server 192.168.1.12:8080 weight=1 fail_timeout=60s max_fails=5;  # low-end server

    # Local backup server
    server 127.0.0.1:8080 backup fail_timeout=10s max_fails=1;

    # Connection tuning
    keepalive 32;
    keepalive_timeout 60s;
    keepalive_requests 1000;
}

server {
    listen 80;
    server_name production.example.com;

    location / {
        proxy_pass http://production_backend;

        # Production failover policy: at most 2 attempts, started within 5s.
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
        proxy_next_upstream_tries 2;
        proxy_next_upstream_timeout 5s;

        # Timeouts.
        # The connect timeout is kept below proxy_next_upstream_timeout; if
        # both were 5s, a connect timeout would use up the whole retry budget
        # and the request would never reach a second server.
        proxy_connect_timeout 3s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;

        # Required headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Upstream details on every response, including errors.
        add_header X-Upstream-Server $upstream_addr always;
        add_header X-Upstream-Response-Time $upstream_response_time always;
    }
}

小结

通过本文的学习，你应该掌握：

- 被动和主动健康检查的区别和配置方法
- 不同场景下的健康检查策略选择
- 健康检查的监控和告警机制
- 故障恢复和自动化处理
- 健康检查的性能优化技巧
- 生产环境的最佳实践

下一篇文章将介绍Session保持与粘性会话的配置方法。

上游服务器健康检查

Nginx 上游服务器健康检查

Upstream Server Health Checks

概述

1. 健康检查基础概念

1.1 健康检查类型

1.2 健康状态管理

2. 被动健康检查

2.1 基本配置

2.2 详细失败条件配置

2.3 不同服务的健康检查策略

3. 主动健康检查 (Nginx Plus)

3.1 基本主动健康检查

3.2 高级主动健康检查

3.3 自定义健康检查端点

4. 开源Nginx的健康检查方案

4.1 使用第三方模块

4.2 外部健康检查脚本

5. 健康检查监控和告警

5.1 健康状态日志记录

5.2 健康检查告警脚本

6. 故障恢复和自动化

6.1 自动故障恢复

6.2 集成监控系统

7. 性能优化

7.1 健康检查性能优化

8. 最佳实践

8.1 健康检查最佳实践

小结

results matching ""

No results matching ""