Nginx 上游服务器健康检查
Upstream Server Health Checks
概述
健康检查是确保负载均衡系统稳定性的关键机制。Nginx 支持被动和主动两种健康检查方式:开源版内置的是被动健康检查(基于真实请求的失败统计),主动健康检查则需要 NGINX Plus 或第三方模块。两种方式都能自动检测后端服务器的状态,并将故障服务器暂时从负载均衡池中移除。本文将详细介绍各种健康检查的配置方法和最佳实践。
1. 健康检查基础概念
1.1 健康检查类型
健康检查类型:
├── 被动健康检查 (Passive Health Checks)
│ ├── 基于真实请求的错误监测
│ ├── 失败计数和超时检测
│ └── 自动故障转移
└── 主动健康检查 (Active Health Checks)
├── 定期发送探测请求
├── 自定义检查端点
└── 更快的故障检测
1.2 健康状态管理
服务器状态:
├── 健康 (Healthy) - 正常处理请求
├── 不健康 (Unhealthy) - 暂时移除
├── 备份 (Backup) - 仅在主服务器都故障时使用
└── 下线 (Down) - 管理员手动下线
2. 被动健康检查
2.1 基本配置
upstream backend {
    # Passive health-check tuning: after max_fails failed attempts within
    # fail_timeout, the server is skipped for the following fail_timeout period.
    server 192.168.1.10:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 weight=2 max_fails=5 fail_timeout=60s;

    # Receives traffic only when the primary servers are unavailable.
    server 192.168.1.13:8080 backup;
}
server {
    listen      80;
    server_name example.com;

    location / {
        # Pass the original host and client address on to the backend.
        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Failure conditions that trigger a retry on the next upstream server,
        # bounded to 3 attempts and 10 seconds overall.
        proxy_next_upstream         error timeout invalid_header http_500 http_502 http_503 http_504;
        proxy_next_upstream_tries   3;
        proxy_next_upstream_timeout 10s;

        proxy_pass http://backend;
    }
}
2.2 详细失败条件配置
upstream detailed_backend {
    # Each server carries its own failure tolerance and cool-down period.
    server 192.168.1.10:8080 max_fails=2 fail_timeout=20s;
    server 192.168.1.11:8080 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 max_fails=1 fail_timeout=10s;
}
server {
    listen      80;
    server_name detailed.example.com;

    location / {
        proxy_pass http://detailed_backend;

        # Conditions under which the request is passed to the next server.
        # Per the nginx documentation, error/timeout/invalid_header always count
        # as failed attempts for max_fails, the listed 5xx codes count because
        # they are named here, and http_403/http_404 never count as failures.
        proxy_next_upstream
            error            # connection error
            timeout          # connect/send/read timeout
            invalid_header   # empty or invalid response header
            http_500         # internal server error
            http_502         # bad gateway
            http_503         # service unavailable
            http_504         # gateway timeout
            http_403         # forbidden
            http_404;        # not found (optional)

        # Cap the failover work: at most 2 attempts within 5 seconds overall.
        proxy_next_upstream_tries   2;
        proxy_next_upstream_timeout 5s;

        # Timeouts. The connect timeout must stay below
        # proxy_next_upstream_timeout: if a connect timeout consumed the whole
        # 5s budget, nginx would give up instead of trying the next server.
        proxy_connect_timeout 2s;
        proxy_send_timeout    10s;
        proxy_read_timeout    10s;

        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}
2.3 不同服务的健康检查策略
# API servers - fail fast, recover fast:
# a single failure ejects a server for 10 seconds.
upstream api_backend {
    server 192.168.1.20:8080 fail_timeout=10s max_fails=1;
    server 192.168.1.21:8080 fail_timeout=10s max_fails=1;
    server 192.168.1.22:8080 fail_timeout=10s max_fails=1;
}
# File servers - tolerate more failures before ejecting a server:
# five failures within 60 seconds.
upstream file_backend {
    server 192.168.1.30:8080 fail_timeout=60s max_fails=5;
    server 192.168.1.31:8080 fail_timeout=60s max_fails=5;
}
# Database proxy - conservative policy: one failure ejects a server for 5 minutes,
# and the second server is only used as a backup.
# NOTE(review): port 3306 suggests MySQL, which is not an HTTP service; an
# upstream for it would normally live in a stream {} context - confirm intent.
upstream db_proxy {
    server 192.168.1.40:3306 fail_timeout=300s max_fails=1;
    server 192.168.1.41:3306 fail_timeout=300s max_fails=1 backup;
}
server {
    listen      80;
    server_name services.example.com;

    # API requests - short timeouts and quick failover.
    location /api/ {
        proxy_connect_timeout 2s;
        proxy_send_timeout    5s;
        proxy_read_timeout    10s;

        proxy_next_upstream         error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries   3;
        proxy_next_upstream_timeout 3s;

        proxy_pass http://api_backend;
    }

    # File downloads - long transfers are expected, so timeouts are generous.
    location /files/ {
        proxy_connect_timeout 5s;
        proxy_send_timeout    60s;
        proxy_read_timeout    300s;

        proxy_next_upstream         error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries   2;
        proxy_next_upstream_timeout 30s;

        proxy_pass http://file_backend;
    }
}
3. 主动健康检查 (Nginx Plus)
3.1 基本主动健康检查
upstream active_backend {
    # Shared memory zone holding the group's state; the health_check directive
    # requires the upstream group to have one.
    zone backend 64k;

    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}
server {
    listen      80;
    server_name active.example.com;

    location / {
        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Active health check (NGINX Plus): probe /health every 5 seconds.
        # 3 consecutive failures mark a server unhealthy, 2 consecutive passes
        # restore it; a probe passes when it satisfies the "server_ok" match block.
        health_check interval=5s fails=3 passes=2 uri=/health match=server_ok;

        proxy_pass http://active_backend;
    }
}
# Pass criteria for the health probe; every condition must hold.
match server_ok {
    # Exactly HTTP 200.
    status 200;
    # Content-Type must match the regex "application/json".
    header Content-Type ~ "application/json";
    # Body must contain "status":"ok".
    body ~ "\"status\":\"ok\"";
}
3.2 高级主动健康检查
# Health checks for different kinds of services.
# Web tier.
upstream web_servers {
    zone web_servers 64k;

    server 192.168.1.10:80;
    server 192.168.1.11:80;
}
# API tier.
upstream api_servers {
    zone api_servers 64k;

    server 192.168.1.20:8080;
    server 192.168.1.21:8080;
}
# Database tier.
# NOTE(review): these servers are probed with an HTTP health check elsewhere in
# this example; port 3306 is normally MySQL, which does not answer HTTP.
# Presumably an HTTP status agent is expected on this port - confirm.
upstream database_servers {
    zone database_servers 64k;

    server 192.168.1.30:3306;
    server 192.168.1.31:3306;
}
# Pass criteria for web servers; every condition must hold.
match web_ok {
    # Any status from 200 through 399.
    status 200-399;
    # Content-Type must match "text/html".
    header Content-Type ~ "text/html";
    # Body must NOT contain error / Error / ERROR.
    body !~ "error|Error|ERROR";
}
# Pass criteria for API servers; every condition must hold.
match api_ok {
    status 200;
    # Exact comparison (=), not a regex: the header value must be precisely
    # "application/json".
    header Content-Type = "application/json";
    # Body must report a healthy status and carry a numeric timestamp.
    body ~ "\"status\":\"healthy\"";
    body ~ "\"timestamp\":[0-9]+";
}
# Pass criteria for the database status probe.
match db_ok {
    status 200;
    # Body must contain "database" followed later by "ok".
    body ~ "database.*ok";
}
server {
    listen      80;
    server_name comprehensive.example.com;

    # Web tier: probe "/" every 10s; 2 failures eject, 1 pass restores.
    location / {
        health_check interval=10s fails=2 passes=1 uri=/ match=web_ok;
        proxy_pass http://web_servers;
    }

    # API tier: probe /health every 3s; 1 failure ejects, 2 passes restore.
    location /api/ {
        health_check interval=3s fails=1 passes=2 uri=/health match=api_ok;
        proxy_pass http://api_servers;
    }

    # Database status: probe /ping every 30s; 1 failure ejects, 1 pass restores.
    location /db-status {
        # Internal network only. Order matters: allow is evaluated before deny.
        allow 192.168.1.0/24;
        deny  all;

        health_check interval=30s fails=1 passes=1 uri=/ping match=db_ok;
        proxy_pass http://database_servers;
    }
}
3.3 自定义健康检查端点
# Dedicated server that exposes health-check endpoints on port 8081.
server {
    listen      8081;
    server_name localhost;

    # Simple liveness endpoint answered by nginx itself.
    location /health {
        access_log   off;
        # A `return` body takes its Content-Type from default_type. Using
        # add_header here would emit a second Content-Type header rather than
        # replace the default one.
        default_type application/json;
        return 200 '{"status":"healthy","timestamp":$msec,"server":"$hostname"}';
    }

    # Detailed endpoint: delegates to the application's own dependency check.
    location /health/detailed {
        access_log off;

        proxy_pass            http://127.0.0.1:3000/internal/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout    2s;

        # When nginx cannot get an answer from the backend, serve the JSON
        # error from @unhealthy instead of the stock error page.
        error_page 502 503 504 = @unhealthy;
    }

    location @unhealthy {
        # default_type also applies to error statuses; add_header without
        # "always" is skipped for a 503, so the JSON body would otherwise be
        # sent with the wrong Content-Type.
        default_type application/json;
        return 503 '{"status":"unhealthy","error":"backend unavailable"}';
    }

    # Database health endpoint: proxies to an external database check script.
    location /health/database {
        access_log off;

        proxy_pass            http://127.0.0.1:3001/db-check;
        proxy_connect_timeout 2s;
        proxy_read_timeout    5s;
    }
}
4. 开源Nginx的健康检查方案
4.1 使用第三方模块
# Third-party nginx-upstream-fair module.
upstream fair_backend {
    # Switch the group to the "fair" balancing algorithm.
    # NOTE(review): the original comment says this includes health checking;
    # the module is primarily a load-aware balancer - verify before relying
    # on it for failure detection.
    fair;

    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;
}
# Active checks for open-source nginx. The check* directives below are provided
# by the third-party nginx_upstream_check_module (also bundled with Tengine).
upstream healthcheck_backend {
    server 192.168.1.10:8080;
    server 192.168.1.11:8080;
    server 192.168.1.12:8080;

    # Probe every 3000 ms with a 1000 ms timeout; 2 consecutive successes mark
    # a server up, 5 consecutive failures mark it down.
    check interval=3000 rise=2 fall=5 timeout=1000 type=http;
    # Raw request sent as the probe.
    check_http_send "HEAD /health HTTP/1.0\r\n\r\n";
    # Response classes treated as healthy.
    check_http_expect_alive http_2xx http_3xx;
}
server {
    listen      80;
    server_name healthcheck.example.com;

    location / {
        proxy_pass http://healthcheck_backend;
    }

    # Status page of the upstream check module, restricted to the internal network.
    location /status {
        access_log off;

        allow 192.168.1.0/24;
        deny  all;

        check_status;
    }
}
4.2 外部健康检查脚本
#!/bin/bash
# nginx-health-monitor.sh
# Probes each upstream server over HTTP and regenerates the nginx upstream
# definition file from the servers that respond.

# host:port pairs that are probed on every cycle.
UPSTREAM_SERVERS=(
"192.168.1.10:8080"
"192.168.1.11:8080"
"192.168.1.12:8080"
)
# Path appended to each server address to build the probe URL.
HEALTH_CHECK_URL="/health"
# Live upstream definition file; replaced when the generated config differs.
NGINX_UPSTREAM_CONF="/etc/nginx/conf.d/upstream.conf"
# Scratch file the new definition is written to before it replaces the live one.
TEMP_CONF="/tmp/upstream.conf.tmp"
# Probe one upstream server.
#   $1 - server address as host:port
# Returns 0 when GET http://<server>$HEALTH_CHECK_URL succeeds within 5 seconds
# (curl -f treats HTTP error statuses as failures), 1 otherwise.
check_server_health() {
    local probe_url="http://${1}${HEALTH_CHECK_URL}"

    curl -f -s --max-time 5 "$probe_url" > /dev/null 2>&1 || return 1   # unhealthy
    return 0                                                            # healthy
}
# Rebuild the "backend" upstream definition from the servers that currently
# pass check_server_health, and reload nginx when the result differs from the
# live file.
#
# Globals read: UPSTREAM_SERVERS, NGINX_UPSTREAM_CONF, TEMP_CONF.
# Returns: 0 when the config is unchanged or was validated and reloaded,
#          1 when no server is healthy or nginx rejected the new config
#            (the previous file is restored in that case).
update_upstream_config() {
    local server
    local healthy_count=0
    local backup_conf="${NGINX_UPSTREAM_CONF}.bak"

    echo "upstream backend {" > "$TEMP_CONF"
    for server in "${UPSTREAM_SERVERS[@]}"; do
        if check_server_health "$server"; then
            echo " server $server;" >> "$TEMP_CONF"
            healthy_count=$((healthy_count + 1))
            echo "Server $server is healthy"
        else
            echo " # server $server; # unhealthy" >> "$TEMP_CONF"
            echo "Server $server is unhealthy - removed from pool"
        fi
    done
    echo "}" >> "$TEMP_CONF"

    # An upstream block with no server lines makes nginx reject the whole
    # configuration, so keep the current pool rather than writing an empty one.
    if [ "$healthy_count" -eq 0 ]; then
        echo "No healthy upstream servers - keeping current configuration" >&2
        rm -f "$TEMP_CONF"
        return 1
    fi

    # Nothing changed: discard the scratch file and skip the reload.
    if cmp -s "$NGINX_UPSTREAM_CONF" "$TEMP_CONF"; then
        rm -f "$TEMP_CONF"
        return 0
    fi

    # Keep a copy of the live file so a rejected config can be rolled back.
    if [ -f "$NGINX_UPSTREAM_CONF" ]; then
        cp -p "$NGINX_UPSTREAM_CONF" "$backup_conf"
    else
        rm -f "$backup_conf"
    fi
    mv "$TEMP_CONF" "$NGINX_UPSTREAM_CONF"

    # Report success only when both the syntax test and the reload succeeded.
    if nginx -t && nginx -s reload; then
        echo "Nginx configuration updated and reloaded"
        return 0
    fi

    echo "Nginx rejected the new upstream configuration - rolling back" >&2
    if [ -f "$backup_conf" ]; then
        mv "$backup_conf" "$NGINX_UPSTREAM_CONF"
    else
        rm -f "$NGINX_UPSTREAM_CONF"
    fi
    return 1
}
# Main loop: regenerate the upstream config every 30 seconds, forever.
while :; do
    update_upstream_config
    sleep 30
done
5. 健康检查监控和告警
5.1 健康状态日志记录
upstream monitored_backend {
    # Two failures within 30 seconds take a server out for 30 seconds.
    server 192.168.1.10:8080 fail_timeout=30s max_fails=2;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=2;
    server 192.168.1.12:8080 fail_timeout=30s max_fails=2;
}
# Log format for upstream health analysis. Fields, in order: client address,
# local time in brackets, quoted request line, response status, body bytes
# sent, quoted upstream address, quoted upstream status, upstream response
# time, quoted upstream cache status.
log_format health_check '$remote_addr - [$time_local] "$request" '
'$status $body_bytes_sent '
'"$upstream_addr" "$upstream_status" '
'$upstream_response_time "$upstream_cache_status"';
server {
    listen      80;
    server_name monitored.example.com;

    # Record every request using the health_check log format.
    access_log /var/log/nginx/health_check.log health_check;

    location / {
        proxy_pass http://monitored_backend;

        # Diagnostic response headers. "always" is required: without it
        # add_header is skipped for 5xx responses, which are exactly the
        # responses these headers are meant to help diagnose.
        # NOTE(review): these headers reveal backend addresses to clients;
        # consider limiting them to internal traffic.
        add_header X-Upstream-Server        $upstream_addr          always;
        add_header X-Upstream-Status        $upstream_status        always;
        add_header X-Upstream-Response-Time $upstream_response_time always;

        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
}
5.2 健康检查告警脚本
#!/bin/bash
# health-check-alert.sh
# Scans the nginx health-check access log for 5xx responses, probes the
# upstream servers directly, and e-mails alerts.

# Access log that is scanned for 5xx responses.
LOG_FILE="/var/log/nginx/health_check.log"
# Recipient of the alert e-mails.
ALERT_EMAIL="admin@example.com"
ALERT_THRESHOLD=5 # alert when more than this many failures occur within 5 minutes
# Count the 5xx responses logged during the last five minutes and e-mail an
# alert when the count exceeds ALERT_THRESHOLD.
#
# Expects log lines of the form
#   <addr> - [dd/Mon/yyyy:HH:MM:SS +zzzz] "<request>" <status> <bytes> ...
# Lines are split on double quotes, so the status is located correctly
# regardless of how many words the request line contains.
#
# Globals read: LOG_FILE, ALERT_THRESHOLD, ALERT_EMAIL.
# Returns 0 when the log cannot be read or no alert is needed.
check_upstream_health() {
    local current_time five_minutes_ago error_count message

    if [ ! -r "$LOG_FILE" ]; then
        echo "$(date): cannot read $LOG_FILE - skipping 5xx check" >&2
        return 0
    fi

    current_time=$(date +%s)
    five_minutes_ago=$((current_time - 300))

    # With FS set to a double quote: $1 = addr - [time] , $2 = request line,
    # $3 = status and byte count.
    error_count=$(awk -F'"' -v start_time="$five_minutes_ago" '
    {
        # Cheap test first: only 5xx lines need their timestamp converted.
        if (split($3, tail, " ") < 1 || tail[1] !~ /^5[0-9][0-9]$/) next

        lb = index($1, "[")
        rb = index($1, "]")
        if (lb == 0 || rb <= lb) next
        stamp = substr($1, lb + 1, rb - lb - 1)

        # Accept only the exact time_local shape before handing it to a shell command.
        if (stamp !~ /^[0-9][0-9]\/[A-Za-z][A-Za-z][A-Za-z]\/[0-9][0-9][0-9][0-9]:[0-9][0-9]:[0-9][0-9]:[0-9][0-9] [+-][0-9][0-9][0-9][0-9]$/) next

        # Convert each distinct second once; many lines share a timestamp.
        if (!(stamp in epoch_of)) {
            readable = stamp
            sub(/:/, " ", readable)     # first colon separates date from time
            gsub(/\//, " ", readable)   # 10/Oct/2023 -> 10 Oct 2023, which date -d accepts
            cmd = "date -d \"" readable "\" +%s 2>/dev/null"
            epoch_of[stamp] = ((cmd | getline parsed) > 0) ? parsed + 0 : 0
            close(cmd)
        }
        if (epoch_of[stamp] >= start_time) count++
    }
    END { print count + 0 }' "$LOG_FILE")

    if [ "${error_count:-0}" -gt "$ALERT_THRESHOLD" ]; then
        # The leading quote in the grep pattern anchors the match to the status
        # field (right after the quoted request) so byte counts such as 512 are
        # not reported as errors.
        message="WARNING: Upstream health check failures detected!
Failures in last 5 minutes: $error_count
Time: $(date)
Log file: $LOG_FILE
Recent errors:
$(tail -20 "$LOG_FILE" | grep -E '" 5[0-9][0-9] ')"
        echo "$message" | mail -s "Nginx Upstream Health Alert" "$ALERT_EMAIL"
        # Keep a local record of every alert sent.
        echo "$(date): Health check alert sent - $error_count failures" >> /var/log/nginx/health_alerts.log
    fi
}
# Probe every upstream server directly and send a CRITICAL e-mail listing the
# ones whose /health endpoint does not answer successfully within 3 seconds.
check_server_availability() {
    local targets=(
        "192.168.1.10:8080"
        "192.168.1.11:8080"
        "192.168.1.12:8080"
    )
    local down=()

    for server in "${targets[@]}"; do
        curl -f -s --max-time 3 "http://$server/health" > /dev/null || down+=("$server")
    done

    # All servers answered: nothing to report.
    (( ${#down[@]} > 0 )) || return 0

    local message="CRITICAL: Upstream servers are down!
Failed servers: ${down[*]}
Time: $(date)
Total failed: ${#down[@]}/${#targets[@]}"
    printf '%s\n' "$message" | mail -s "CRITICAL: Nginx Upstream Servers Down" "$ALERT_EMAIL"
}
# Main loop: run both checks once a minute, forever.
while :; do
    check_upstream_health
    check_server_availability
    sleep 60
done
6. 故障恢复和自动化
6.1 自动故障恢复
upstream auto_recovery_backend {
    # A short fail_timeout means an ejected server is retried after only 10 seconds.
    server 192.168.1.10:8080 fail_timeout=10s max_fails=2;
    server 192.168.1.11:8080 fail_timeout=10s max_fails=2;
    server 192.168.1.12:8080 fail_timeout=10s max_fails=2;

    # Backup server.
    server 192.168.1.20:8080 backup;
}
server {
    listen      80;
    server_name recovery.example.com;

    location / {
        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Retry policy: at most 2 attempts within 5 seconds.
        proxy_next_upstream         error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries   2;
        proxy_next_upstream_timeout 5s;

        proxy_pass http://auto_recovery_backend;
    }
}
6.2 集成监控系统
# prometheus.yml - Prometheus scrape configuration.
# Nesting matters in YAML: every key of a job must be indented under its
# "- job_name" list item.
scrape_configs:
  - job_name: 'nginx'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9113']

  # The target must serve the Prometheus text exposition format at metrics_path.
  - job_name: 'nginx-upstream'
    metrics_path: '/metrics'
    scrape_interval: 10s
    static_configs:
      - targets: ['localhost:8080']
# Nginx side - expose status and metrics on port 8080.
server {
    listen      8080;
    server_name localhost;

    # Raw stub_status counters; this is the page nginx-prometheus-exporter reads.
    location /nginx_status {
        stub_status on;
        access_log  off;

        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny  all;
    }

    # stub_status output is not in the Prometheus exposition format, so serving
    # it at /metrics would make every scrape fail. Proxy /metrics to the
    # exporter (listening on 9113) instead.
    location /metrics {
        proxy_pass http://127.0.0.1:9113/metrics;
        access_log off;

        allow 127.0.0.1;
        allow 192.168.1.0/24;
        deny  all;
    }
}
7. 性能优化
7.1 健康检查性能优化
# Upstream tuned for health-check performance.
upstream optimized_backend {
    # Moderate passive-check parameters.
    server 192.168.1.10:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.11:8080 fail_timeout=30s max_fails=3;
    server 192.168.1.12:8080 fail_timeout=30s max_fails=3;

    # Connection pool: idle keepalive connections to the upstream servers,
    # with limits on requests per connection and idle time.
    keepalive          16;
    keepalive_requests 100;
    keepalive_timeout  60s;
}
server {
    listen      80;
    server_name optimized.example.com;

    location / {
        proxy_pass http://optimized_backend;

        # Connection reuse: upstream keepalive needs HTTP/1.1 and an empty
        # Connection header.
        proxy_http_version 1.1;
        proxy_set_header   Connection "";

        # Timeouts. The connect timeout is kept below
        # proxy_next_upstream_timeout so that a connect timeout still leaves
        # time to try another server.
        proxy_connect_timeout 2s;
        proxy_send_timeout    10s;
        proxy_read_timeout    30s;

        # Retry policy. proxy_next_upstream_tries counts every attempt,
        # including the first one, so a value of 1 would disable failover
        # completely; 2 allows exactly one retry.
        proxy_next_upstream         error timeout http_500 http_502 http_503;
        proxy_next_upstream_tries   2;
        proxy_next_upstream_timeout 3s;

        proxy_set_header Host            $host;
        proxy_set_header X-Real-IP       $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }

    # Health endpoint reachable only through internal redirects/subrequests.
    location /internal/health {
        internal;
        proxy_pass            http://optimized_backend/health;
        proxy_connect_timeout 1s;
        proxy_read_timeout    2s;
    }
}
8. 最佳实践
8.1 健康检查最佳实践
# Example production upstream with passive health checks.
upstream production_backend {
    # Weights and failure tolerance are set per server according to capacity.
    server 192.168.1.10:8080 weight=3 fail_timeout=20s max_fails=2;   # high-capacity server
    server 192.168.1.11:8080 weight=2 fail_timeout=30s max_fails=3;   # mid-capacity server
    server 192.168.1.12:8080 weight=1 fail_timeout=60s max_fails=5;   # low-capacity server

    # Local backup server.
    server 127.0.0.1:8080 backup fail_timeout=10s max_fails=1;

    # Connection reuse towards the upstream servers.
    keepalive          32;
    keepalive_requests 1000;
    keepalive_timeout  60s;
}
server {
    listen      80;
    server_name production.example.com;

    location / {
        proxy_pass http://production_backend;

        # Production failover policy: at most 2 attempts within 5 seconds.
        proxy_next_upstream         error timeout invalid_header http_500 http_502 http_503 http_504;
        proxy_next_upstream_tries   2;
        proxy_next_upstream_timeout 5s;

        # Timeouts. The connect timeout is kept below
        # proxy_next_upstream_timeout: a connect timeout equal to the failover
        # budget would exhaust it and prevent the second attempt.
        proxy_connect_timeout 2s;
        proxy_send_timeout    60s;
        proxy_read_timeout    60s;

        # Headers the backend needs to see the original request context.
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Diagnostic headers; "always" keeps them on error responses too.
        add_header X-Upstream-Server        $upstream_addr          always;
        add_header X-Upstream-Response-Time $upstream_response_time always;
    }
}
小结
通过本文的学习,你应该掌握:
- 被动和主动健康检查的区别和配置方法
- 不同场景下的健康检查策略选择
- 健康检查的监控和告警机制
- 故障恢复和自动化处理
- 健康检查的性能优化技巧
- 生产环境的最佳实践
下一篇文章将介绍Session保持与粘性会话的配置方法。