Nginx 故障排除与调试技巧

Troubleshooting and Debugging Tips

概述

在生产环境中,快速定位和解决Nginx问题至关重要。本文将提供全面的故障排除指南,包括常见问题诊断、性能调优、日志分析和监控工具使用。

1. 常见问题诊断

1.1 服务启动问题

# Check the state of the nginx service
systemctl status nginx

# Validate configuration syntax
nginx -t
nginx -T  # also dump the complete configuration

# Find out what is bound to port 80
# NOTE: the pattern ":80" also matches ports such as :8080 or :8000
netstat -tulpn | grep :80
ss -tulpn | grep :80
lsof -i :80

# Inspect firewall rules
ufw status  # Ubuntu
firewall-cmd --list-all  # CentOS
iptables -L -n

# Inspect SELinux (CentOS/RHEL)
sestatus
getsebool -a | grep httpd
# NOTE: unlike the read-only checks above, this command changes system state
# (it sets the httpd_can_network_connect boolean; -P makes it persistent)
setsebool -P httpd_can_network_connect 1

1.2 配置文件问题

# Validate the syntax of a specific configuration file
nginx -t -c /etc/nginx/nginx.conf

# List the include directives found in the full configuration dump
nginx -T | grep "include"

# Locate the configuration path compiled into the binary
# (the 2>&1 is needed because the build information is not written to stdout)
nginx -V 2>&1 | grep -o '\-\-conf-path=\S*'

# Check configuration file permissions
ls -la /etc/nginx/nginx.conf
ls -la /etc/nginx/conf.d/

# Back up the configuration (first command) and restore it from the backup (second command)
cp /etc/nginx/nginx.conf /etc/nginx/nginx.conf.backup
cp /etc/nginx/nginx.conf.backup /etc/nginx/nginx.conf

1.3 权限问题诊断

# Show which user the nginx processes run as (first column).
# ps -C selects by exact process name, so the listing contains neither the
# grep helper itself nor unrelated commands that merely mention "nginx".
ps -o user,pid,ppid,args -C nginx

# Inspect current ownership and modes
ls -la /var/www/html/
ls -la /var/log/nginx/

# Set correct ownership and permissions.
# NOTE: the account is "nginx" on RHEL/CentOS packages; Debian/Ubuntu packages
# normally use "www-data" - use the user reported by the ps command above.
chown -R nginx:nginx /var/www/html/
# Directories need the execute bit to be traversable, regular files do not:
# a blanket `chmod -R 755` would mark every static file as executable.
find /var/www/html/ -type d -exec chmod 755 {} +
find /var/www/html/ -type f -exec chmod 644 {} +
chown -R nginx:nginx /var/log/nginx/

# Inspect the document root itself
stat /var/www/html/

# Check SELinux contexts and reset them to the policy defaults
ls -Z /var/www/html/
restorecon -R /var/www/html/

2. 日志分析技巧

2.1 错误日志分析

# Follow the error log in real time
tail -f /var/log/nginx/error.log

# Show entries from the current hour.
# -F treats the date as a literal string rather than a regular expression.
grep -F "$(date +'%Y/%m/%d %H')" /var/log/nginx/error.log

# Count entries per third field (the severity tag such as [error] or [warn]
# in the default error log layout)
awk '{print $3}' /var/log/nginx/error.log | sort | uniq -c | sort -nr

# Search for specific errors
grep "404" /var/log/nginx/error.log
grep "upstream" /var/log/nginx/error.log
# nginx logs the operating system message "Permission denied" with a capital P,
# so the search has to be case-insensitive to match anything.
grep -i "permission denied" /var/log/nginx/error.log

2.2 访问日志分析

# Access log analysis script
#!/bin/bash
# nginx-log-analyzer.sh
#
# Usage: nginx-log-analyzer.sh [ACCESS_LOG]
#
# Prints a summary report for an nginx access log. ACCESS_LOG defaults to
# /var/log/nginx/access.log. The field positions used below ($1 client IP,
# $7 request path, $9 status code, 6th double-quote-delimited field = user
# agent) assume the default "combined" log format.

LOG_FILE=${1:-/var/log/nginx/access.log}

# Fail once with a clear message instead of once per report section.
if [ ! -r "$LOG_FILE" ]; then
    echo "错误: 无法读取日志文件: $LOG_FILE" >&2
    exit 1
fi

echo "=== Nginx访问日志分析报告 ==="
echo "文件: $LOG_FILE"
echo "时间: $(date)"
echo

# Total number of requests
echo "总请求数: $(wc -l < "$LOG_FILE")"

# Number of distinct client IPs
echo "独立IP数: $(awk '{print $1}' "$LOG_FILE" | sort -u | wc -l)"

# Status code distribution
echo -e "\n=== 状态码分布 ==="
awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -nr

# Most active client IPs
echo -e "\n=== 访问最多的IP(前10) ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -n 10

# Most requested paths
echo -e "\n=== 访问最多的页面(前10) ==="
awk '{print $7}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -n 10

# Paths that returned 404
echo -e "\n=== 404错误页面 ==="
awk '$9==404 {print $7}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -n 10

# Slow requests (response time > 1 second).
# This only works when the log format ends with $request_time. The numeric
# guard keeps lines whose last field is text (e.g. the tail of the user agent
# in the plain "combined" format) from being string-compared and reported.
echo -e "\n=== 慢请求分析 ==="
awk '$NF ~ /^[0-9]+(\.[0-9]+)?$/ && $NF > 1 {print $7, $NF}' "$LOG_FILE" \
    | sort -k2 -nr | head -n 10

# User agent statistics
echo -e "\n=== 浏览器统计 ==="
awk -F'"' '{print $6}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -n 10

2.3 实时监控脚本

#!/bin/bash
# nginx-realtime-monitor.sh
#
# Redraws a small dashboard every INTERVAL seconds: requests seen during the
# last minute, the latest error log lines, established connections on port 80
# and the combined CPU/memory share of the nginx processes.

INTERVAL=5
ACCESS_LOG="/var/log/nginx/access.log"
ERROR_LOG="/var/log/nginx/error.log"

while true; do
    clear
    echo "=== Nginx实时监控 $(date) ==="

    # Requests during the last minute (within the newest 1000 log lines).
    # Access log timestamps look like "[29/Aug/2025:15:40:15", which
    # `date -d` cannot parse, so each one is converted inside awk into a
    # sortable YYYYMMDDHHMMSS number and compared with a cutoff computed once
    # per refresh. This also avoids forking `date` for every log line.
    # Assumes the log uses the same time zone as this shell.
    cutoff=$(date -d '60 seconds ago' +%Y%m%d%H%M%S)
    recent_requests=$(tail -n 1000 "$ACCESS_LOG" | awk -v cutoff="$cutoff" '
        BEGIN {
            split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
            for (i = 1; i <= 12; i++) month[names[i]] = sprintf("%02d", i)
            count = 0
        }
        {
            stamp = $4
            gsub(/\[|\]/, "", stamp)
            # day / month-name / year : hour : minute : second
            if (split(stamp, part, /[\/:]/) == 6 && (part[2] in month)) {
                key = part[3] month[part[2]] part[1] part[4] part[5] part[6]
                if (key + 0 > cutoff + 0) count++
            }
        }
        END { print count }')

    echo "最近1分钟请求数: $recent_requests"

    # Most recent error log entries
    echo -e "\n最近错误:"
    tail -n 3 "$ERROR_LOG"

    # Established connections whose local port is exactly 80. A plain
    # `grep :80` would also count listening sockets and ports such as :8080.
    connections=$(ss -Htn state established '( sport = :80 )' | wc -l)
    echo -e "\n当前连接数: $connections"

    # CPU and memory share summed over all nginx processes. ps -C matches the
    # process name exactly, so this script (whose file name contains "nginx")
    # is not counted.
    ps -C nginx -o pcpu=,pmem= | \
    awk '{cpu+=$1; mem+=$2} END{printf "CPU: %.1f%%, Memory: %.1f%%\n", cpu, mem}'

    sleep "$INTERVAL"
done

3. 性能问题诊断

3.1 响应时间分析

# Measure response timing; the -w "@file" form takes the output format from curl-format.txt
# (the body is discarded via -o /dev/null and -s hides the progress meter)
curl -w "@curl-format.txt" -o /dev/null -s http://example.com/

# Contents of curl-format.txt (without the leading "# " shown here):
# time_namelookup:  %{time_namelookup}\n
# time_connect:     %{time_connect}\n
# time_appconnect:  %{time_appconnect}\n
# time_pretransfer: %{time_pretransfer}\n
# time_redirect:    %{time_redirect}\n
# time_starttransfer: %{time_starttransfer}\n
# time_total:       %{time_total}\n

# Batch load test script
#!/bin/bash
# performance-test.sh
#
# Usage: performance-test.sh [URL] [REQUESTS] [CONCURRENCY]
# Defaults: http://localhost, 100 requests, concurrency 10.

URL=${1:-"http://localhost"}
REQUESTS=${2:-100}
CONCURRENCY=${3:-10}

# Print URL $1 with a trailing "/" appended when it consists only of
# scheme://host[:port]. ab refuses such URLs ("invalid URL"), and that
# includes this script's own default. URLs that already have a path, a query
# string or a fragment are printed unchanged.
ensure_url_path() {
    case "$1" in
        *://*/*)     printf '%s\n' "$1" ;;
        *://*[?#]*)  printf '%s\n' "$1" ;;
        *://*)       printf '%s/\n' "$1" ;;
        *)           printf '%s\n' "$1" ;;
    esac
}

TARGET_URL=$(ensure_url_path "$URL")

echo "性能测试: $URL"
echo "请求数: $REQUESTS, 并发数: $CONCURRENCY"

# Load test with ab (ApacheBench)
if command -v ab > /dev/null 2>&1; then
    ab -n "$REQUESTS" -c "$CONCURRENCY" "$TARGET_URL"
else
    echo "未找到 ab, 跳过 ab 测试" >&2
fi

# Load test with wrk (when installed)
if command -v wrk > /dev/null 2>&1; then
    # wrk needs at least as many connections as threads
    threads=4
    if [ "$CONCURRENCY" -lt "$threads" ]; then
        threads=$CONCURRENCY
    fi
    echo -e "\n=== wrk测试结果 ==="
    wrk -t"$threads" -c"$CONCURRENCY" -d30s "$TARGET_URL"
fi

3.2 内存和CPU监控

# Monitor resource usage of the nginx processes
#!/bin/bash
# nginx-resource-monitor.sh
#
# Every 5 seconds prints per-process CPU/memory figures for nginx followed by
# system load, memory and socket summaries. Runs until interrupted.

while true; do
    echo "=== $(date) ==="

    # Per-process figures. ps -C selects by exact process name; a
    # `ps aux | grep nginx` pipeline would also list this script (its file
    # name contains "nginx") and any command with "nginx" in its arguments.
    # The trailing "=" on each column suppresses the header line.
    ps -C nginx -o pid=,pcpu=,pmem=,rss= | \
    awk '{printf "PID: %s, CPU: %s%%, MEM: %s%%, RSS: %sKB\n", $1, $2, $3, $4}'

    # System load
    uptime

    # Memory usage
    free -h

    # Socket statistics
    ss -s

    echo "=========================="
    sleep 5
done

3.3 连接池调优诊断

# Count sockets matching ":80", grouped by the first column of the ss output (the TCP state)
ss -ant | grep :80 | awk '{print $1}' | sort | uniq -c

# Count connections in TIME_WAIT
ss -ant | grep TIME_WAIT | wc -l

# Check keep-alive behaviour (-v prints the request and response headers)
curl -H "Connection: keep-alive" -v http://localhost/

# Concurrent connection test
#!/bin/bash
# connection-test.sh
#
# Usage: connection-test.sh [URL] [MAX_CONNECTIONS]
# Starts MAX_CONNECTIONS background requests against URL in batches of 100
# (one batch per second) and waits for all of them to finish.

URL=${1:-"http://localhost"}
MAX_CONNECTIONS=${2:-1000}

for ((i = 1; i <= MAX_CONNECTIONS; i++)); do
    # --max-time bounds every request so that a single hung connection cannot
    # keep the final `wait` blocked forever.
    curl -s --max-time 30 "$URL" > /dev/null &
    if (( i % 100 == 0 )); then
        echo "已启动 $i 个连接"
        sleep 1
    fi
done

wait
echo "所有连接完成"

4. 调试工具和技巧

4.1 启用调试模式

# Debug configuration
# NOTE: the "debug" log level and debug_connection only produce output when
# the nginx binary was built with --with-debug (check with `nginx -V`).
error_log /var/log/nginx/debug.log debug;

events {
    debug_connection 192.168.1.100;  # debug logging for this client IP only
}

http {
    server {
        listen 80;
        server_name debug.example.com;

        # Debug headers added to responses of this server
        add_header X-Debug-Time $msec;
        add_header X-Debug-Connection $connection;
        add_header X-Debug-Requests $connection_requests;

        location /debug {
            # Set the response type with default_type rather than
            # "add_header Content-Type": add_header would emit a second
            # Content-Type header, and any add_header at this level stops the
            # X-Debug-* headers above from being inherited by this location.
            default_type text/plain;

            # Return the debug information
            return 200 "Debug Info:
Server: $hostname
Time: $time_local
Request: $request
Args: $args
Remote: $remote_addr
Forwarded: $http_x_forwarded_for
";
        }
    }
}

4.2 使用strace诊断

# Trace nginx system calls.
# Attach to every nginx process: `pgrep nginx | head -1` selects only the
# first PID (normally the master), while client requests are handled by the
# already-running workers, which -f does not pick up after the fact.
# strace accepts a whitespace-separated PID list in a single -p argument.
strace -f -p "$(pidof nginx)" -e trace=network,file

# Trace a specific problem (file opens) into a log file
strace -f -p "$(pidof nginx)" -e trace=open,openat -o nginx-trace.log

# Analyse file access: opens that did not fail with "no such file"
grep "openat" nginx-trace.log | grep -v ENOENT

4.3 使用tcpdump抓包

# Capture HTTP traffic on all interfaces (-n: no name resolution, -A: print packet payload as ASCII)
tcpdump -i any -n -A port 80

# Write the capture to a file instead of printing it
tcpdump -i any -n -w nginx-traffic.pcap port 80

# Only traffic to or from one IP on port 80
tcpdump -i any -n host 192.168.1.100 and port 80

5. 内存泄漏检测

5.1 内存使用监控

#!/bin/bash
# memory-leak-detector.sh
#
# Every INTERVAL seconds appends "timestamp,total_rss_kb" for the nginx
# processes to LOG_FILE and warns when each of the last ten samples is larger
# than the one before it. Runs until interrupted.

LOG_FILE="/var/log/nginx-memory.log"
INTERVAL=60

while true; do
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Sum of resident set sizes (KB) over all nginx processes.
    # ps -C matches the process name exactly; `ps aux | grep nginx` would also
    # add unrelated processes such as `tail -f /var/log/nginx/error.log`.
    # "sum+0" logs 0 instead of an empty field when nginx is not running.
    nginx_memory=$(ps -C nginx -o rss= | awk '{sum+=$1} END{print sum+0}')

    # Record the sample
    echo "$timestamp,$nginx_memory" >> "$LOG_FILE"

    # Check for growth: ten samples give nine comparisons, so trend>8 means
    # memory increased on every single step.
    tail -n 10 "$LOG_FILE" | awk -F',' '
    BEGIN{prev=0; trend=0}
    {
        if(NR>1 && $2>prev) trend++;
        prev=$2
    }
    END{
        if(trend>8) print "WARNING: Memory increasing trend detected"
    }'

    sleep "$INTERVAL"
done

5.2 使用Valgrind检测

# Run nginx under Valgrind memcheck to look for memory problems (development environments only).
# "daemon off;" is passed as a global directive so nginx stays in the foreground.
# NOTE(review): requests are normally served by forked worker processes - confirm whether
# child processes must be traced too (e.g. --trace-children=yes or "master_process off;").
valgrind --tool=memcheck --leak-check=full --show-leak-kinds=all \
         nginx -g "daemon off;"

6. SSL/TLS问题诊断

6.1 SSL配置检测

# Inspect the certificate
openssl x509 -in /etc/ssl/certs/example.com.crt -text -noout

# Check the private key for consistency.
# -noout prints only the verdict; without it the private key itself is
# written to the terminal (and into any scrollback or session log).
openssl rsa -in /etc/ssl/private/example.com.key -check -noout

# Verify that the certificate and the private key belong together.
#
# cert_matches_key CERT_FILE KEY_FILE
#   Returns 0 when the public key embedded in CERT_FILE equals the public key
#   derived from KEY_FILE, non-zero otherwise.
#
# Public keys are compared instead of RSA moduli so that ECDSA and other
# non-RSA keys work too. Any openssl failure counts as a mismatch: hashing
# the empty output of two failed commands would otherwise compare equal and
# report a false "match".
cert_matches_key() {
    local cert_file=$1
    local key_file=$2
    local cert_pub key_pub

    cert_pub=$(openssl x509 -noout -pubkey -in "$cert_file") || return 1
    key_pub=$(openssl pkey -pubout -in "$key_file") || return 1

    [ -n "$cert_pub" ] && [ "$cert_pub" = "$key_pub" ]
}

if cert_matches_key /etc/ssl/certs/example.com.crt /etc/ssl/private/example.com.key; then
    echo "证书和私钥匹配"
else
    echo "证书和私钥不匹配"
fi

# Test the TLS connection (-servername sets the SNI host name sent in the handshake)
openssl s_client -connect example.com:443 -servername example.com

# Audit the TLS configuration with external scanners
sslscan example.com
testssl.sh example.com

6.2 SSL握手问题

# Debug the TLS handshake (--trace-ascii writes a full trace to ssl-trace.log)
curl -v --trace-ascii ssl-trace.log https://example.com/

# Look for SSL-related entries in the nginx error log
grep "SSL" /var/log/nginx/error.log

# Try specific TLS protocol versions
openssl s_client -connect example.com:443 -tls1_2
openssl s_client -connect example.com:443 -tls1_3

7. 上游服务器问题

7.1 后端连接测试

# Upstream server connectivity test
#!/bin/bash
# upstream-health-check.sh
#
# Usage: upstream-health-check.sh [HOST:PORT ...]
# For every server: opens a TCP connection (5 s limit) and, when that works,
# requests http://HOST:PORT/health and expects status 200.
# Without arguments the built-in server list below is used.

UPSTREAM_SERVERS=(
    "192.168.1.10:8080"
    "192.168.1.11:8080"
    "192.168.1.12:8080"
)

if [ "$#" -gt 0 ]; then
    UPSTREAM_SERVERS=("$@")
fi

for server in "${UPSTREAM_SERVERS[@]}"; do
    host=${server%%:*}
    port=${server##*:}

    echo -n "Testing $server: "

    # TCP connect test. Host and port are passed as positional parameters
    # rather than spliced into the command string, and the connect error text
    # is discarded so it does not land in the middle of the status line.
    if timeout 5 bash -c ': < "/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null; then
        echo -n "TCP OK, "

        # HTTP test: only the status code is kept
        response=$(curl -s -o /dev/null -w "%{http_code}" \
                  --max-time 5 "http://$server/health")

        if [ "$response" = "200" ]; then
            echo "HTTP OK"
        else
            echo "HTTP FAILED ($response)"
        fi
    else
        echo "TCP FAILED"
    fi
done

7.2 负载均衡问题诊断

# Check how requests are distributed by the load balancer
#!/bin/bash
# load-balance-test.sh
#
# Sends REQUESTS requests to URL, records which backend answered each one in
# RESULTS_FILE ("request_number,server") and prints a count per backend.
# Assumes the response body contains a "Server: <name>" token that
# identifies the backend.

URL="http://example.com"
REQUESTS=100
RESULTS_FILE="lb-test-results.txt"

echo "测试负载均衡分发..."

# Start from an empty file; leftovers from earlier runs would be counted again.
: > "$RESULTS_FILE"

for ((i = 1; i <= REQUESTS; i++)); do
    # --max-time keeps one stalled backend from hanging the whole run
    response=$(curl -s --max-time 10 -H "X-Test-Request: $i" "$URL")
    server=$(echo "$response" | grep -o "Server: [^,]*" | cut -d: -f2)
    echo "$i,$server" >> "$RESULTS_FILE"
done

# Summarise the distribution.
# The server column must be extracted BEFORE sorting: uniq -c only merges
# adjacent duplicates, and sorting whole "number,server" lines orders them by
# request number, which splits each server's count into many fragments.
echo "负载均衡分发统计:"
cut -d, -f2 "$RESULTS_FILE" | sort | uniq -c

8. 自动化故障诊断

8.1 健康检查脚本

#!/bin/bash
# nginx-health-check.sh

# 配置参数
HEALTH_URL="http://localhost/health"
ERROR_THRESHOLD=5
RESPONSE_TIME_THRESHOLD=2000  # ms
ALERT_EMAIL="admin@example.com"

# 检查函数
# Report whether the nginx systemd unit is active.
# Outputs: a CRITICAL line on stdout when the unit is not active.
# Returns: 0 when active, 1 otherwise.
check_nginx_status() {
    systemctl is-active --quiet nginx && return 0

    echo "CRITICAL: Nginx service is not running"
    return 1
}

# Measure how long the health endpoint takes to answer.
# Globals:  HEALTH_URL (read), RESPONSE_TIME_THRESHOLD in milliseconds (read)
# Outputs:  a CRITICAL line when the request fails, a WARNING line when the
#           response is slower than the threshold.
# Returns:  0 when the request succeeded within the threshold, 1 otherwise.
check_response_time() {
    local response_time response_time_ms

    # curl prints the -w timing even when the request fails (a refused
    # connection yields a tiny time_total), so success must be taken from its
    # exit status. -f turns HTTP errors (>= 400) into failures as well and
    # --max-time keeps a hung backend from blocking the check.
    if ! response_time=$(curl -f -w "%{time_total}" -s -o /dev/null \
                              --max-time 10 "$HEALTH_URL"); then
        echo "CRITICAL: Health check request to $HEALTH_URL failed"
        return 1
    fi

    # Seconds (fractional) -> whole milliseconds; awk avoids depending on bc.
    response_time_ms=$(awk -v t="$response_time" 'BEGIN { printf "%.0f", t * 1000 }')

    if [ "$response_time_ms" -gt "$RESPONSE_TIME_THRESHOLD" ]; then
        echo "WARNING: Response time ${response_time_ms}ms exceeds threshold"
        return 1
    fi
    return 0
}

# Count 4xx/5xx responses among the most recent access log lines.
# Globals:   ERROR_THRESHOLD (read) - maximum tolerated number of errors
# Arguments: $1 - access log path (default /var/log/nginx/access.log)
#            $2 - number of trailing lines to inspect (default 1000)
# Outputs:   a WARNING line when the threshold is exceeded.
# Returns:   0 when the error count is within the threshold, 1 otherwise.
check_error_rate() {
    local log_file=${1:-/var/log/nginx/access.log}
    local sample_size=${2:-1000}
    local error_count

    # Match the status field against an explicit 4xx/5xx pattern. A plain
    # `$9 >= 400` falls back to string comparison on malformed lines whose
    # ninth field is not a number and counts them as errors.
    error_count=$(tail -n "$sample_size" "$log_file" | \
                 awk '$9 ~ /^[45][0-9][0-9]$/ {count++} END {print count+0}')

    if [ "$error_count" -gt "$ERROR_THRESHOLD" ]; then
        echo "WARNING: High error rate: $error_count errors in last $sample_size requests"
        return 1
    fi
    return 0
}

# Check how full the filesystem holding the logs is.
# Arguments: $1 - path on the filesystem to check (default /var/log)
#            $2 - usage percentage above which to warn (default 90)
# Outputs:   a WARNING line when usage is above the limit or cannot be read.
# Returns:   0 when usage is within the limit, 1 otherwise.
check_disk_space() {
    local path=${1:-/var/log}
    local limit=${2:-90}
    local usage

    # -P forces one line per filesystem; without it df may wrap a long device
    # name onto its own line, which shifts the columns of the data line.
    usage=$(df -P "$path" | awk 'NR==2 {sub(/%/, "", $5); print $5}')

    # An unreadable value must not slip through as "healthy".
    if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
        echo "WARNING: Unable to determine disk usage for $path"
        return 1
    fi

    if [ "$usage" -gt "$limit" ]; then
        echo "WARNING: Log disk usage ${usage}% is high"
        return 1
    fi
    return 0
}

# Main check sequence.
# Runs every check, collecting a short label for each one that fails. With no
# failures it prints a success line and exits 0; otherwise it prints the
# failure summary, mails the same text to ALERT_EMAIL and exits 1.
main() {
    issues=()

    check_nginx_status  || issues+=("Service Down")
    check_response_time || issues+=("Slow Response")
    check_error_rate    || issues+=("High Error Rate")
    check_disk_space    || issues+=("Disk Space")

    if [ ${#issues[@]} -eq 0 ]; then
        echo "All checks passed"
        exit 0
    fi

    # One label per line below the heading; the literal \n after the heading
    # is expanded by echo -e.
    alert_message="Nginx Health Check Failed:\n$(printf '%s\n' "${issues[@]}")"
    echo -e "$alert_message"
    echo -e "$alert_message" | mail -s "Nginx Alert" $ALERT_EMAIL
    exit 1
}

main

8.2 自动修复脚本

#!/bin/bash
# nginx-auto-fix.sh

# Restarting is opt-in: set AUTO_RESTART=true in the environment to enable it.
AUTO_RESTART=${AUTO_RESTART:-false}
MAX_RESTART_ATTEMPTS=${MAX_RESTART_ATTEMPTS:-3}
# The counter lives under /run rather than world-writable /tmp: this script
# runs with root privileges, and a predictable file name in /tmp lets any
# local user pre-create the path (for example as a symlink). Like /tmp, /run
# is cleared on reboot, so the counter still resets after a restart of the host.
RESTART_COUNT_FILE=${RESTART_COUNT_FILE:-/run/nginx_restart_count}

# Read the number of restart attempts made so far.
# Globals:  RESTART_COUNT_FILE (read)
# Outputs:  the stored count on stdout, or 0 when the file is missing, empty
#           or does not contain a plain non-negative integer. Callers feed the
#           value into arithmetic and integer tests, which would otherwise
#           fail on such content.
get_restart_count() {
    local count=""

    if [ -f "$RESTART_COUNT_FILE" ]; then
        # read returns non-zero on an empty file or a missing final newline;
        # whatever it managed to read is validated below.
        read -r count < "$RESTART_COUNT_FILE" || true
    fi

    case "$count" in
        ''|*[!0-9]*) echo "0" ;;
        # 10# forces base 10 so that a value like "08" is not taken as octal
        *)           echo "$((10#$count))" ;;
    esac
}

# Store the current restart count plus one.
# Globals: RESTART_COUNT_FILE (written)
increment_restart_count() {
    local current=$(get_restart_count)
    local next=$((current + 1))
    echo $next > "$RESTART_COUNT_FILE"
}

# Forget all recorded restart attempts by removing the counter file.
# Globals: RESTART_COUNT_FILE (removed; a missing file is not an error)
reset_restart_count() {
    local counter_file=$RESTART_COUNT_FILE
    rm -f "$counter_file"
}

# Attempt an automatic repair of nginx.
# Steps: validate the configuration, compress/prune old logs when the log
# filesystem is more than 95% full, then (only when AUTO_RESTART is "true")
# restart nginx, limited to MAX_RESTART_ATTEMPTS failed attempts in a row.
# Globals:  AUTO_RESTART, MAX_RESTART_ATTEMPTS (read)
# Calls:    get_restart_count, increment_restart_count, reset_restart_count
# Returns:  0 when nothing further was required or the restart succeeded,
#           1 on a configuration error, a failed restart or an exhausted
#           attempt budget.
auto_fix_nginx() {
    local log_usage restart_count

    echo "Attempting to fix Nginx issues..."

    # A restart with a broken configuration would take the service down, so
    # stop here and leave the fix to a human.
    if ! nginx -t; then
        echo "Configuration syntax error detected"
        echo "Please check configuration manually"
        return 1
    fi

    # Free disk space when the log filesystem is nearly full.
    # df -P keeps each filesystem on one line (plain df may wrap long device
    # names); the numeric guard avoids an integer-test error on empty output.
    log_usage=$(df -P /var/log | awk 'NR==2 {sub(/%/, "", $5); print $5}')
    if [[ "$log_usage" =~ ^[0-9]+$ ]] && [ "$log_usage" -gt 95 ]; then
        echo "Cleaning old log files..."
        find /var/log/nginx -name "*.log" -mtime +7 -exec gzip {} \;
        find /var/log/nginx -name "*.gz" -mtime +30 -delete
    fi

    # Restarting is opt-in; without it the function has done all it may do.
    if [ "$AUTO_RESTART" != "true" ]; then
        return 0
    fi

    restart_count=$(get_restart_count)

    if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
        echo "Maximum restart attempts reached. Manual intervention required."
        return 1
    fi

    echo "Restarting Nginx (attempt $((restart_count + 1))/$MAX_RESTART_ATTEMPTS)"
    increment_restart_count

    systemctl restart nginx
    # Give the service a moment to come up before judging the result
    sleep 5

    if systemctl is-active --quiet nginx; then
        echo "Nginx restarted successfully"
        reset_restart_count
        return 0
    fi

    echo "Nginx restart failed"
    return 1
}

# Run the health check script; attempt a repair only when it exits non-zero
if ! /usr/local/bin/nginx-health-check.sh; then
    auto_fix_nginx
fi

小结

通过本文学习,你应该掌握:

  1. 常见Nginx问题的诊断方法
  2. 日志分析和实时监控技巧
  3. 性能问题识别和调优
  4. 调试工具的使用方法
  5. SSL/TLS问题排查
  6. 上游服务器连接问题诊断
  7. 自动化监控和故障修复
  8. 完整的故障排除流程

这20篇文章涵盖了Nginx从基础安装到高级应用的全方位内容,为你在生产环境中使用Nginx提供了完整的参考指南。

powered by Gitbook© 2025 编外计划 | 最后修改: 2025-08-29 15:40:15

results matching ""

    No results matching ""