Nginx 故障排除与调试技巧
Troubleshooting and Debugging Tips
概述
在生产环境中,快速定位和解决Nginx问题至关重要。本文将提供全面的故障排除指南,包括常见问题诊断、性能调优、日志分析和监控工具使用。
1. 常见问题诊断
1.1 服务启动问题
# Check whether the nginx service is running
systemctl status nginx
# Validate the configuration syntax
nginx -t
nginx -T # dump the complete effective configuration
# Find out what is bound to port 80.
# The port is matched exactly: a bare `grep :80` would also match
# :8080, :8000 and similar ports.
netstat -tulpn | grep -E ':80[[:space:]]'
ss -tulpn 'sport = :80'
lsof -i :80
# Inspect the firewall rules
ufw status # Ubuntu
firewall-cmd --list-all # CentOS
iptables -L -n
# Inspect SELinux (CentOS/RHEL)
sestatus
getsebool -a | grep httpd
# NOTE: unlike the read-only checks above, this command CHANGES the system:
# it persistently (-P) sets the httpd_can_network_connect boolean to 1.
# Run it only when nginx must open outbound connections (e.g. proxying).
setsebool -P httpd_can_network_connect 1
1.2 配置文件问题
# Validate the syntax of an explicitly named configuration file
nginx -t -c /etc/nginx/nginx.conf
# List the include directives found in the full configuration dump
nginx -T | grep "include"
# Show the --conf-path value nginx was built with
# (nginx -V prints to stderr, hence the 2>&1)
nginx -V 2>&1 | grep -o '\-\-conf-path=\S*'
# Check ownership and permissions of the configuration files
ls -la /etc/nginx/nginx.conf
ls -la /etc/nginx/conf.d/
# Back up the configuration (first command) and restore it from the
# backup (second command). These are two alternative actions, not a
# sequence: run back to back, the second simply copies the same content back.
cp /etc/nginx/nginx.conf /etc/nginx/nginx.conf.backup
cp /etc/nginx/nginx.conf.backup /etc/nginx/nginx.conf
1.3 权限问题诊断
# Show which user each nginx process runs as.
# -C selects by exact command name, so unrelated processes that merely
# mention "nginx" on their command line (including grep) are not listed.
ps -C nginx -o user,pid,args
# Check file permissions
ls -la /var/www/html/
ls -la /var/log/nginx/
# Set the expected ownership and permissions.
# NOTE(review): nginx:nginx assumes the worker user is "nginx"; use the
# user reported by the ps command above if it differs.
chown -R nginx:nginx /var/www/html/
# Capital X grants execute only to directories (and to files that are
# already executable); a plain 755 would mark every static file executable.
chmod -R u=rwX,go=rX /var/www/html/
chown -R nginx:nginx /var/log/nginx/
# Inspect the document root itself
stat /var/www/html/
# Check the SELinux context and reset it to the policy default
ls -Z /var/www/html/
restorecon -R /var/www/html/
2. 日志分析技巧
2.1 错误日志分析
# Follow the error log in real time
tail -f /var/log/nginx/error.log
# Show only the errors logged during the current hour
grep "$(date +'%Y/%m/%d %H')" /var/log/nginx/error.log
# Count entries per third field (the [level] token in the default format)
awk '{print $3}' /var/log/nginx/error.log | sort | uniq -c | sort -nr
# Look for specific errors.
# The error log records the OS error text rather than the HTTP status,
# so missing files show up as "No such file or directory", not as "404".
grep "No such file or directory" /var/log/nginx/error.log
grep "upstream" /var/log/nginx/error.log
# The logged message is "Permission denied" (capital P); match
# case-insensitively so it is not missed.
grep -i "permission denied" /var/log/nginx/error.log
2.2 访问日志分析
# Access-log analysis script
#!/bin/bash
# nginx-log-analyzer.sh
#
# Prints a summary report for an nginx access log.
# Usage: nginx-log-analyzer.sh [LOG_FILE]
#   LOG_FILE defaults to /var/log/nginx/access.log
# The field positions used below ($1 client IP, $7 request path, $9 status,
# 6th double-quote-delimited field = user agent) assume the "combined" format.
LOG_FILE=${1:-/var/log/nginx/access.log}
# Fail early instead of printing a report full of empty sections.
if [[ ! -r "$LOG_FILE" ]]; then
  echo "无法读取日志文件: $LOG_FILE" >&2
  exit 1
fi
echo "=== Nginx访问日志分析报告 ==="
echo "文件: $LOG_FILE"
echo "时间: $(date)"
echo
# Total number of requests
echo "总请求数: $(wc -l < "$LOG_FILE")"
# Number of distinct client IPs
echo "独立IP数: $(awk '{print $1}' "$LOG_FILE" | sort -u | wc -l)"
# Status code distribution
echo -e "\n=== 状态码分布 ==="
awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -nr
# Most active client IPs
echo -e "\n=== 访问最多的IP(前10) ==="
awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -10
# Most requested paths
echo -e "\n=== 访问最多的页面(前10) ==="
awk '{print $7}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -10
# Paths that returned 404
echo -e "\n=== 404错误页面 ==="
awk '$9==404 {print $7}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -10
# Slow requests (response time > 1 second).
# This only works when log_format appends $request_time as the LAST field.
# The numeric guard keeps lines whose last field is something else (e.g. the
# tail of the user agent in the plain combined format) out of the result.
echo -e "\n=== 慢请求分析 ==="
awk '$NF ~ /^[0-9]+(\.[0-9]+)?$/ && $NF > 1 {print $7, $NF}' "$LOG_FILE" \
  | sort -k2,2nr | head -10
# User agent statistics
echo -e "\n=== 浏览器统计 ==="
awk -F'"' '{print $6}' "$LOG_FILE" | sort | uniq -c | sort -nr | head -10
2.3 实时监控脚本
#!/bin/bash
# nginx-realtime-monitor.sh
#
# Redraws a small dashboard every INTERVAL seconds: requests seen in the last
# minute, the latest error-log lines, connections on port 80 and the summed
# CPU/memory share of the nginx processes. Stop it with Ctrl-C.
INTERVAL=5
ACCESS_LOG="/var/log/nginx/access.log"
ERROR_LOG="/var/log/nginx/error.log"
while true; do
  clear
  echo "=== Nginx实时监控 $(date) ==="
  # Requests during the last minute, taken from the newest 1000 log lines.
  # The log timestamp "[10/Oct/2023:13:55:36 +0000]" is not a format that
  # `date -d` accepts, so it is rewritten to "10 Oct 2023 13:55:36 +0000".
  # The rewritten text is checked against a strict pattern before it is put
  # on a shell command line (log content is untrusted), and each distinct
  # timestamp is converted only once instead of forking date for every line.
  recent_requests=$(tail -n 1000 "$ACCESS_LOG" \
    | awk -v now="$(date +%s)" '
      {
        stamp = $4 " " $5
        gsub(/\[|\]/, "", stamp)
        sub(/:/, " ", stamp)
        gsub(/\//, " ", stamp)
        if (stamp !~ /^[0-9]+ [A-Za-z]+ [0-9]+ [0-9:]+ [-+][0-9]+$/) next
        if (!(stamp in epoch)) {
          cmd = "date -d \"" stamp "\" +%s"
          epoch[stamp] = ((cmd | getline secs) > 0) ? secs : 0
          close(cmd)
        }
        if (now - epoch[stamp] < 60) count++
      }
      END { print count + 0 }')
  echo "最近1分钟请求数: $recent_requests"
  # Latest error-log entries
  echo -e "\n最近错误:"
  tail -n 3 "$ERROR_LOG"
  # Sockets whose local port is exactly 80 (a text grep for ":80" would also
  # count :8080 etc.); tail drops the header line that ss prints.
  connections=$(ss -ant 'sport = :80' | tail -n +2 | wc -l)
  echo -e "\n当前连接数: $connections"
  # Summed CPU and memory share of the nginx processes. -C selects by exact
  # command name, so this script (its file name contains "nginx") is not
  # counted the way it would be with `ps aux | grep nginx`.
  ps -C nginx -o %cpu=,%mem= \
    | awk '{cpu+=$1; mem+=$2} END{printf "CPU: %.1f%%, Memory: %.1f%%\n", cpu, mem}'
  sleep "$INTERVAL"
done
3. 性能问题诊断
3.1 响应时间分析
# Measure the response-time phases of a request; the -w output template is
# read from the file curl-format.txt (relative to the current directory)
curl -w "@curl-format.txt" -o /dev/null -s http://example.com/
# Contents of curl-format.txt:
# time_namelookup: %{time_namelookup}\n
# time_connect: %{time_connect}\n
# time_appconnect: %{time_appconnect}\n
# time_pretransfer: %{time_pretransfer}\n
# time_redirect: %{time_redirect}\n
# time_starttransfer: %{time_starttransfer}\n
# time_total: %{time_total}\n
# Batch benchmark script
#!/bin/bash
# performance-test.sh
#
# Usage: performance-test.sh [URL] [REQUESTS] [CONCURRENCY]
#   URL          target to benchmark        (default http://localhost)
#   REQUESTS     total requests for ab      (default 100)
#   CONCURRENCY  concurrent connections     (default 10)
URL=${1:-"http://localhost"}
REQUESTS=${2:-100}
CONCURRENCY=${3:-10}
# ab refuses a URL that has no path component ("invalid URL"), which
# includes the default above, so a bare scheme://host[:port] gets a "/".
if [[ "$URL" =~ ^[a-zA-Z]+://[^/]+$ ]]; then
  URL="$URL/"
fi
echo "性能测试: $URL"
echo "请求数: $REQUESTS, 并发数: $CONCURRENCY"
# Load test with ab (ApacheBench), when it is installed
if command -v ab > /dev/null 2>&1; then
  ab -n "$REQUESTS" -c "$CONCURRENCY" "$URL"
else
  echo "ab not found, skipping ab test" >&2
fi
# Load test with wrk, when it is installed
if command -v wrk > /dev/null 2>&1; then
  echo -e "\n=== wrk测试结果 ==="
  # wrk requires connections >= threads, so cap the thread count.
  wrk_threads=4
  if ((CONCURRENCY < wrk_threads)); then
    wrk_threads=$CONCURRENCY
  fi
  wrk -t"$wrk_threads" -c"$CONCURRENCY" -d30s "$URL"
fi
3.2 内存和CPU监控
# Monitor the resource usage of the nginx processes
#!/bin/bash
# nginx-resource-monitor.sh
#
# Every 5 seconds prints per-process CPU/memory figures for nginx together
# with system load, memory and socket summaries. Stop it with Ctrl-C.
while true; do
  echo "=== $(date) ==="
  # Per-process figures. -C selects by exact command name; with
  # `ps aux | grep nginx` this script would list itself, because its own
  # file name contains "nginx".
  ps -C nginx -o pid=,%cpu=,%mem=,rss= \
    | awk '{printf "PID: %s, CPU: %s%%, MEM: %s%%, RSS: %sKB\n", $1, $2, $3, $4}'
  # System load
  uptime
  # Memory usage
  free -h
  # Socket summary
  ss -s
  echo "=========================="
  sleep 5
done
3.3 连接池调优诊断
# Count sockets per TCP state for local port 80.
# The ss filter matches the port exactly (a text grep for ":80" also hits
# :8080 etc.); NR > 1 skips the header line.
ss -ant 'sport = :80' | awk 'NR > 1 {print $1}' | sort | uniq -c
# Count TIME-WAIT sockets. ss spells the state "TIME-WAIT" (netstat uses
# TIME_WAIT), so grepping ss output for TIME_WAIT always yields 0; use the
# built-in state filter instead. tail drops the header line.
ss -ant state time-wait | tail -n +2 | wc -l
# Check keep-alive: request the URL twice in one curl invocation and look
# for "Re-using existing connection" in the verbose output. A single
# request cannot show whether the connection is actually reused.
curl -H "Connection: keep-alive" -v http://localhost/ http://localhost/
# Concurrent connection test
#!/bin/bash
# connection-test.sh
#
# Fires MAX_CONNECTIONS background curl requests at URL, pausing for one
# second after every 100 launches, then waits for all of them to finish.
URL="http://localhost"
MAX_CONNECTIONS=1000
launched=0
while [ "$launched" -lt "$MAX_CONNECTIONS" ]; do
  launched=$((launched + 1))
  curl -s "$URL" > /dev/null &
  # Progress report and short breather at every multiple of 100
  if ((launched % 100 == 0)); then
    echo "已启动 $launched 个连接"
    sleep 1
  fi
done
# Block until every background request has returned
wait
echo "所有连接完成"
4. 调试工具和技巧
4.1 启用调试模式
# Debug configuration
# NOTE: the "debug" log level and debug_connection only produce debug output
# when the nginx binary was built with --with-debug (check `nginx -V`).
error_log /var/log/nginx/debug.log debug;
events {
    debug_connection 192.168.1.100; # debug logging for this client IP only
}
http {
    server {
        listen 80;
        server_name debug.example.com;
        # Debug response headers
        add_header X-Debug-Time $msec;
        add_header X-Debug-Connection $connection;
        add_header X-Debug-Requests $connection_requests;
        location /debug {
            # Set the body type with default_type rather than add_header:
            # add_header would send a second Content-Type header, and since
            # add_header directives are inherited from the server level only
            # when the location defines none of its own, it would also drop
            # the X-Debug-* headers above from this response.
            default_type text/plain;
            # Return request details as plain text
            return 200 "Debug Info:
Server: $hostname
Time: $time_local
Request: $request
Args: $args
Remote: $remote_addr
Forwarded: $http_x_forwarded_for
";
        }
    }
}
4.2 使用strace诊断
# Trace the system calls of every running nginx process.
# Attaching only to the first PID from pgrep misses the worker processes
# that already exist (-f only follows children created after attaching),
# so one -p option is generated per nginx PID. The command substitution is
# intentionally unquoted so that it splits into separate arguments.
strace -f $(pgrep -x nginx | sed 's/^/-p /') -e trace=network,file
# Trace only file opens and write the result to a log file
strace $(pgrep -x nginx | sed 's/^/-p /') -e trace=open,openat -o nginx-trace.log
# Show the opens that did not fail with ENOENT
grep "openat" nginx-trace.log | grep -v ENOENT
4.3 使用tcpdump抓包
# Capture traffic on port 80 on all interfaces and print payloads as ASCII (-A)
tcpdump -i any -n -A port 80
# Write the raw packets to a pcap file instead of printing them
tcpdump -i any -n -w nginx-traffic.pcap port 80
# Restrict the capture to one host's traffic on port 80
tcpdump -i any -n host 192.168.1.100 and port 80
5. 内存泄漏检测
5.1 内存使用监控
#!/bin/bash
# memory-leak-detector.sh
#
# Every INTERVAL seconds appends "timestamp,total_rss_kb" for the nginx
# processes to LOG_FILE and prints a warning when the last 10 samples are
# almost monotonically increasing. Stop it with Ctrl-C.
LOG_FILE="/var/log/nginx-memory.log"
INTERVAL=60
while true; do
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # Summed resident set size of the nginx processes. -C matches the exact
  # command name, so unrelated processes that only mention "nginx" in their
  # arguments are not added in; "+ 0" records 0 (not an empty field) when
  # nginx is not running.
  nginx_memory=$(ps -C nginx -o rss= | awk '{sum += $1} END {print sum + 0}')
  # Append the sample to the log
  printf '%s,%s\n' "$timestamp" "$nginx_memory" >> "$LOG_FILE"
  # Growth check: count how many of the last 10 samples exceed their
  # predecessor (at most 9 comparisons); more than 8 triggers the warning.
  tail -n 10 "$LOG_FILE" | awk -F',' '
    BEGIN { prev = 0; trend = 0 }
    {
      if (NR > 1 && $2 > prev) trend++
      prev = $2
    }
    END {
      if (trend > 8) print "WARNING: Memory increasing trend detected"
    }'
  sleep "$INTERVAL"
done
5.2 使用Valgrind检测
# Run nginx under Valgrind's memcheck tool with full leak reporting
# (development environments only). The "daemon off;" directive is passed
# via -g so that nginx stays in the foreground under Valgrind.
valgrind --tool=memcheck --leak-check=full --show-leak-kinds=all \
nginx -g "daemon off;"
6. SSL/TLS问题诊断
6.1 SSL配置检测
# Inspect the certificate
openssl x509 -in /etc/ssl/certs/example.com.crt -text -noout
# Check the private key for consistency. -noout keeps the key material
# itself from being printed to the terminal. (RSA keys only; for other
# key types use `openssl pkey`.)
openssl rsa -in /etc/ssl/private/example.com.key -check -noout
# Verify that certificate and private key belong together by comparing
# their public keys. Unlike comparing RSA moduli this works for any key
# type, and the emptiness check avoids reporting a "match" when both
# commands failed (e.g. missing files) and produced no output at all.
cert_pubkey=$(openssl x509 -noout -pubkey -in /etc/ssl/certs/example.com.crt)
key_pubkey=$(openssl pkey -pubout -in /etc/ssl/private/example.com.key)
if [ -z "$cert_pubkey" ] || [ -z "$key_pubkey" ]; then
  echo "无法读取证书或私钥" >&2
elif [ "$cert_pubkey" = "$key_pubkey" ]; then
  echo "证书和私钥匹配"
else
  echo "证书和私钥不匹配"
fi
# Test the TLS connection. stdin is redirected from /dev/null so that
# s_client exits after the handshake instead of waiting for input.
openssl s_client -connect example.com:443 -servername example.com < /dev/null
# Scan the TLS configuration for weaknesses
sslscan example.com
testssl.sh example.com
6.2 SSL握手问题
# Debug the TLS handshake; the full trace is written to ssl-trace.log
curl -v --trace-ascii ssl-trace.log https://example.com/
# Look for SSL-related entries in the error log
grep "SSL" /var/log/nginx/error.log
# Test individual TLS versions.
# -servername sends SNI so the intended virtual host answers, and stdin
# from /dev/null makes s_client exit after the handshake instead of
# waiting for input.
openssl s_client -connect example.com:443 -servername example.com -tls1_2 < /dev/null
openssl s_client -connect example.com:443 -servername example.com -tls1_3 < /dev/null
7. 上游服务器问题
7.1 后端连接测试
# Upstream server connectivity test
#!/bin/bash
# upstream-health-check.sh
#
# For every host:port entry: try a TCP connect (5 s limit) and, when that
# works, request http://host:port/health and report the HTTP status.
UPSTREAM_SERVERS=(
  "192.168.1.10:8080"
  "192.168.1.11:8080"
  "192.168.1.12:8080"
)
for server in "${UPSTREAM_SERVERS[@]}"; do
  # Split "host:port" using parameter expansion (first and second field)
  host=${server%%:*}
  remainder=${server#*:}
  port=${remainder%%:*}
  printf '%s' "Testing $server: "
  # TCP probe through bash's /dev/tcp pseudo-device
  if ! timeout 5 bash -c "</dev/tcp/$host/$port"; then
    echo "TCP FAILED"
    continue
  fi
  printf '%s' "TCP OK, "
  # HTTP probe: keep only the status code
  response=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time 5 "http://$server/health")
  case "$response" in
    200) echo "HTTP OK" ;;
    *) echo "HTTP FAILED ($response)" ;;
  esac
done
7.2 负载均衡问题诊断
# Check how requests are distributed by the load balancer
#!/bin/bash
# load-balance-test.sh
#
# Sends REQUESTS requests to URL, extracts the "Server: <name>" token from
# each response body and prints how many requests each backend answered.
# NOTE(review): this presumes the backends put "Server: <name>" in the
# response body - confirm against the application.
URL="http://example.com"
REQUESTS=100
RESULTS_FILE="lb-test-results.txt"
echo "测试负载均衡分发..."
# Start from an empty file so results of earlier runs are not counted again.
: > "$RESULTS_FILE"
for ((i = 1; i <= REQUESTS; i++)); do
  response=$(curl -s -H "X-Test-Request: $i" "$URL")
  # First "Server: ..." occurrence only, so each request yields one record
  server=$(printf '%s\n' "$response" | grep -o "Server: [^,]*" | head -n 1 | cut -d: -f2)
  echo "$i,$server" >> "$RESULTS_FILE"
done
# Tally per backend. The server column has to be extracted BEFORE sorting:
# sorting the whole "i,server" records orders them by request number, which
# leaves equal server names non-adjacent and makes uniq -c count wrongly.
echo "负载均衡分发统计:"
cut -d, -f2 "$RESULTS_FILE" | sort | uniq -c
8. 自动化故障诊断
8.1 健康检查脚本
#!/bin/bash
# nginx-health-check.sh
# Configuration parameters
# URL requested by check_response_time
HEALTH_URL="http://localhost/health"
# check_error_rate warns when more than this many of the last 1000
# access-log lines have a status >= 400
ERROR_THRESHOLD=5
RESPONSE_TIME_THRESHOLD=2000 # ms
# Recipient of the alert mail sent by main when any check fails
ALERT_EMAIL="admin@example.com"
# Check functions

# Reports whether the nginx systemd unit is active.
# Outputs: a CRITICAL line on stdout when the unit is not active
# Returns: 0 when active, 1 otherwise
check_nginx_status() {
  systemctl is-active --quiet nginx && return 0
  echo "CRITICAL: Nginx service is not running"
  return 1
}
# Measures the total time of one request to the health endpoint.
# Globals:  HEALTH_URL (read), RESPONSE_TIME_THRESHOLD in ms (read)
# Outputs:  a CRITICAL/WARNING line on stdout when the check fails
# Returns:  0 when the request succeeded within the threshold, 1 otherwise
check_response_time() {
  local elapsed elapsed_ms
  # A failed request (connection refused, timeout, ...) must not be mistaken
  # for a fast one: curl still prints a small time_total in that case, so
  # its exit status is checked explicitly.
  if ! elapsed=$(curl -w "%{time_total}" -s -o /dev/null "$HEALTH_URL"); then
    echo "CRITICAL: Health endpoint $HEALTH_URL is unreachable"
    return 1
  fi
  # Seconds (decimal) -> whole milliseconds; awk avoids a dependency on bc.
  elapsed_ms=$(awk -v t="$elapsed" 'BEGIN { printf "%.0f", t * 1000 }')
  if ((elapsed_ms > RESPONSE_TIME_THRESHOLD)); then
    echo "WARNING: Response time ${elapsed_ms}ms exceeds threshold"
    return 1
  fi
  return 0
}
# Counts responses with status >= 400 (field 9) among the last 1000
# access-log lines and compares the count with ERROR_THRESHOLD.
# Globals:  ERROR_THRESHOLD (read), error_count (written)
# Outputs:  a WARNING line on stdout when the threshold is exceeded
# Returns:  1 when the count is greater than the threshold, 0 otherwise
check_error_rate() {
  error_count=$(
    tail -1000 /var/log/nginx/access.log \
      | awk '$9 >= 400 { hits += 1 } END { print hits + 0 }'
  )
  # Guard clause: anything other than "count above threshold" is a pass
  [ "$error_count" -gt "$ERROR_THRESHOLD" ] || return 0
  echo "WARNING: High error rate: $error_count errors in last 1000 requests"
  return 1
}
# Checks how full the filesystem holding /var/log is.
# Outputs:  a WARNING line on stdout when usage is above 90% or cannot be
#           determined
# Returns:  0 when usage is at most 90%, 1 otherwise
check_disk_space() {
  local usage
  # -P forces the POSIX one-line-per-filesystem layout; without it a long
  # device name can wrap the record onto two lines and shift the columns.
  usage=$(df -P /var/log | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  # A value that is not a plain number means df failed or its output was
  # not understood; report that instead of silently passing the check.
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    echo "WARNING: Unable to determine disk usage for /var/log"
    return 1
  fi
  if ((usage > 90)); then
    echo "WARNING: Log disk usage ${usage}% is high"
    return 1
  fi
  return 0
}
# Main check flow
#
# Runs the four checks in order, collecting a label for each one that fails.
# With no failures it prints "All checks passed" and exits 0; otherwise it
# prints the list of failed checks, mails the same text to ALERT_EMAIL and
# exits 1.
main() {
  issues=()
  check_nginx_status || issues+=("Service Down")
  check_response_time || issues+=("Slow Response")
  check_error_rate || issues+=("High Error Rate")
  check_disk_space || issues+=("Disk Space")
  # Happy path first
  if [ ${#issues[@]} -eq 0 ]; then
    echo "All checks passed"
    exit 0
  fi
  # The literal \n is expanded by echo -e below; the labels are one per line
  alert_message="Nginx Health Check Failed:\n$(printf '%s\n' "${issues[@]}")"
  echo -e "$alert_message"
  echo -e "$alert_message" | mail -s "Nginx Alert" $ALERT_EMAIL
  exit 1
}
main
8.2 自动修复脚本
#!/bin/bash
# nginx-auto-fix.sh
# Restarting is opt-in: set AUTO_RESTART=true in the environment to allow it.
AUTO_RESTART=${AUTO_RESTART:-false}
# Upper bound on consecutive restart attempts
MAX_RESTART_ATTEMPTS=3
# File holding the number of restart attempts so far. It is kept out of the
# world-writable /tmp, where a predictable name written by root can be
# pre-created or symlinked by any local user. The location can be overridden
# through the environment.
RESTART_COUNT_FILE=${RESTART_COUNT_FILE:-/run/nginx_restart_count}
# Get the restart count
#
# Prints the number of restart attempts recorded in RESTART_COUNT_FILE.
# Globals:  RESTART_COUNT_FILE (read)
# Outputs:  the count on stdout; "0" when the file is missing, empty or
#           does not contain a plain non-negative integer
get_restart_count() {
  local stored=""
  if [ -f "$RESTART_COUNT_FILE" ]; then
    IFS= read -r stored < "$RESTART_COUNT_FILE"
  fi
  # Callers feed this value into shell arithmetic and numeric tests, so
  # anything that is not purely digits is treated as "no count yet".
  case "$stored" in
    '' | *[!0-9]*) echo "0" ;;
    # 10# forces base 10 so a value such as "08" is not read as bad octal
    *) echo "$((10#$stored))" ;;
  esac
}
# Increment the restart count
#
# Reads the current count via get_restart_count and writes count + 1 back
# to RESTART_COUNT_FILE, replacing the previous content.
increment_restart_count() {
  local current
  local next
  current=$(get_restart_count)
  next=$((current + 1))
  printf '%s\n' "$next" > "$RESTART_COUNT_FILE"
}
# Reset the restart count
#
# Removes RESTART_COUNT_FILE; a missing file is not an error.
reset_restart_count() {
  local counter_path=$RESTART_COUNT_FILE
  rm -f "$counter_path"
}
# Automatic repair function
#
# Attempts a conservative repair of nginx:
#   1. refuses to do anything when `nginx -t` rejects the configuration;
#   2. when the /var/log filesystem is more than 95% full, compresses nginx
#      logs older than 7 days and deletes archives older than 30 days;
#   3. when AUTO_RESTART is "true", restarts nginx, giving up once
#      MAX_RESTART_ATTEMPTS attempts are on record.
# Globals:  AUTO_RESTART, MAX_RESTART_ATTEMPTS (read)
# Returns:  0 on success (or when no restart was requested), 1 when the
#           configuration is invalid, the restart failed or the attempt
#           limit was reached
auto_fix_nginx() {
  local log_usage restart_count
  echo "Attempting to fix Nginx issues..."
  # Never restart on top of a broken configuration
  if ! nginx -t; then
    echo "Configuration syntax error detected"
    echo "Please check configuration manually"
    return 1
  fi
  # Disk space. -P keeps each filesystem on one line so column 5 is reliably
  # the usage percentage; a non-numeric result skips the cleanup instead of
  # tripping the comparison.
  log_usage=$(df -P /var/log | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if [[ "$log_usage" =~ ^[0-9]+$ ]] && ((log_usage > 95)); then
    echo "Cleaning old log files..."
    find /var/log/nginx -type f -name "*.log" -mtime +7 -exec gzip {} +
    find /var/log/nginx -type f -name "*.gz" -mtime +30 -delete
  fi
  # Restarting is opt-in
  if [ "$AUTO_RESTART" != "true" ]; then
    return 0
  fi
  restart_count=$(get_restart_count)
  if [ "$restart_count" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    echo "Maximum restart attempts reached. Manual intervention required."
    return 1
  fi
  echo "Restarting Nginx (attempt $((restart_count + 1))/$MAX_RESTART_ATTEMPTS)"
  # Record the attempt before restarting so a crash loop is still counted
  increment_restart_count
  systemctl restart nginx
  # Give the service a moment to settle before judging the result
  sleep 5
  if systemctl is-active --quiet nginx; then
    echo "Nginx restarted successfully"
    reset_restart_count
    return 0
  fi
  echo "Nginx restart failed"
  return 1
}
# Run the health check; attempt a repair only when it reports a failure
/usr/local/bin/nginx-health-check.sh || auto_fix_nginx
小结
通过本文学习,你应该掌握:
- 常见Nginx问题的诊断方法
- 日志分析和实时监控技巧
- 性能问题识别和调优
- 调试工具的使用方法
- SSL/TLS问题排查
- 上游服务器连接问题诊断
- 自动化监控和故障修复
- 完整的故障排除流程
这20篇文章涵盖了Nginx从基础安装到高级应用的全方位内容,为你在生产环境中使用Nginx提供了完整的参考指南。