Apache 监控与诊断
Apache Monitoring and Diagnostics
概述 (Overview)
有效的监控和诊断是确保Apache服务器稳定运行的关键。本文将详细介绍Apache监控的各种方法,包括内置监控模块、日志分析、性能指标收集、故障诊断和告警机制等核心技术。
Effective monitoring and diagnostics are key to keeping Apache servers running stably. This article covers the core techniques of Apache monitoring in detail: built-in monitoring modules, log analysis, performance metrics collection, fault diagnosis, and alerting mechanisms.
1. 内置监控模块 (Built-in Monitoring Modules)
1.1 mod_status配置 (mod_status Configuration)
# Load the status module.
LoadModule status_module modules/mod_status.so

# Collect extended per-request statistics. ExtendedStatus is only valid in
# the server-config context, so it must sit outside any <Location> block.
ExtendedStatus On

# Status page. The machine-readable variant is the same handler requested as
# /server-status?auto: <Location> matches on the URL path only (the query
# string is not part of the match), so this one block serves both forms.
<Location "/server-status">
    SetHandler server-status
    # Access control: allow localhost OR the internal subnet. <RequireAll>
    # would require a client to match both addresses at the same time, which
    # no client can, so every request would be refused.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>
1.2 mod_info配置 (mod_info Configuration)
# Load the info module.
LoadModule info_module modules/mod_info.so

# Server configuration information. Narrower views are selected with a query
# string on the same URL (for example /server-info?config or
# /server-info?mod_status.c); <Location> matches on the URL path only, so a
# separate block containing "?..." would never match and is not needed.
<Location "/server-info">
    SetHandler server-info
    # Access control: allow localhost OR the internal subnet. <RequireAll>
    # would require a client to match both addresses at the same time, which
    # no client can, so every request would be refused.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>
2. 日志监控和分析 (Log Monitoring and Analysis)
2.1 实时日志监控 (Real-time Log Monitoring)
# Log format for monitoring: the combined-format fields followed by the
# request duration in microseconds (%D) and the Host request header.
# %{Host}i is used instead of %{HTTP_HOST}e because the HTTP_HOST environment
# variable is not populated for ordinary (non-CGI) requests and would be
# logged as "-".
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D %{Host}i" monitor_combined

# Monitoring log and error log.
CustomLog /var/log/apache2/monitor.log monitor_combined
ErrorLog /var/log/apache2/error.log

# Conditional logging: requests whose URI starts with /health are tagged
# "dontlog" and left out of access.log (monitor.log above is not filtered).
SetEnvIf Request_URI "^/health" dontlog
CustomLog /var/log/apache2/access.log combined env=!dontlog
2.2 日志分析脚本 (Log Analysis Script)
#!/bin/bash
# log-analyzer.sh
#
# Print a summary report for an Apache access log and error log.
# Usage: log-analyzer.sh [access_log] [error_log]

#######################################
# Summarise an access log in common/combined layout and show the tail of
# the error log.
# Arguments:
#   $1 - access log path (default: /var/log/apache2/access.log)
#   $2 - error log path  (default: /var/log/apache2/error.log)
# Outputs:
#   Report on stdout; diagnostics on stderr.
# Returns:
#   0 on success, 1 if the access log cannot be read.
#######################################
analyze_logs() {
  local log_file=${1:-"/var/log/apache2/access.log"}
  local error_log=${2:-"/var/log/apache2/error.log"}
  local cutoff recent

  if [[ ! -r "$log_file" ]]; then
    echo "Cannot read access log: $log_file" >&2
    return 1
  fi

  echo "=== Apache Log Analysis ==="
  echo "Log file: $log_file"
  echo

  # Request counts.
  echo "1. Request Statistics:"
  echo " Total requests: $(wc -l < "$log_file")"

  # Log timestamps look like "[10/Oct/2024:13:55:36". Comparing that text
  # directly sorts by day-of-month first and by month *name*, so each entry is
  # converted to a sortable YYYYmmddHHMMSS key before comparing it with the
  # cutoff. Requires GNU date for "-d".
  cutoff=$(date -d '1 hour ago' '+%Y%m%d%H%M%S')
  recent=$(awk -v cutoff="$cutoff" '
    {
      if (split(substr($4, 2), t, "[/:]") < 6) next
      mon = (index("JanFebMarAprMayJunJulAugSepOctNovDec", t[2]) + 2) / 3
      key = sprintf("%04d%02d%02d%02d%02d%02d", t[3], mon, t[1], t[4], t[5], t[6])
      # Force a string comparison; both sides are fixed-width digit strings.
      if ((key "") >= (cutoff "")) n++
    }
    END { print n + 0 }
  ' "$log_file")
  echo " Requests in last hour: $recent"

  # HTTP status code distribution (field 9 in common/combined layout).
  echo
  echo "2. HTTP Status Code Distribution:"
  awk '{print $9}' "$log_file" | sort | uniq -c | sort -nr | head -n 10

  # Most requested URLs (field 7).
  echo
  echo "3. Top 10 Most Requested URLs:"
  awk '{print $7}' "$log_file" | sort | uniq -c | sort -nr | head -n 10

  # Largest response times. Only lines whose LAST field is a plain integer
  # are considered, so this section is empty unless the log format ends with
  # a numeric field such as %D.
  echo
  echo "4. Response Time Analysis:"
  awk '{print $NF}' "$log_file" | grep -E "^[0-9]+$" | sort -n | tail -n 10

  # Tail of the error log.
  echo
  echo "5. Recent Error Log Entries:"
  if [[ -r "$error_log" ]]; then
    tail -n 20 "$error_log"
  else
    echo "Cannot read error log: $error_log" >&2
  fi

  # Busiest client addresses (field 1).
  echo
  echo "6. Top 10 Client IPs:"
  awk '{print $1}' "$log_file" | sort | uniq -c | sort -nr | head -n 10

  echo
  echo "Log analysis completed!"
}

analyze_logs "$@"
3. 性能指标监控 (Performance Metrics Monitoring)
3.1 系统资源监控 (System Resource Monitoring)
#!/bin/bash
# resource-monitor.sh
#
# Print a snapshot of CPU, memory, disk, port-80 connections and apache2
# process usage.

#######################################
# Print the resource report.
# Arguments: none
# Outputs:   report on stdout
#######################################
monitor_resources() {
  echo "=== System Resource Monitoring ==="

  # CPU usage: fields 2, 4, 6 and 8 of the "Cpu(s)" summary line of top.
  echo "1. CPU Usage:"
  top -bn1 | awk '/Cpu\(s\)/ {print " " $2 " " $4 " " $6 " " $8}'

  # Memory usage (swap line omitted).
  echo
  echo "2. Memory Usage:"
  free -h | grep -v "Swap"

  # Disk usage: header plus any line mentioning /var/www or /var/log.
  echo
  echo "3. Disk Usage:"
  df -h | grep -E "(Filesystem|/var/www|/var/log)"

  # TCP connection states on local port 80. The local-address column must
  # END in ":80" (or ".80"); a plain 'grep :80' would also count ports such
  # as 8080 and outbound connections to remote port 80.
  echo
  echo "4. Network Connections:"
  netstat -an | awk '$1 ~ /^tcp/ && $4 ~ /[:.]80$/ {print $6}' | sort | uniq -c

  # Apache worker processes, selected by command name so that unrelated
  # processes which merely mention "apache2" in their arguments are excluded.
  echo
  echo "5. Apache Processes:"
  ps -C apache2 -o pid= | wc -l
  echo " Memory usage:"
  # Sum of resident set sizes (KiB as reported by ps), shown in MB.
  ps -C apache2 -o rss= | awk '{total+=$1} END {print " Total: " total/1024 " MB"}'

  echo
  echo "Resource monitoring completed!"
}

monitor_resources
3.2 Apache性能指标 (Apache Performance Metrics)
#!/bin/bash
# apache-metrics.sh
#
# Fetch the machine-readable mod_status page and print key metrics.
# Usage: apache-metrics.sh [status_url]

#######################################
# Print selected mod_status counters and the worker utilisation.
# Arguments:
#   $1 - status URL (default: http://localhost/server-status?auto)
# Outputs:
#   Report on stdout; diagnostics on stderr.
# Returns:
#   0 on success, 1 if the status page could not be fetched.
#######################################
get_apache_metrics() {
  local status_url=${1:-"http://localhost/server-status?auto"}
  local metrics busy_workers idle_workers total_workers scaled

  echo "=== Apache Performance Metrics ==="

  # -f makes curl fail on HTTP errors so an error page is never parsed as
  # metrics; --max-time keeps the script from hanging on a stuck server.
  if ! metrics=$(curl -sf --max-time 10 "$status_url"); then
    echo "Failed to fetch server status from $status_url" >&2
    return 1
  fi

  # Key counters, printed as reported by the server.
  echo "1. Server Status:"
  grep -E "^(Total Accesses|Total kBytes|CPULoad|Uptime|ReqPerSec|BytesPerSec|BytesPerReq|BusyWorkers|IdleWorkers)" <<<"$metrics"

  # Worker utilisation = busy / (busy + idle).
  echo
  echo "2. Load Analysis:"
  busy_workers=$(awk '/BusyWorkers/ {print $2; exit}' <<<"$metrics")
  idle_workers=$(awk '/IdleWorkers/ {print $2; exit}' <<<"$metrics")
  # Anything that is not a plain integer (e.g. a missing field) counts as 0.
  [[ $busy_workers =~ ^[0-9]+$ ]] || busy_workers=0
  [[ $idle_workers =~ ^[0-9]+$ ]] || idle_workers=0
  total_workers=$((busy_workers + idle_workers))

  if ((total_workers > 0)); then
    # Integer arithmetic scaled by 100 gives two decimals, truncated, without
    # depending on bc.
    scaled=$((busy_workers * 10000 / total_workers))
    printf ' Worker utilization: %d.%02d%%\n' "$((scaled / 100))" "$((scaled % 100))"
    echo " Busy workers: $busy_workers"
    echo " Idle workers: $idle_workers"
  fi

  echo
  echo "Apache metrics collection completed!"
}

get_apache_metrics "$@"
4. 健康检查和告警 (Health Check and Alerting)
4.1 健康检查脚本 (Health Check Script)
#!/bin/bash
# health-check.sh
#
# Run a series of health checks against an Apache server.
# Usage: health-check.sh [host] [port]

#######################################
# Check port, HTTP response, status page, disk space and memory usage.
# Arguments:
#   $1 - host to probe (default: localhost)
#   $2 - port to probe (default: 80)
# Outputs:
#   Report on stdout.
# Returns:
#   0 if the port is open and "/" answers HTTP 200; 1 otherwise. The status
#   page, disk and memory results are reported but do not change the status.
#######################################
check_health() {
  local host=${1:-"localhost"}
  local port=${2:-80}
  local status=0
  local response status_response disk_usage memory_usage

  echo "=== Apache Health Check ==="
  echo "Host: $host:$port"
  echo

  # 1. Is the port open? Nothing else can succeed if it is not, so stop here.
  echo "1. Port Check:"
  if nc -z "$host" "$port" 2>/dev/null; then
    echo " ✓ Port $port is open"
  else
    echo " ✗ Port $port is closed"
    return 1
  fi

  # 2. HTTP response of the document root.
  echo
  echo "2. HTTP Response Check:"
  response=$(curl -s -o /dev/null -w "%{http_code}" "http://$host:$port/" 2>/dev/null)
  if [[ "$response" == "200" ]]; then
    echo " ✓ HTTP 200 OK"
  else
    echo " ✗ HTTP $response"
    status=1
  fi

  # 3. Server status page.
  echo
  echo "3. Server Status Check:"
  status_response=$(curl -s -o /dev/null -w "%{http_code}" "http://$host:$port/server-status?auto" 2>/dev/null)
  if [[ "$status_response" == "200" ]]; then
    echo " ✓ Server status page accessible"
  else
    echo " ✗ Server status page not accessible"
  fi

  # 4. Disk space. -P keeps each filesystem on one line even when the device
  # name is long, so the percentage is always the fifth column of line 2.
  echo
  echo "4. Disk Space Check:"
  disk_usage=$(df -P /var/www 2>/dev/null | awk 'NR == 2 {sub(/%/, "", $5); print $5}')
  if [[ ! $disk_usage =~ ^[0-9]+$ ]]; then
    # df failed (e.g. the path does not exist); avoid a numeric test error.
    echo " ? Disk usage for /var/www could not be determined"
  elif ((disk_usage < 90)); then
    echo " ✓ Disk usage: ${disk_usage}%"
  else
    echo " ⚠️ Disk usage: ${disk_usage}% (Warning: over 90%)"
  fi

  # 5. Memory usage as used/total from the "Mem" line of free.
  echo
  echo "5. Memory Usage Check:"
  memory_usage=$(free 2>/dev/null | awk '/^Mem/ && $2 > 0 {printf "%.0f", $3/$2 * 100.0}')
  if [[ ! $memory_usage =~ ^[0-9]+$ ]]; then
    echo " ? Memory usage could not be determined"
  elif ((memory_usage < 80)); then
    echo " ✓ Memory usage: ${memory_usage}%"
  else
    echo " ⚠️ Memory usage: ${memory_usage}% (Warning: over 80%)"
  fi

  echo
  echo "Health check completed!"
  return "$status"
}

check_health "$@"
4.2 告警机制 (Alerting Mechanism)
#!/bin/bash
# alert-system.sh
#
# Minimal alerting helpers for Apache monitoring.
# Environment:
#   APACHE_ALERT_LOG - alert log file (default: /var/log/apache-alerts.log)

#######################################
# Emit an alert on stdout and append it to the alert log.
# Arguments:
#   $1 - subject
#   $2 - message
#   $3 - severity (default: WARNING)
# Globals:
#   APACHE_ALERT_LOG (read, optional)
#######################################
send_alert() {
  local subject="$1"
  local message="$2"
  local severity=${3:-"WARNING"}
  local log_file=${APACHE_ALERT_LOG:-/var/log/apache-alerts.log}

  echo "[$severity] $subject"
  echo "$message"

  # Mail alert (requires a configured mail server):
  # echo "$message" | mail -s "[$severity] $subject" admin@example.com

  # Slack alert (requires a configured webhook):
  # curl -X POST -H 'Content-type: application/json' \
  # --data "{\"text\":\"[$severity] $subject: $message\"}" \
  # https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK

  # Append to the alert log.
  echo "$(date): [$severity] $subject - $message" >> "$log_file"
}

#######################################
# Example monitor: process presence, response time and recent error volume.
# Arguments: none
# Returns:
#   1 if the apache2 process is missing or http://localhost/ cannot be
#   fetched, 0 otherwise (warnings do not change the status).
#######################################
monitor_apache() {
  local status=0
  local response_time response_time_ms error_count

  # Is any apache2 process running?
  if ! pgrep apache2 > /dev/null; then
    send_alert "Apache Service Down" "Apache process is not running" "CRITICAL"
    return 1
  fi

  # Response time of the front page, taken from curl's own timer.
  if response_time=$(curl -s -o /dev/null -w "%{time_total}" http://localhost/ 2>/dev/null); then
    # Some curl builds print a decimal comma depending on the locale.
    response_time=${response_time/,/.}
    if [[ $response_time =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      # Convert seconds to whole milliseconds so plain integer comparison
      # works and bc is not needed.
      response_time_ms=$(awk -v t="$response_time" 'BEGIN {printf "%.0f", t * 1000}')
      if ((response_time_ms > 5000)); then
        send_alert "High Response Time" "Response time is ${response_time_ms}ms (threshold: 5000ms)" "WARNING"
      fi
    fi
  else
    send_alert "Apache Not Responding" "Request to http://localhost/ failed" "CRITICAL"
    status=1
  fi

  # Error volume in the last 100 error-log lines. -i makes the match
  # case-insensitive; grep -c prints 0 when nothing matches.
  error_count=$(tail -n 100 /var/log/apache2/error.log | grep -ciE 'error|warn')
  if [[ $error_count =~ ^[0-9]+$ ]] && ((error_count > 10)); then
    send_alert "High Error Rate" "Found $error_count errors/warnings in last 100 log entries" "WARNING"
  fi

  return "$status"
}

monitor_apache
5. 第三方监控工具集成 (Third-party Monitoring Tool Integration)
5.1 Nagios监控插件 (Nagios Monitoring Plugin)
#!/bin/bash
# check_apache.sh - Nagios plugin for Apache monitoring
#
# Usage: check_apache.sh [-H host] [-P port] [-w warning_ms] [-c critical_ms]
# Exits with one of the plugin status codes defined below.

# Plugin status codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

# Default parameters
HOST="localhost"
PORT=80
WARNING_THRESHOLD=5000
CRITICAL_THRESHOLD=10000

# Parse command-line options
while getopts "H:P:w:c:" opt; do
  case "$opt" in
    H) HOST="$OPTARG" ;;
    P) PORT="$OPTARG" ;;
    w) WARNING_THRESHOLD="$OPTARG" ;;
    c) CRITICAL_THRESHOLD="$OPTARG" ;;
    *)
      echo "Usage: $0 -H host -P port -w warning_ms -c critical_ms"
      exit $STATE_UNKNOWN
      ;;
  esac
done

#######################################
# Reject non-numeric port/threshold values. Without this, the numeric
# comparisons in check_apache would error out, evaluate as false and let the
# plugin fall through to reporting "OK".
# Globals: PORT, WARNING_THRESHOLD, CRITICAL_THRESHOLD (read)
# Exits:   STATE_UNKNOWN on invalid input
#######################################
validate_arguments() {
  local value
  for value in "$PORT" "$WARNING_THRESHOLD" "$CRITICAL_THRESHOLD"; do
    if [[ ! $value =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN - port and thresholds must be non-negative integers (got '$value')"
      exit $STATE_UNKNOWN
    fi
  done
}

#######################################
# Probe Apache and exit with the matching plugin status.
# Globals: HOST, PORT, WARNING_THRESHOLD, CRITICAL_THRESHOLD (read)
# Outputs: one status line on stdout (with perfdata in the OK case)
# Exits:   STATE_OK, STATE_WARNING or STATE_CRITICAL
#######################################
check_apache() {
  local start_time end_time response_code response_time

  # Port check
  if ! nc -z "$HOST" "$PORT" 2>/dev/null; then
    echo "CRITICAL - Apache is not listening on $HOST:$PORT"
    exit $STATE_CRITICAL
  fi

  # Time one request in milliseconds (%3N needs GNU date).
  start_time=$(date +%s%3N)
  response_code=$(curl -s -o /dev/null -w "%{http_code}" "http://$HOST:$PORT/" 2>/dev/null)
  end_time=$(date +%s%3N)
  response_time=$((end_time - start_time))

  # HTTP status code check
  if [[ "$response_code" != "200" ]]; then
    echo "CRITICAL - HTTP $response_code from Apache"
    exit $STATE_CRITICAL
  fi

  # Response time check, most severe threshold first
  if ((response_time > CRITICAL_THRESHOLD)); then
    echo "CRITICAL - Response time: ${response_time}ms (threshold: ${CRITICAL_THRESHOLD}ms)"
    exit $STATE_CRITICAL
  elif ((response_time > WARNING_THRESHOLD)); then
    echo "WARNING - Response time: ${response_time}ms (threshold: ${WARNING_THRESHOLD}ms)"
    exit $STATE_WARNING
  else
    echo "OK - Response time: ${response_time}ms | response_time=${response_time}ms"
    exit $STATE_OK
  fi
}

validate_arguments
check_apache
5.2 Prometheus监控集成 (Prometheus Monitoring Integration)
# Load the required module.
# NOTE(review): stock Apache httpd does not appear to provide a handler named
# "prometheus-exporter", and mod_ext_filter only supplies external output
# filters. Confirm which third-party module is expected to serve /metrics, or
# scrape /server-status?auto with an external exporter instead.
LoadModule ext_filter_module modules/mod_ext_filter.so

# Prometheus metrics export endpoint.
<Location "/metrics">
    SetHandler prometheus-exporter
    # Access control: allow localhost OR the internal subnet. <RequireAll>
    # would require a client to match both addresses at the same time, which
    # no client can, so every request would be refused.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>

# Access log for custom metric collection: combined fields plus the request
# duration in microseconds (%D).
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D" prometheus_log
CustomLog /var/log/apache2/prometheus.log prometheus_log
6. 故障诊断工具 (Troubleshooting Tools)
6.1 调试日志配置 (Debug Log Configuration)
# The three LogLevel lines below are alternatives: enable exactly one.

# Option 1: verbose logging for the whole server.
LogLevel debug
# Option 2: keep the server at "info" and trace only mod_rewrite.
# LogLevel info rewrite:trace3
# Option 3: server-wide debug plus the most detailed mod_rewrite trace.
# LogLevel debug rewrite:trace8

# Debug access-log format: combined fields, request duration in microseconds
# (%D), the Host request header and a per-request identifier.
# %{Host}i replaces %{HTTP_HOST}e, which is not populated for ordinary
# (non-CGI) requests. %{UNIQUE_ID}e replaces %{REQUEST_ID}e; it is set by
# mod_unique_id, which must be loaded for the field to be non-empty.
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D %{Host}i %{UNIQUE_ID}e" debug_combined
CustomLog /var/log/apache2/debug.log debug_combined
6.2 故障诊断脚本 (Troubleshooting Script)
#!/bin/bash
# diagnostic.sh
#
# Print a one-shot diagnostic report for a local Apache installation.

#######################################
# Collect system, configuration, module, virtual-host, process, port and
# error-log information and print it as a numbered report.
# Arguments: none
# Outputs:   report on stdout
#######################################
run_diagnostics() {
  local generated_at host_name os_name version_line

  generated_at=$(date)
  printf '%s\n' "=== Apache Diagnostic Report ==="
  printf 'Generated at: %s\n\n' "$generated_at"

  # 1. Basic system information
  host_name=$(hostname)
  os_name=$(uname -s)
  version_line=$(apache2 -v | head -1)
  printf '%s\n' "1. System Information:"
  printf ' Hostname: %s\n' "$host_name"
  printf ' OS: %s\n' "$os_name"
  printf ' Apache Version: %s\n' "$version_line"

  # 2. Configuration syntax; on failure, rerun so the error text is shown
  printf '\n%s\n' "2. Configuration Check:"
  if ! apache2ctl configtest > /dev/null 2>&1; then
    printf '%s\n' " ✗ Configuration syntax error"
    apache2ctl configtest
  else
    printf '%s\n' " ✓ Configuration syntax OK"
  fi

  # 3. Loaded modules (first 20 lines)
  printf '\n%s\n' "3. Loaded Modules:"
  apache2ctl -M | head -20

  # 4. Virtual host layout (first 10 lines, stderr discarded)
  printf '\n%s\n' "4. Virtual Host Configuration:"
  apache2ctl -S 2>/dev/null | head -10

  # 5. Processes mentioning apache2, excluding the grep itself
  printf '\n%s\n' "5. Process Information:"
  ps aux | grep apache2 | grep -v grep

  # 6. Listening sockets owned by apache2
  printf '\n%s\n' "6. Port Listening:"
  netstat -tlnp | grep apache2

  # 7. Tail of the error log
  printf '\n%s\n' "7. Recent Errors:"
  tail -10 /var/log/apache2/error.log

  printf '\n%s\n' "Diagnostic report completed!"
}

run_diagnostics
小结 (Summary)
通过本文学习,你应该掌握:
- Apache内置监控模块的配置和使用
- 日志监控和分析技术
- 性能指标收集和监控方法
- 健康检查和告警机制
- 第三方监控工具集成
- 故障诊断工具和调试技术
监控和诊断是运维工作的重要组成部分,建立完善的监控体系能够帮助及时发现和解决问题,确保服务的稳定性和可用性。在下一篇文章中,我们将详细介绍Apache模块开发与扩展技术。