Apache 监控与诊断

Apache Monitoring and Diagnostics

概述 (Overview)

有效的监控和诊断是确保Apache服务器稳定运行的关键。本文将详细介绍Apache监控的各种方法，包括内置监控模块、日志分析、性能指标收集、故障诊断和告警机制等核心技术。

Effective monitoring and diagnostics are key to keeping Apache servers running stably. This article covers the core techniques of Apache monitoring in detail: built-in monitoring modules, log analysis, performance metrics collection, fault diagnosis, and alerting mechanisms.

1. 内置监控模块 (Built-in Monitoring Modules)

1.1 mod_status配置 (mod_status Configuration)

# Load the status module
LoadModule status_module modules/mod_status.so

# Collect extended per-request statistics. ExtendedStatus is only valid in
# the server config context, so it must be placed outside <Location>.
ExtendedStatus On

# Basic status page
<Location "/server-status">
    SetHandler server-status

    # Access control: allow localhost OR the internal subnet. RequireAny is
    # required here; RequireAll would demand that one client address match
    # both rules at the same time, which denies every request.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>

# Machine-readable status output
#
# No separate <Location> section is needed for the "?auto" variant.
# <Location> matches only the URL path and never the query string, so a
# section written for "/server-status?auto" would never apply to any request.
# The "/server-status" section already serves this output; request
#     http://localhost/server-status?auto
# to get the plain "Key: value" form, under that section's access rules.

1.2 mod_info配置 (mod_info Configuration)

# Load the info module
LoadModule info_module modules/mod_info.so

# Server configuration overview
<Location "/server-info">
    SetHandler server-info

    # Access control: allow localhost OR the internal subnet. RequireAny is
    # required here; RequireAll would demand that one client address match
    # both rules at the same time, which denies every request.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>

# Per-module information
#
# No separate <Location> section is needed for query-string variants.
# <Location> matches only the URL path and never the query string, so a
# section written for "/server-info?module" would never apply to any request.
# The "/server-info" section already serves these views, under its access
# rules. Useful query arguments:
#     /server-info?mod_rewrite.c   information about a single module
#     /server-info?list            list of enabled modules
#     /server-info?config          configuration directives only
#     /server-info?server          basic server information
#     /server-info?hooks           modules attached to each hook
#     /server-info?providers       available providers

2. 日志监控和分析 (Log Monitoring and Analysis)

2.1 实时日志监控 (Real-time Log Monitoring)

# Custom log format for monitoring: the combined format plus the request
# duration in microseconds (%D) and the request's Host header.
# %{Host}i reads the header directly; the HTTP_HOST environment variable is
# only populated for CGI-style requests, so %{HTTP_HOST}e would log "-" for
# ordinary requests.
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D %{Host}i" monitor_combined

# Monitoring log (all requests) and error log
CustomLog /var/log/apache2/monitor.log monitor_combined
ErrorLog /var/log/apache2/error.log

# Conditional logging: keep health-check probes out of the access log
SetEnvIf Request_URI "^/health" dontlog
CustomLog /var/log/apache2/access.log combined env=!dontlog

2.2 日志分析脚本 (Log Analysis Script)

#!/bin/bash
# log-analyzer.sh
#
# Summarize an Apache access log (common/combined style fields) and show the
# tail of the error log.
#
# Usage: log-analyzer.sh [access_log] [error_log]

# Print a report for the given logs.
# Arguments:
#   $1 - access log path (default: /var/log/apache2/access.log)
#   $2 - error log path  (default: /var/log/apache2/error.log)
# Outputs: report on stdout, diagnostics on stderr
# Returns: 0 on success, 1 if the access log cannot be read
analyze_logs() {
    local log_file=${1:-"/var/log/apache2/access.log"}
    local error_log=${2:-"/var/log/apache2/error.log"}
    local cutoff recent

    if [[ ! -r "$log_file" ]]; then
        echo "Error: cannot read access log: $log_file" >&2
        return 1
    fi

    echo "=== Apache Log Analysis ==="
    echo "Log file: $log_file"
    echo

    # Request counts
    echo "1. Request Statistics:"
    echo "  Total requests: $(wc -l < "$log_file")"

    # Log timestamps look like [10/Oct/2023:13:55:36 +0000]. Comparing that
    # text directly is not chronological (day comes first, month is a name),
    # so each timestamp is rewritten as YYYYMMDDHHMMSS before comparing it
    # with the cutoff. Both sides are assumed to be in the server local time.
    cutoff=$(date -d '1 hour ago' '+%Y%m%d%H%M%S')
    recent=$(awk -v cutoff="$cutoff" '
        BEGIN {
            n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
            for (i = 1; i <= n; i++) month[names[i]] = sprintf("%02d", i)
        }
        {
            # Skip lines whose 4th field is not a well-formed timestamp
            if (split(substr($4, 2), t, /[\/:]/) < 6) next
            if (!(t[2] in month)) next
            key = t[3] month[t[2]] t[1] t[4] t[5] t[6]
            # Appending "" forces a string comparison of the 14-digit keys
            if ((key "") >= (cutoff "")) count++
        }
        END { print count + 0 }
    ' "$log_file")
    echo "  Requests in last hour: $recent"

    # HTTP status code distribution
    echo
    echo "2. HTTP Status Code Distribution:"
    awk '{print $9}' "$log_file" | sort | uniq -c | sort -nr | head -10

    # Most frequently requested URLs
    echo
    echo "3. Top 10 Most Requested URLs:"
    awk '{print $7}' "$log_file" | sort | uniq -c | sort -nr | head -10

    # Slowest requests. This only produces output when the last field of each
    # log line is numeric, i.e. the LogFormat ends with %D.
    echo
    echo "4. Response Time Analysis:"
    awk '$NF ~ /^[0-9]+$/ {print $NF}' "$log_file" | sort -n | tail -n 10

    # Most recent error log entries
    echo
    echo "5. Recent Error Log Entries:"
    if [[ -r "$error_log" ]]; then
        tail -n 20 -- "$error_log"
    else
        echo "  (error log not readable: $error_log)" >&2
    fi

    # Busiest client addresses
    echo
    echo "6. Top 10 Client IPs:"
    awk '{print $1}' "$log_file" | sort | uniq -c | sort -nr | head -10

    echo
    echo "Log analysis completed!"
}

analyze_logs "$@"

3. 性能指标监控 (Performance Metrics Monitoring)

3.1 系统资源监控 (System Resource Monitoring)

#!/bin/bash
# resource-monitor.sh
#
# Print a snapshot of CPU, memory, disk, port-80 connections and Apache
# process usage on the local machine.

# Print the resource report.
# Arguments: none
# Outputs: report on stdout
monitor_resources() {
    local rss_kb

    echo "=== System Resource Monitoring ==="

    # CPU usage: selected fields of the "Cpu(s)" summary line from top
    echo "1. CPU Usage:"
    top -bn1 | awk '/Cpu\(s\)/ {print "  " $2 " " $4 " " $6 " " $8}'

    # Memory usage (swap line omitted)
    echo
    echo "2. Memory Usage:"
    free -h | grep -v "Swap"

    # Disk usage for the web root and log filesystems
    echo
    echo "3. Disk Usage:"
    df -h | grep -E "(Filesystem|/var/www|/var/log)"

    # TCP connection states on local port 80. The local address column is
    # matched exactly so that :8080, :8000 or outbound connections to a
    # remote port 80 are not counted.
    echo
    echo "4. Network Connections:"
    netstat -an | awk '$1 ~ /^tcp/ && $4 ~ /[:.]80$/ {print $6}' | sort | uniq -c

    # Apache processes, selected by command name so that unrelated processes
    # that merely mention "apache2" on their command line are not counted.
    echo
    echo "5. Apache Processes:"
    rss_kb=$(ps -C apache2 -o rss=)
    printf '%s\n' "$rss_kb" | awk 'NF {n++} END {print n + 0}'
    echo "  Memory usage:"
    # rss is reported in KiB; sum it and convert to MB
    printf '%s\n' "$rss_kb" | awk 'NF {total += $1} END {print "  Total: " total/1024 " MB"}'

    echo
    echo "Resource monitoring completed!"
}

monitor_resources

3.2 Apache性能指标 (Apache Performance Metrics)

#!/bin/bash
# apache-metrics.sh
#
# Fetch the machine-readable mod_status page and print key metrics plus the
# worker utilization percentage.
#
# Usage: apache-metrics.sh [status_url]

# Fetch and summarize server-status output.
# Arguments:
#   $1 - status URL (default: http://localhost/server-status?auto)
# Outputs: report on stdout, diagnostics on stderr
# Returns: 0 on success, 1 if the status page cannot be fetched
get_apache_metrics() {
    local status_url=${1:-"http://localhost/server-status?auto"}
    local metrics busy_workers idle_workers total_workers hundredths

    echo "=== Apache Performance Metrics ==="

    # Fetch the status page; -f turns HTTP errors into a failure and
    # --max-time keeps a hung server from blocking the script.
    if ! metrics=$(curl -sf --max-time 10 "$status_url"); then
        echo "Error: unable to fetch $status_url" >&2
        return 1
    fi

    # Key metrics, printed as reported by mod_status
    echo "1. Server Status:"
    printf '%s\n' "$metrics" | grep -E "^(Total Accesses|Total kBytes|CPULoad|Uptime|ReqPerSec|BytesPerSec|BytesPerReq|BusyWorkers|IdleWorkers)"

    # Worker utilization
    echo
    echo "2. Load Analysis:"
    busy_workers=$(printf '%s\n' "$metrics" | awk -F': *' '$1 == "BusyWorkers" {print $2; exit}')
    idle_workers=$(printf '%s\n' "$metrics" | awk -F': *' '$1 == "IdleWorkers" {print $2; exit}')

    if [[ "$busy_workers" =~ ^[0-9]+$ && "$idle_workers" =~ ^[0-9]+$ ]]; then
        total_workers=$(( 10#$busy_workers + 10#$idle_workers ))
        if (( total_workers > 0 )); then
            # Integer arithmetic in hundredths of a percent gives a stable
            # two-decimal value (e.g. 0.50, 33.33) without needing bc.
            hundredths=$(( 10#$busy_workers * 10000 / total_workers ))
            printf '  Worker utilization: %d.%02d%%\n' \
                "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
            echo "  Busy workers: $busy_workers"
            echo "  Idle workers: $idle_workers"
        fi
    else
        echo "  Worker counts not found in status output"
    fi

    echo
    echo "Apache metrics collection completed!"
}

get_apache_metrics "$@"

4. 健康检查和告警 (Health Check and Alerting)

4.1 健康检查脚本 (Health Check Script)

#!/bin/bash
# health-check.sh
#
# Run a series of availability and capacity checks against an Apache host.
#
# Usage: health-check.sh [host] [port]

# Run all health checks and print one line per result.
# Arguments:
#   $1 - host to probe (default: localhost)
#   $2 - TCP port      (default: 80)
# Outputs: report on stdout
# Returns: 0 when every availability check passes; 1 when the port is closed
#          or an HTTP check fails. Disk and memory warnings do not affect
#          the return status.
check_health() {
    local host=${1:-"localhost"}
    local port=${2:-80}
    local base_url="http://$host:$port"
    local failures=0
    local response status_response disk_usage memory_usage

    echo "=== Apache Health Check ==="
    echo "Host: $host:$port"
    echo

    # 1. Is the port accepting connections? Nothing else can work if not.
    echo "1. Port Check:"
    if nc -z "$host" "$port" 2>/dev/null; then
        echo "  ✓ Port $port is open"
    else
        echo "  ✗ Port $port is closed"
        return 1
    fi

    # 2. Does the front page answer with HTTP 200?
    echo
    echo "2. HTTP Response Check:"
    response=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "$base_url/" 2>/dev/null)
    if [[ "$response" == "200" ]]; then
        echo "  ✓ HTTP 200 OK"
    else
        echo "  ✗ HTTP $response"
        failures=$(( failures + 1 ))
    fi

    # 3. Is the mod_status page reachable?
    echo
    echo "3. Server Status Check:"
    status_response=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "$base_url/server-status?auto" 2>/dev/null)
    if [[ "$status_response" == "200" ]]; then
        echo "  ✓ Server status page accessible"
    else
        echo "  ✗ Server status page not accessible"
        failures=$(( failures + 1 ))
    fi

    # 4. Disk space of the filesystem holding /var/www. -P keeps each
    # filesystem on one line; the value is validated because df prints
    # nothing usable when the path does not exist.
    echo
    echo "4. Disk Space Check:"
    disk_usage=$(df -P /var/www 2>/dev/null | awk 'NR == 2 {sub(/%/, "", $5); print $5}')
    if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
        echo "  ⚠️  Unable to determine disk usage for /var/www"
    elif (( 10#$disk_usage < 90 )); then
        echo "  ✓ Disk usage: ${disk_usage}%"
    else
        echo "  ⚠️  Disk usage: ${disk_usage}% (Warning: over 90%)"
    fi

    # 5. Memory usage as used/total from the "Mem:" line of free
    echo
    echo "5. Memory Usage Check:"
    memory_usage=$(free | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3/$2 * 100.0}')
    if [[ ! "$memory_usage" =~ ^[0-9]+$ ]]; then
        echo "  ⚠️  Unable to determine memory usage"
    elif (( 10#$memory_usage < 80 )); then
        echo "  ✓ Memory usage: ${memory_usage}%"
    else
        echo "  ⚠️  Memory usage: ${memory_usage}% (Warning: over 80%)"
    fi

    echo
    echo "Health check completed!"

    (( failures == 0 ))
}

check_health "$@"

4.2 告警机制 (Alerting Mechanism)

#!/bin/bash
# alert-system.sh
#
# Minimal alerting example: checks that Apache is running, responsive and
# not logging an unusual number of errors, and records alerts.
#
# Optional environment variables:
#   ALERT_LOG         alert log path       (default: /var/log/apache-alerts.log)
#   APACHE_ERROR_LOG  Apache error log     (default: /var/log/apache2/error.log)

# Emit an alert on stdout and append it to the alert log.
# Arguments:
#   $1 - subject
#   $2 - message
#   $3 - severity (default: WARNING)
# Outputs: the alert on stdout; a warning on stderr if the log is unwritable
send_alert() {
    local subject="$1"
    local message="$2"
    local severity=${3:-"WARNING"}
    local alert_log=${ALERT_LOG:-/var/log/apache-alerts.log}

    echo "[$severity] $subject"
    echo "$message"

    # Send an e-mail alert (requires a configured mail server)
    # echo "$message" | mail -s "[$severity] $subject" admin@example.com

    # Send a Slack alert (requires a configured webhook)
    # curl -X POST -H 'Content-type: application/json' \
    #     --data "{\"text\":\"[$severity] $subject: $message\"}" \
    #     https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK

    # Record the alert; an unwritable log must not hide the alert itself
    echo "$(date): [$severity] $subject - $message" >> "$alert_log" \
        || echo "Warning: could not write to $alert_log" >&2
}

# Example monitor: process check, response time check, error log check.
# Arguments: none
# Returns: 1 if Apache is not running or does not answer, 0 otherwise
monitor_apache() {
    local error_log=${APACHE_ERROR_LOG:-/var/log/apache2/error.log}
    local response_time response_time_ms error_count

    # Is an Apache process running?
    if ! pgrep apache2 > /dev/null; then
        send_alert "Apache Service Down" "Apache process is not running" "CRITICAL"
        return 1
    fi

    # Response time. curl still prints a (tiny) time_total when the request
    # fails, so its exit status is checked before the value is trusted.
    if ! response_time=$(curl -s -o /dev/null --max-time 10 -w "%{time_total}" http://localhost/ 2>/dev/null); then
        send_alert "Apache Not Responding" "Request to http://localhost/ failed" "CRITICAL"
        return 1
    fi
    # time_total is in seconds with a fractional part; convert to whole ms
    response_time_ms=$(awk -v t="$response_time" 'BEGIN {printf "%d", t * 1000}')

    if (( response_time_ms > 5000 )); then
        send_alert "High Response Time" "Response time is ${response_time_ms}ms (threshold: 5000ms)" "WARNING"
    fi

    # Error log: count lines mentioning error/warn, case-insensitively
    error_count=$(tail -n 100 "$error_log" 2>/dev/null | grep -ciE 'error|warn')
    if (( error_count > 10 )); then
        send_alert "High Error Rate" "Found $error_count errors/warnings in last 100 log entries" "WARNING"
    fi

    return 0
}

monitor_apache

5. 第三方监控工具集成 (Third-party Monitoring Tool Integration)

5.1 Nagios监控插件 (Nagios Monitoring Plugin)

#!/bin/bash
# check_apache.sh - Nagios plugin for Apache monitoring
#
# Usage: check_apache.sh -H host -P port -w warning_ms -c critical_ms
# Exits with the standard Nagios plugin status codes.

# Plugin exit codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Defaults (thresholds are in milliseconds)
HOST="localhost"
PORT=80
WARNING_THRESHOLD=5000
CRITICAL_THRESHOLD=10000

# Parse command-line options
while getopts "H:P:w:c:" opt; do
    case $opt in
        H) HOST="$OPTARG" ;;
        P) PORT="$OPTARG" ;;
        w) WARNING_THRESHOLD="$OPTARG" ;;
        c) CRITICAL_THRESHOLD="$OPTARG" ;;
        *) echo "Usage: $0 -H host -P port -w warning_ms -c critical_ms"
           exit $STATE_UNKNOWN ;;
    esac
done

# Reject non-numeric values up front. Without this, the integer comparisons
# further down would error out and fall through to the OK branch.
for numeric_arg in "$PORT" "$WARNING_THRESHOLD" "$CRITICAL_THRESHOLD"; do
    if [[ ! "$numeric_arg" =~ ^[0-9]+$ ]]; then
        echo "UNKNOWN - port and thresholds must be non-negative integers"
        exit $STATE_UNKNOWN
    fi
done

# Probe Apache and exit with the matching Nagios state.
# Globals: HOST, PORT, WARNING_THRESHOLD, CRITICAL_THRESHOLD (read)
# Outputs: one status line on stdout, with performance data after "|"
check_apache() {
    local timeout_s curl_out response_code response_time perfdata

    # Port check
    if ! nc -z "$HOST" "$PORT" 2>/dev/null; then
        echo "CRITICAL - Apache is not listening on $HOST:$PORT"
        exit $STATE_CRITICAL
    fi

    # Let curl measure the request itself (no process start-up overhead and
    # no dependency on GNU date), and stop waiting shortly after the
    # critical threshold so the plugin cannot hang.
    timeout_s=$(( (10#$CRITICAL_THRESHOLD + 999) / 1000 + 1 ))
    curl_out=$(curl -s -o /dev/null --max-time "$timeout_s" \
        -w "%{http_code} %{time_total}" "http://$HOST:$PORT/" 2>/dev/null)
    response_code=${curl_out%% *}
    # time_total is in seconds with a fractional part; convert to whole ms
    response_time=$(awk -v t="${curl_out#* }" 'BEGIN {printf "%d", t * 1000}')
    perfdata="response_time=${response_time}ms;${WARNING_THRESHOLD};${CRITICAL_THRESHOLD}"

    # HTTP status check (curl reports 000 when no response was received)
    if [ "$response_code" != "200" ]; then
        echo "CRITICAL - HTTP $response_code from Apache"
        exit $STATE_CRITICAL
    fi

    # Response time check
    if [ "$response_time" -gt "$CRITICAL_THRESHOLD" ]; then
        echo "CRITICAL - Response time: ${response_time}ms (threshold: ${CRITICAL_THRESHOLD}ms) | $perfdata"
        exit $STATE_CRITICAL
    elif [ "$response_time" -gt "$WARNING_THRESHOLD" ]; then
        echo "WARNING - Response time: ${response_time}ms (threshold: ${WARNING_THRESHOLD}ms) | $perfdata"
        exit $STATE_WARNING
    else
        echo "OK - Response time: ${response_time}ms | $perfdata"
        exit $STATE_OK
    fi
}

check_apache

5.2 Prometheus监控集成 (Prometheus Monitoring Integration)

# Load the required modules
# Stock Apache httpd has no built-in Prometheus handler, so the metrics are
# produced by an external exporter process (for example apache_exporter
# scraping /server-status?auto) and reverse-proxied under /metrics.
LoadModule proxy_module modules/mod_proxy.so
LoadModule proxy_http_module modules/mod_proxy_http.so

# Prometheus metrics endpoint
<Location "/metrics">
    # NOTE(review): assumes the exporter listens on 127.0.0.1:9117; adjust
    # to match the exporter's actual listen address.
    ProxyPass "http://127.0.0.1:9117/metrics"

    # Access control: allow localhost OR the internal subnet. RequireAny is
    # required here; RequireAll would demand that one client address match
    # both rules at the same time, which denies every request.
    <RequireAny>
        Require ip 127.0.0.1
        Require ip 192.168.1.0/24
    </RequireAny>
</Location>

# Access log with request duration (%D, microseconds) for custom metrics
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D" prometheus_log
CustomLog /var/log/apache2/prometheus.log prometheus_log

6. 故障诊断工具 (Troubleshooting Tools)

6.1 调试日志配置 (Debug Log Configuration)

# Enable verbose logging for the whole server (noisy; use temporarily).
# Only ONE of the LogLevel lines below should be active at a time: when
# several are given in the same context, later ones override earlier ones.
LogLevel debug

# Alternative: keep the server at info and trace only mod_rewrite
#LogLevel info rewrite:trace3

# Alternative: server-wide debug plus the most detailed mod_rewrite trace
#LogLevel debug rewrite:trace8

# Custom debug log format. %{Host}i logs the request's Host header and
# %{UNIQUE_ID}e the per-request identifier set by mod_unique_id (logged as
# "-" unless that module is loaded).
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D %{Host}i %{UNIQUE_ID}e" debug_combined
CustomLog /var/log/apache2/debug.log debug_combined

6.2 故障诊断脚本 (Troubleshooting Script)

#!/bin/bash
# diagnostic.sh
#
# Print a one-shot diagnostic report for a local Apache installation.

# Gather and print the diagnostic report.
# Arguments: none
# Outputs: report on stdout
run_diagnostics() {
    local config_output

    echo "=== Apache Diagnostic Report ==="
    echo "Generated at: $(date)"
    echo

    # 1. Basic information
    echo "1. System Information:"
    echo "  Hostname: $(hostname)"
    echo "  OS: $(uname -s)"
    echo "  Apache Version: $(apache2 -v | head -1)"

    # 2. Configuration check. The test is run once and its output captured,
    # so the message shown is from the same run that decided the result.
    echo
    echo "2. Configuration Check:"
    if config_output=$(apache2ctl configtest 2>&1); then
        echo "  ✓ Configuration syntax OK"
    else
        echo "  ✗ Configuration syntax error"
        printf '%s\n' "$config_output"
    fi

    # 3. Loaded modules (first 20 lines)
    echo
    echo "3. Loaded Modules:"
    apache2ctl -M | head -20

    # 4. Virtual host summary (first 10 lines)
    echo
    echo "4. Virtual Host Configuration:"
    apache2ctl -S 2>/dev/null | head -10

    # 5. Apache processes, selected by command name so that unrelated
    # processes mentioning "apache2" on their command line are not listed.
    echo
    echo "5. Process Information:"
    ps -C apache2 -o user,pid,pcpu,pmem,vsz,rss,stat,start,time,args

    # 6. Listening sockets owned by Apache; ss is used where netstat is
    # not installed.
    echo
    echo "6. Port Listening:"
    if command -v netstat > /dev/null 2>&1; then
        netstat -tlnp 2>/dev/null | grep apache2
    else
        ss -tlnp 2>/dev/null | grep apache2
    fi

    # 7. Most recent error log entries
    echo
    echo "7. Recent Errors:"
    tail -n 10 /var/log/apache2/error.log

    echo
    echo "Diagnostic report completed!"
}

run_diagnostics

小结 (Summary)

通过本文学习，你应该掌握：

Apache内置监控模块的配置和使用
日志监控和分析技术
性能指标收集和监控方法
健康检查和告警机制
第三方监控工具集成
故障诊断工具和调试技术

监控和诊断是运维工作的重要组成部分，建立完善的监控体系能够帮助及时发现和解决问题，确保服务的稳定性和可用性。在下一篇文章中，我们将详细介绍Apache模块开发与扩展技术。

监控与诊断