Tomcat 故障排除与调优

Troubleshooting and Tuning

概述

Tomcat故障排除与调优是运维工程师的核心技能。本文详细介绍常见问题的诊断方法、性能瓶颈分析、故障处理流程和调优策略,帮助你快速定位和解决Tomcat运行中的各种问题。

1. 常见启动问题

1.1 端口占用检查

#!/bin/bash
# check-ports.sh
#
# Reports, for each port Tomcat needs, whether a TCP listener already holds it
# and which process(es) own that listener.

TOMCAT_PORTS=(8080 8443 8009 8005)

# Prints the listening TCP sockets in numeric form.
# Only TCP is listed: a UDP socket on the same port number cannot prevent
# Tomcat from binding its TCP connectors.
# netstat is preferred; ss is the fallback for hosts without net-tools.
# Returns: 1 when neither tool is installed.
list_listening_sockets() {
    if command -v netstat > /dev/null 2>&1; then
        netstat -tln
    elif command -v ss > /dev/null 2>&1; then
        ss -tln
    else
        echo "❌ 未找到 netstat 或 ss 命令" >&2
        return 1
    fi
}

# Checks every port in TOMCAT_PORTS and prints its status on stdout.
# Globals: TOMCAT_PORTS (read)
# Returns: 1 when the socket list cannot be obtained.
check_ports() {
    local sockets port pids pid

    # One snapshot serves all ports instead of re-running the tool per port.
    sockets=$(list_listening_sockets) || return 1

    for port in "${TOMCAT_PORTS[@]}"; do
        echo "检查端口 $port:"

        # The leading colon and trailing space anchor the match to the whole
        # port number, so 8080 does not match 18080 or 80801.
        if grep -q ":$port " <<< "$sockets"; then
            echo "  ❌ 端口 $port 已被占用"

            # Restrict lsof to listeners; without -sTCP:LISTEN it would also
            # return clients that are merely connected to this port.
            pids=$(lsof -t -iTCP:"$port" -sTCP:LISTEN 2> /dev/null)
            if [ -z "$pids" ]; then
                echo "  无法确定占用进程(可能需要root权限)"
                continue
            fi

            # lsof may report several PIDs, one per line.
            while IFS= read -r pid; do
                echo "  占用进程: $(ps -p "$pid" -o cmd --no-headers)"
                # Suggest a graceful TERM first; SIGKILL is the last resort.
                echo "  解决方案: kill $pid (无响应时再使用 kill -9 $pid)"
            done <<< "$pids"
        else
            echo "  ✅ 端口 $port 可用"
        fi
    done
}

check_ports

1.2 JVM问题诊断

#!/bin/bash
# diagnose-jvm.sh

# Tomcat installation root. Export TOMCAT_HOME before running the script to
# diagnose an installation that lives somewhere else.
TOMCAT_HOME="${TOMCAT_HOME:-/opt/tomcat9}"
# Log file scanned by check_errors for startup problems.
CATALINA_OUT="$TOMCAT_HOME/logs/catalina.out"

# Verifies that a Java runtime of version 8 or newer is available on PATH.
# Outputs: the detected version string and a verdict, on stdout.
# Returns: 0 when Java is present and new enough, 1 otherwise.
check_java() {
    local java_version major_version

    if ! command -v java &> /dev/null; then
        echo "❌ Java未安装"
        return 1
    fi

    # `java -version` writes to stderr; the version is the first quoted token.
    java_version=$(java -version 2>&1 | head -n1 | cut -d'"' -f2)
    echo "Java版本: $java_version"

    # Java 8 and older report "1.<major>.x" (e.g. 1.8.0_292), while Java 9+
    # report "<major>[.x.y]" or "<major>-ea". Taking the first dot-separated
    # field unconditionally would misread Java 8 as version 1.
    major_version=${java_version%%[.-]*}
    if [ "$major_version" = "1" ]; then
        major_version=$(echo "$java_version" | cut -d'.' -f2)
    fi

    # A non-numeric value would make the `-lt` test below fail with an error.
    case "$major_version" in
        '' | *[!0-9]*)
            echo "❌ 无法识别Java版本"
            return 1
            ;;
    esac

    if [ "$major_version" -lt 8 ]; then
        echo "❌ Java版本过低"
        return 1
    fi

    echo "✅ Java版本兼容"
}

# Prints total system memory and the maximum heap (-Xmx) found in setenv.sh.
# Globals: TOMCAT_HOME (read)
# Outputs: two status lines on stdout; the heap line is printed only when
#          $TOMCAT_HOME/bin/setenv.sh exists.
check_memory() {
    local total_memory heap_size

    # Second line of `free -m` output, second column.
    total_memory=$(free -m | awk 'NR==2{print $2}')
    echo "系统内存: ${total_memory}MB"

    if [ -f "$TOMCAT_HOME/bin/setenv.sh" ]; then
        # Accept k/m/g size suffixes in either case (e.g. -Xmx2G, -Xmx2048M);
        # a lower-case-only pattern silently misses such settings.
        heap_size=$(grep -o 'Xmx[0-9][0-9]*[kKmMgG]' "$TOMCAT_HOME/bin/setenv.sh" | head -1)
        echo "堆内存设置: ${heap_size:-未设置}"
    fi
}

# Scans the tail of catalina.out for well-known startup errors and for the
# "Server startup in" success marker.
# Globals: CATALINA_OUT (read)
# Outputs: one line per error type found (plus up to 5 lines of context) and
#          a final success/failure verdict, on stdout.
# Returns: 1 when the log file does not exist.
check_errors() {
    local recent_log error
    local -a errors=("OutOfMemoryError" "ClassNotFoundException" "BindException")

    if [ ! -f "$CATALINA_OUT" ]; then
        echo "❌ 找不到catalina.out"
        return 1
    fi

    # Read the log once so every check below sees the same snapshot, even if
    # Tomcat keeps writing while the script runs.
    recent_log=$(tail -1000 "$CATALINA_OUT")

    for error in "${errors[@]}"; do
        # -F: the names are literal strings, not regular expressions.
        if grep -qF -- "$error" <<< "$recent_log"; then
            echo "❌ 发现错误: $error"
            grep -F -A 2 -- "$error" <<< "$recent_log" | tail -5
        fi
    done

    # Only the last 100 lines are searched, so a server that has logged more
    # than that since it started is reported as failed.
    if tail -100 <<< "$recent_log" | grep -q "Server startup in"; then
        echo "✅ 服务器启动成功"
    else
        echo "❌ 服务器启动失败"
    fi
}

# Run the three diagnostics in order; each one runs regardless of whether the
# previous one reported a problem.
for diagnostic in check_java check_memory check_errors; do
    "$diagnostic"
done

2. 性能问题诊断

2.1 性能分析工具

#!/bin/bash
# performance-analyzer.sh

# PID of the running Tomcat JVM. The main class is compared against Tomcat's
# launcher class name exactly, so other JVMs whose class name merely contains
# "Bootstrap" are ignored, and `exit` keeps only the first match so the value
# is always a single PID (or empty when Tomcat is not running).
TOMCAT_PID=$(jps -l | awk '$2 == "org.apache.catalina.startup.Bootstrap" {print $1; exit}')

# Samples the Tomcat JVM's CPU usage with top and prints the average.
# Globals:   TOMCAT_PID (read)
# Arguments: $1 - number of samples to take (default 10)
#            $2 - seconds to wait after each sample (default 2)
# Outputs:   the average and, above 80%, a warning, on stdout.
# Returns:   1 when TOMCAT_PID is empty or no sample could be read.
analyze_cpu() {
    local samples=${1:-10}
    local interval=${2:-2}
    local readings="" cpu avg i

    echo "=== CPU分析 ==="

    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    for ((i = 1; i <= samples; i++)); do
        # Pick the row by PID instead of by the command name "java", and force
        # the C locale so the decimal separator is always a dot.
        # Field 9 is read as %CPU, as in the original default top layout.
        cpu=$(LC_ALL=C top -p "$TOMCAT_PID" -n 1 -b \
            | awk -v pid="$TOMCAT_PID" '$1 == pid {print $9}')
        # An empty read (e.g. the process just exited) is skipped rather than
        # turned into a malformed arithmetic expression.
        if [ -n "$cpu" ]; then
            readings+="$cpu"$'\n'
        fi
        sleep "$interval"
    done

    if [ -z "$readings" ]; then
        echo "❌ 无法获取CPU使用率"
        return 1
    fi

    # awk handles the floating-point math, so bc is not required.
    avg=$(printf '%s' "$readings" | awk '{ sum += $1 } END { printf "%.1f", sum / NR }')
    echo "平均CPU使用率: ${avg}%"

    if awk -v avg="$avg" 'BEGIN { exit !(avg > 80) }'; then
        echo "⚠️  CPU使用率过高"
    fi
}

# Prints heap utilisation and GC counts of the Tomcat JVM from `jstat -gc`.
# Globals: TOMCAT_PID (read)
# Outputs: usage percentage, GC counts and threshold warnings, on stdout.
# Returns: 1 when TOMCAT_PID is empty.
analyze_memory() {
    echo "=== 内存分析 ==="

    # Same guard as analyze_cpu: without a PID jstat only prints usage errors.
    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    # Assumed column layout of the jstat data row (verify against your JDK):
    #   $1 S0C  $2 S1C  $3 S0U  $4 S1U  $5 EC  $6 EU  $7 OC  $8 OU
    #   $13 YGC  $15 FGC
    jstat -gc "$TOMCAT_PID" | tail -1 | awk '{
        total_heap = ($1 + $2 + $5 + $7);
        used_heap = ($3 + $4 + $6 + $8);

        # A zero capacity would make the division below abort awk.
        if (total_heap <= 0) {
            print "❌ 无法读取堆内存数据";
            exit;
        }
        heap_usage = used_heap * 100 / total_heap;

        print "堆内存使用率: " heap_usage "%";
        print "Young GC次数: " $13;
        print "Full GC次数: " $15;

        if (heap_usage > 85) print "⚠️  堆内存使用率过高";
        if ($15 > 10) print "⚠️  Full GC次数过多";
    }'
}

# Counts total and BLOCKED threads in a jstack dump of the Tomcat JVM and, when
# many are blocked, prints any deadlock report from `jcmd Thread.print`.
# Globals: TOMCAT_PID (read)
# Returns: 1 when TOMCAT_PID is empty or the thread dump cannot be taken.
analyze_threads() {
    local thread_dump total_threads blocked_threads

    echo "=== 线程分析 ==="

    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    if ! thread_dump=$(jstack "$TOMCAT_PID"); then
        echo "❌ 无法获取线程转储"
        return 1
    fi

    # Every thread entry starts with its quoted name in column one.
    total_threads=$(grep -c '^"' <<< "$thread_dump")
    # Count only thread-state lines; a bare "BLOCKED" match would also count
    # thread names or stack frames that happen to contain the word.
    blocked_threads=$(grep -cF 'java.lang.Thread.State: BLOCKED' <<< "$thread_dump")

    echo "总线程数: $total_threads"
    echo "阻塞线程: $blocked_threads"

    if [ "$blocked_threads" -gt 10 ]; then
        echo "⚠️  阻塞线程过多,检查死锁"
        if ! jcmd "$TOMCAT_PID" Thread.print | grep -A 5 "deadlock"; then
            echo "未检测到死锁"
        fi
    fi
}

# Command-line dispatch: "cpu", "memory" or "threads" runs that one analysis;
# any other argument (or none) runs all three, separated by blank lines.
if [[ "$1" == "cpu" ]]; then
    analyze_cpu
elif [[ "$1" == "memory" ]]; then
    analyze_memory
elif [[ "$1" == "threads" ]]; then
    analyze_threads
else
    analyze_cpu
    echo
    analyze_memory
    echo
    analyze_threads
fi

3. 内存问题处理

3.1 内存泄漏检测

#!/bin/bash
# memory-leak-detector.sh

# PID of the running Tomcat JVM. The main class is compared against Tomcat's
# launcher class name exactly and only the first match is kept, so the value
# is a single PID (or empty when Tomcat is not running).
TOMCAT_PID=$(jps -l | awk '$2 == "org.apache.catalina.startup.Bootstrap" {print $1; exit}')
# Directory that receives heap dumps; export HEAP_DUMP_DIR to override it.
HEAP_DUMP_DIR="${HEAP_DUMP_DIR:-/opt/tomcat/heapdumps}"

# Prints the JVM's used heap (jstat -gc columns 3, 4, 6 and 8 summed) as a
# whole number, or nothing when jstat yields no numeric data row.
# The sum is forced to an integer because jstat reports decimals and awk's
# default output format can emit fractions or exponent notation, neither of
# which bash arithmetic accepts.
_used_heap_kb() {
    jstat -gc "$TOMCAT_PID" | tail -1 \
        | awk '$3 ~ /^[0-9]/ { printf "%d\n", $3 + $4 + $6 + $8 }'
}

# Measures heap growth over a time window and triggers a heap dump when the
# growth exceeds 10240 KB.
# Globals:   TOMCAT_PID, HEAP_DUMP_DIR (read)
# Arguments: $1 - seconds between the two measurements (default 60)
# Returns:   1 when TOMCAT_PID is empty or the heap figures cannot be read.
detect_leak() {
    local window=${1:-60}
    local initial_memory final_memory growth

    echo "=== 内存泄漏检测 ==="

    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    mkdir -p "$HEAP_DUMP_DIR"

    echo "监控内存增长(${window}秒)..."

    initial_memory=$(_used_heap_kb)
    sleep "$window"
    final_memory=$(_used_heap_kb)

    if [ -z "$initial_memory" ] || [ -z "$final_memory" ]; then
        echo "❌ 无法读取JVM内存数据"
        return 1
    fi

    growth=$((final_memory - initial_memory))

    echo "内存增长: ${growth}KB"

    if [ "$growth" -gt 10240 ]; then
        echo "⚠️  检测到内存快速增长"
        generate_heap_dump
    fi
}

# Writes a timestamped heap dump of the Tomcat JVM into HEAP_DUMP_DIR and then
# prints the first 10 lines of the class histogram.
# Globals: TOMCAT_PID, HEAP_DUMP_DIR (read)
# Returns: 1 when TOMCAT_PID is empty or the dump could not be produced.
generate_heap_dump() {
    local dump_file

    echo "生成堆转储..."

    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    # The "dump" sub-command calls this function directly, so it cannot rely
    # on detect_leak having created the directory.
    if ! mkdir -p "$HEAP_DUMP_DIR"; then
        echo "❌ 堆转储生成失败"
        return 1
    fi

    dump_file="$HEAP_DUMP_DIR/heap_$(date +%Y%m%d_%H%M%S).hprof"

    # Besides jcmd's exit status, require a non-empty file, in case jcmd
    # reports a failure only in its output.
    if jcmd "$TOMCAT_PID" GC.dump_heap "$dump_file" && [ -s "$dump_file" ]; then
        echo "堆转储已生成: $dump_file"

        echo "类实例统计(前10):"
        jcmd "$TOMCAT_PID" GC.class_histogram | head -10
    else
        echo "❌ 堆转储生成失败"
        return 1
    fi
}

# Prints the Tomcat JVM's heap usage at a fixed interval until jstat fails.
# Globals:   TOMCAT_PID (read)
# Arguments: $1 - seconds between samples (default 30)
# Outputs:   one "HH:MM:SS - usage" line per sample, on stdout.
# Returns:   1 when TOMCAT_PID is empty or once jstat can no longer read the
#            JVM; otherwise it runs until interrupted.
monitor_memory() {
    local interval=${1:-30}
    local timestamp gc_stats

    echo "开始内存监控..."

    if [ -z "$TOMCAT_PID" ]; then
        echo "❌ Tomcat进程未找到"
        return 1
    fi

    while true; do
        # Stop instead of looping forever once jstat fails.
        if ! gc_stats=$(jstat -gc "$TOMCAT_PID"); then
            echo "❌ 无法读取JVM内存数据,监控结束"
            return 1
        fi

        timestamp=$(date '+%H:%M:%S')
        tail -1 <<< "$gc_stats" | awk -v ts="$timestamp" '{
            used = ($3 + $4 + $6 + $8);
            total = ($1 + $2 + $5 + $7);
            # Skip the sample rather than divide by zero.
            if (total <= 0) next;
            usage = used * 100 / total;
            print ts " - 内存使用: " usage "% (" used "KB/" total "KB)";
        }'
        sleep "$interval"
    done
}

# Command-line dispatch: "dump" and "monitor" select their functions; "detect"
# and every other argument (or none) run the leak detection.
if [[ "$1" == "dump" ]]; then
    generate_heap_dump
elif [[ "$1" == "monitor" ]]; then
    monitor_memory
else
    detect_leak
fi

4. 网络连接问题

4.1 连接诊断工具

#!/bin/bash
# connection-diagnostics.sh

# HTTP connector port to probe; export TOMCAT_PORT to test a different one.
TOMCAT_PORT="${TOMCAT_PORT:-8080}"

# Checks that the Tomcat port accepts TCP connections and reports the HTTP
# status code and total response time of a request to "/".
# Globals: TOMCAT_PORT (read)
# Returns: 1 when the port cannot be reached within 5 seconds.
test_connectivity() {
    local probe response_code response_time

    echo "=== 连接测试 ==="

    # Bash's /dev/tcp pseudo-device opens a TCP connection; the redirection
    # fails when nothing is listening. stderr is dropped because the verdict
    # below already reports the failure.
    if timeout 5 bash -c "</dev/tcp/localhost/$TOMCAT_PORT" 2> /dev/null; then
        echo "✅ 端口可访问"
    else
        echo "❌ 端口不可访问"
        return 1
    fi

    # A single request yields both figures, so the status code and the timing
    # describe the same response; --max-time keeps a hung server from blocking
    # the script indefinitely.
    probe=$(curl -s -o /dev/null --max-time 10 \
        -w "%{http_code} %{time_total}" "http://localhost:$TOMCAT_PORT/")
    response_code=${probe%% *}
    response_time=${probe##* }

    echo "HTTP响应码: $response_code"
    echo "响应时间: ${response_time}秒"
}

# Summarises the sockets whose local address uses the Tomcat port.
# Globals: TOMCAT_PORT (read)
# Outputs: a per-state histogram followed by total, ESTABLISHED and TIME_WAIT
#          counts (and a warning above 1000 TIME_WAIT sockets), on stdout.
analyze_connections() {
    local snapshot total established time_wait

    echo "=== 连接分析 ==="

    # Take one netstat snapshot so all figures describe the same moment, and
    # match the port on the local-address column ($4) only. Matching the whole
    # line would also count outbound connections from this host to a remote
    # port with the same number.
    snapshot=$(netstat -an | awk -v port="$TOMCAT_PORT" '$4 ~ (":" port "$")')

    echo "连接状态统计:"
    if [ -n "$snapshot" ]; then
        awk '{print $6}' <<< "$snapshot" | sort | uniq -c
    fi

    # "+ 0" forces a numeric 0 for counters that were never incremented.
    read -r total established time_wait < <(
        awk '
            NF { total++ }
            $6 == "ESTABLISHED" { est++ }
            $6 == "TIME_WAIT" { tw++ }
            END { print total + 0, est + 0, tw + 0 }
        ' <<< "$snapshot"
    )

    echo "总连接数: $total"
    echo "已建立: $established"
    echo "TIME_WAIT: $time_wait"

    if [ "$time_wait" -gt 1000 ]; then
        echo "⚠️  TIME_WAIT连接过多"
    fi
}

# Runs a short ApacheBench load test (1000 requests, 50 concurrent) against the
# Tomcat root URL and prints only the throughput and failure lines.
# Globals: TOMCAT_PORT (read)
# Returns: 1 when `ab` is not installed; otherwise the status of the
#          ab | grep pipeline.
load_test() {
    echo "=== 负载测试 ==="

    if command -v ab &> /dev/null; then
        echo "执行负载测试..."
        ab -n 1000 -c 50 "http://localhost:$TOMCAT_PORT/" \
            | grep -E "(Requests per second|Failed requests)"
        # Bare return passes on the pipeline's status, as falling off the end
        # of the function would.
        return
    fi

    echo "需要安装apache2-utils"
    return 1
}

# Command-line dispatch: "test", "analyze" or "load" runs that one step; any
# other argument (or none) runs the connectivity test followed by the
# connection analysis. The load test only runs when asked for explicitly.
if [[ "$1" == "test" ]]; then
    test_connectivity
elif [[ "$1" == "analyze" ]]; then
    analyze_connections
elif [[ "$1" == "load" ]]; then
    load_test
else
    test_connectivity
    echo
    analyze_connections
fi

5. 自动调优工具

5.1 系统调优脚本

#!/bin/bash
# auto-tuning.sh

# Tomcat installation root whose bin/setenv.sh and conf/server.xml are tuned.
# Export TOMCAT_HOME before running to target a different installation.
TOMCAT_HOME="${TOMCAT_HOME:-/opt/tomcat9}"

# Prints the CPU core count and total memory, first as two human-readable
# lines and then as a machine-readable "cores,memory_mb" line.
# Globals: cpu_cores, total_memory (written)
system_analysis() {
    cpu_cores=$(nproc)
    # Second line of `free -m` output, second column.
    total_memory=$(free -m | awk 'NR == 2 { print $2 }')

    printf 'CPU核心: %s\n' "$cpu_cores"
    printf '总内存: %sMB\n' "$total_memory"

    printf '%s,%s\n' "$cpu_cores" "$total_memory"
}

# Generates $TOMCAT_HOME/bin/setenv.sh with heap, GC and GC-logging options
# sized for this host.
# Globals:   TOMCAT_HOME (read)
# Arguments: $1 - CPU core count (accepted for call compatibility; not used)
#            $2 - total system memory in MB
# Returns:   1 on a non-numeric memory value or when the file cannot be
#            backed up or written.
tune_jvm() {
    local total_memory=$2
    local setenv_file="$TOMCAT_HOME/bin/setenv.sh"
    local heap_size gc_params gc_log_params java_version java_major

    echo "=== JVM调优 ==="

    case "$total_memory" in
        '' | *[!0-9]*)
            echo "❌ 无效的内存参数: $total_memory" >&2
            return 1
            ;;
    esac

    # Heap gets 70% of system memory.
    heap_size=$((total_memory * 70 / 100))

    # G1 for heaps above 4 GB, the parallel collector otherwise.
    if [ "$heap_size" -gt 4096 ]; then
        gc_params="-XX:+UseG1GC -XX:MaxGCPauseMillis=200"
    else
        gc_params="-XX:+UseParallelGC"
    fi

    # JDK 9 replaced the Print* GC flags with unified logging (-Xlog), and a
    # JDK 9+ JVM refuses to start with -XX:+PrintGCTimeStamps. Pick the flag
    # set from the installed Java; fall back to the legacy flags when the
    # version cannot be determined.
    java_version=$(java -version 2>&1 | head -n1 | cut -d'"' -f2)
    java_major=${java_version%%[.-]*}
    if [ "$java_major" = "1" ]; then
        java_major=$(echo "$java_version" | cut -d'.' -f2)
    fi
    case "$java_major" in
        '' | *[!0-9]*) java_major=8 ;;
    esac

    # \$CATALINA_HOME stays literal so it is resolved when setenv.sh runs.
    if [ "$java_major" -ge 9 ]; then
        gc_log_params="-Xlog:gc*:file=\$CATALINA_HOME/logs/gc.log:time,uptime,level,tags"
    else
        gc_log_params="-Xloggc:\$CATALINA_HOME/logs/gc.log -XX:+PrintGCDetails -XX:+PrintGCTimeStamps"
    fi

    # Keep the previous file, mirroring what tune_connector does for server.xml.
    if [ -f "$setenv_file" ]; then
        cp "$setenv_file" "$setenv_file.backup" || return 1
    fi

    # Heap and GC options go into CATALINA_OPTS rather than JAVA_OPTS:
    # JAVA_OPTS is also applied to the JVM that stops Tomcat, which must not
    # try to reserve the same large heap.
    cat > "$setenv_file" << EOF || return 1
#!/bin/bash
# Auto-generated JVM parameters

export CATALINA_OPTS="-Xms${heap_size}m -Xmx${heap_size}m $gc_params"
export CATALINA_OPTS="\$CATALINA_OPTS -Dfile.encoding=UTF-8 -Djava.security.egd=file:/dev/./urandom"

# GC日志
export CATALINA_OPTS="\$CATALINA_OPTS $gc_log_params"
EOF

    chmod +x "$setenv_file"
    echo "JVM参数已优化: 堆内存=${heap_size}MB"
}

# Rewrites the maxThreads and acceptCount attributes in server.xml based on
# the CPU core count (maxThreads = cores * 50, acceptCount = maxThreads / 2).
# Globals:   TOMCAT_HOME (read)
# Arguments: $1 - CPU core count
# Returns:   1 on a non-numeric core count or when the backup fails.
tune_connector() {
    local cpu_cores=$1
    local server_xml="$TOMCAT_HOME/conf/server.xml"
    local backup_file="$server_xml.backup"
    local max_threads accept_count

    echo "=== 连接器调优 ==="

    case "$cpu_cores" in
        '' | *[!0-9]*)
            echo "❌ 无效的CPU核心数: $cpu_cores" >&2
            return 1
            ;;
    esac

    max_threads=$((cpu_cores * 50))
    accept_count=$((max_threads / 2))

    # Never overwrite an earlier backup: the first one holds the untouched
    # file, so later runs get a timestamped name instead.
    if [ -e "$backup_file" ]; then
        backup_file="$server_xml.backup.$(date +%Y%m%d_%H%M%S)"
    fi
    # Do not edit the live file unless a backup exists.
    if ! cp "$server_xml" "$backup_file"; then
        echo "❌ 备份server.xml失败,已取消修改" >&2
        return 1
    fi

    # sed can only rewrite attributes that are already present; warn when
    # there is nothing to rewrite instead of silently reporting success.
    if ! grep -q 'maxThreads="[0-9]*"' "$server_xml"; then
        echo "⚠️  server.xml中未找到maxThreads属性,请手动添加"
    fi

    # NOTE: the substitution is purely textual, so matching attributes inside
    # XML comments are rewritten as well.
    sed -i "s/maxThreads=\"[0-9]*\"/maxThreads=\"$max_threads\"/" "$server_xml"
    sed -i "s/acceptCount=\"[0-9]*\"/acceptCount=\"$accept_count\"/" "$server_xml"

    echo "连接器已优化: maxThreads=$max_threads, acceptCount=$accept_count"
}

# Raises the open-file limit and applies TCP kernel settings.
# Arguments: $1 - limits file to extend (default /etc/security/limits.conf)
#            $2 - sysctl file to extend and load (default /etc/sysctl.conf)
# Outputs:   the output of `sysctl -p` and a completion message, on stdout.
tune_system() {
    local limits_file=${1:-/etc/security/limits.conf}
    local sysctl_file=${2:-/etc/sysctl.conf}
    local line

    echo "=== 系统参数调优 ==="

    # Append each setting only when that exact line is absent, so running the
    # script repeatedly does not pile up duplicate entries.
    for line in "* soft nofile 65536" "* hard nofile 65536"; do
        grep -qxF -- "$line" "$limits_file" 2> /dev/null \
            || echo "$line" >> "$limits_file"
    done

    for line in \
        "net.ipv4.tcp_tw_reuse = 1" \
        "net.ipv4.tcp_fin_timeout = 30" \
        "net.core.somaxconn = 32768"; do
        grep -qxF -- "$line" "$sysctl_file" 2> /dev/null \
            || echo "$line" >> "$sysctl_file"
    done

    sysctl -p "$sysctl_file"
    echo "系统参数已优化"
}

# Prints a tuning report: system information, the OPTS exports found in
# setenv.sh, the connector thread settings from server.xml, and next steps.
# Globals: TOMCAT_HOME (read)
generate_report() {
    local setenv_path="$TOMCAT_HOME/bin/setenv.sh"
    local server_xml_path="$TOMCAT_HOME/conf/server.xml"

    printf '%s\n' "=== 调优报告 ===" "系统信息:"
    system_analysis

    printf '\n%s\n' "当前JVM参数:"
    # setenv.sh is optional; show its exports only when the file exists.
    [ ! -f "$setenv_path" ] || grep "export.*OPTS" "$setenv_path"

    printf '\n%s\n' "连接器配置:"
    grep -E "(maxThreads|acceptCount)" "$server_xml_path"

    printf '\n%s\n' "调优建议:"
    printf '%s\n' \
        "1. 重启Tomcat使配置生效" \
        "2. 监控应用性能" \
        "3. 根据实际负载调整参数"
}

# Entry point of the auto-tuning script: requires root, gathers the system
# figures and dispatches to the tuning step named by the first argument.
# Arguments: $1 - "jvm", "connector", "system" or "report"; anything else
#                 (or nothing) runs all four steps in that order.
# Exits:     1 when not run as root.
main() {
    local analysis cpu_cores total_memory

    if [ "$EUID" -ne 0 ]; then
        echo "需要root权限运行"
        exit 1
    fi

    # system_analysis prints human-readable lines before its final
    # "cores,memory_mb" line. Keep only that last line: `cut` passes lines
    # without a delimiter through unchanged, so parsing the whole output would
    # put the descriptive lines into both values and break the arithmetic in
    # the tuning functions.
    analysis=$(system_analysis | tail -n 1)
    cpu_cores=${analysis%%,*}
    total_memory=${analysis##*,}

    case "$1" in
        "jvm") tune_jvm "$cpu_cores" "$total_memory" ;;
        "connector") tune_connector "$cpu_cores" ;;
        "system") tune_system ;;
        "report") generate_report ;;
        *)
            tune_jvm "$cpu_cores" "$total_memory"
            tune_connector "$cpu_cores"
            tune_system
            generate_report
            ;;
    esac
}

# Run main with all command-line arguments passed through unchanged.
main "$@"

6. 故障处理检查清单

6.1 故障处理流程

#!/bin/bash
# troubleshoot-checklist.sh
#
# Prints a static troubleshooting checklist, a quick-diagnosis table and the
# recommended monitoring thresholds. It performs no checks itself.

printf '%s\n' "=== Tomcat故障处理检查清单 ==="

checklist=(
    "检查Tomcat进程是否运行"
    "验证端口是否可访问"
    "检查Java版本兼容性"
    "分析启动日志错误"
    "监控内存使用情况"
    "检查磁盘空间"
    "验证配置文件语法"
    "测试应用响应"
    "检查网络连接"
    "分析性能指标"
)

# Number the items starting from 1.
step=1
for item in "${checklist[@]}"; do
    printf '%d. %s\n' "$step" "$item"
    step=$((step + 1))
done

# Quoted delimiter: the text is emitted literally, with no expansion.
cat << 'EOF'

常见问题快速诊断:
- 启动失败: 检查端口占用和Java环境
- 内存不足: 调整堆内存大小和GC参数
- 响应缓慢: 分析线程池和数据库连接
- 连接超时: 检查网络和防火墙设置
- 应用无响应: 检查死锁和资源竞争

监控指标建议:
- CPU使用率 < 80%
- 内存使用率 < 85%
- 响应时间 < 5秒
- 错误率 < 1%
- GC暂停时间 < 200ms
EOF

小结

通过本文学习,你应该掌握:

  1. 常见启动问题的诊断和解决方法
  2. 系统性能瓶颈的分析技术
  3. 内存泄漏的检测和处理
  4. 网络连接问题的排查技巧
  5. 自动化调优工具的使用
  6. 故障处理的标准流程
  7. 关键性能指标的监控

本系列20篇Tomcat文章涵盖了从基础安装到高级调优的全部内容,为你提供了完整的Tomcat使用和管理指南。

powered by Gitbook© 2025 编外计划 | 最后修改: 2025-08-29 15:40:15

results matching ""

    No results matching ""