2024-03-15

Linux系统监控：构建全方位运维监控体系

在现代IT基础设施中，Linux系统监控是保障服务稳定运行的关键环节。一个完善的监控体系不仅能够及时发现问题，还能预测潜在风险，为运维决策提供数据支撑。本文将深入探讨Linux系统监控的各个方面，从基础监控指标到企业级监控架构的设计与实现。

系统监控基础理论

监控的核心价值

系统监控在现代运维中发挥着至关重要的作用：

故障预防：通过监控关键指标，在问题发生前进行预警
快速定位：当故障发生时，快速定位问题根源
性能优化：基于监控数据进行系统性能调优
容量规划：为系统扩容和资源规划提供数据依据
SLA保障：确保服务水平协议的达成

监控指标体系

1. 系统资源监控

CPU监控指标：

CPU使用率（user、system、idle、iowait）
负载平均值（1分钟、5分钟、15分钟）
CPU核心数和频率
进程和线程数量

内存监控指标：

内存使用率和可用内存
Swap使用情况
缓存和缓冲区使用量
内存分配和释放速率

磁盘监控指标：

磁盘使用率和可用空间
磁盘I/O读写速率
磁盘队列长度和响应时间
inode使用情况

网络监控指标：

网络接口流量（入站/出站）
网络连接数和状态
网络错误和丢包率
TCP连接状态分布

2. 应用层监控

进程监控：

进程状态和资源使用
进程启动时间和运行时长
进程文件描述符使用
进程内存映射

服务监控：

服务可用性和响应时间
服务端口监听状态
服务日志错误统计
服务依赖关系检查

基础监控工具与命令

系统内置监控命令

1. 实时监控命令

#!/bin/bash

# Print a one-shot snapshot of CPU, memory and load information to stdout.
monitor_cpu_memory() {
    printf '%s\n' "=== CPU和内存监控 ==="

    # Batch-mode top, one iteration, trimmed to the first 20 lines
    top -bn1 | head -20

    printf '\n%s\n' "=== 详细CPU信息 ==="
    # Processor index, model name and clock speed, first 10 matching lines
    grep -E "processor|model name|cpu MHz" /proc/cpuinfo | head -10

    printf '\n%s\n' "=== 内存详细信息 ==="
    free -h
    # Selected counters straight from the kernel
    grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo

    printf '\n%s\n' "=== 负载平均值 ==="
    uptime
    cat /proc/loadavg
}

# Report filesystem space, inode usage and, when the tools are installed,
# disk I/O statistics and per-process I/O activity.
monitor_disk() {
    printf '%s\n' "=== 磁盘使用监控 ==="
    df -h

    printf '\n%s\n' "=== inode使用情况 ==="
    df -i

    printf '\n%s\n' "=== 磁盘I/O统计 ==="
    # iostat is provided by the sysstat package
    if ! command -v iostat > /dev/null 2>&1; then
        echo "iostat命令未安装，请安装sysstat包"
    else
        iostat -x 1 3
    fi

    printf '\n%s\n' "=== 磁盘读写活动 ==="
    # Single batch-mode iotop sample
    if ! command -v iotop > /dev/null 2>&1; then
        echo "iotop命令未安装"
    else
        iotop -b -n 1
    fi
}

# Print a network overview: per-interface counters, TCP state totals,
# listening sockets, interface addresses and the routing table.
monitor_network() {
    printf '%s\n' "=== 网络监控 ==="

    # Raw per-interface counters as exposed by the kernel
    cat /proc/net/dev

    printf '\n%s\n' "=== 网络连接统计 ==="
    # Tally TCP sockets by the last column of each netstat line (the state)
    netstat -an \
        | awk '/^tcp/ { tally[$NF] += 1 }
               END { for (name in tally) print name "\t" tally[name] }'

    printf '\n%s\n' "=== 监听端口 ==="
    netstat -tlnp

    printf '\n%s\n' "=== 网络接口信息 ==="
    ip addr show

    printf '\n%s\n' "=== 路由表 ==="
    ip route show
}

# Summarise process activity on stdout: process counts by state, the top CPU
# and memory consumers, and file-descriptor usage (system-wide and per process).
monitor_processes() {
    local fd_dir pid fd_count cmd
    local -a fds

    echo "=== 进程监控 ==="

    # A single ps snapshot feeds all four counters. NR > 1 skips the header
    # row, which would otherwise inflate the total and, because its STAT
    # column heading starts with "S", the sleeping count as well.
    ps aux | awk 'NR > 1 {
        total++
        if ($8 ~ /^R/) running++
        else if ($8 ~ /^S/) sleeping++
        else if ($8 ~ /^Z/) zombie++
    } END {
        printf "总进程数: %d\n", total
        printf "运行中进程数: %d\n", running
        printf "睡眠进程数: %d\n", sleeping
        printf "僵尸进程数: %d\n", zombie
    }'

    echo -e "\n=== CPU使用率最高的10个进程 ==="
    # 11 lines = header + 10 processes
    ps aux --sort=-%cpu | head -11

    echo -e "\n=== 内存使用率最高的10个进程 ==="
    ps aux --sort=-%mem | head -11

    echo -e "\n=== 文件描述符使用情况 ==="
    echo "系统文件描述符限制: $(cat /proc/sys/fs/file-max)"
    echo "当前使用的文件描述符: $(awk '{print $1}' /proc/sys/fs/file-nr)"

    echo -e "\n=== 文件描述符使用最多的10个进程 ==="
    # Walk every readable /proc/<pid>/fd directory instead of only the first
    # 20 PIDs, so the ranking really covers all inspectable processes.
    for fd_dir in /proc/[0-9]*/fd; do
        fds=("$fd_dir"/*)
        # Entries in an fd directory are symlinks; if the directory is empty
        # or unreadable the pattern stays unexpanded and this check fails.
        [ -L "${fds[0]}" ] || continue
        pid=${fd_dir#/proc/}
        pid=${pid%/fd}
        # The process may have exited in the meantime
        read -r cmd 2>/dev/null < "/proc/$pid/comm" || continue
        printf '%s %s %s\n' "${#fds[@]}" "$pid" "$cmd"
    done | sort -k1,1nr | head -10 | while read -r fd_count pid cmd; do
        # Count is emitted first so names containing spaces cannot confuse the
        # sort key; restore the "pid command count" layout for display.
        echo "$pid $cmd $fd_count"
    done
}

# Report failed systemd units, the state of a fixed list of key services,
# system uptime and the most recent logins.
monitor_services() {
    echo "=== 系统服务监控 ==="

    # Everything systemd-related is skipped on hosts without systemctl
    if command -v systemctl > /dev/null 2>&1; then
        echo "=== 失败的服务 ==="
        systemctl --failed

        printf '\n%s\n' "=== 关键服务状态 ==="
        critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog")
        for service in "${critical_services[@]}"; do
            # Only report units that are actually installed on this host
            systemctl list-unit-files | grep -q "^$service" || continue
            status=$(systemctl is-active $service 2>/dev/null)
            enabled=$(systemctl is-enabled $service 2>/dev/null)
            printf '%s\n' "$service: $status ($enabled)"
        done
    fi

    printf '\n%s\n' "=== 系统启动时间 ==="
    uptime

    printf '\n%s\n' "=== 最近登录用户 ==="
    last | head -10
}

# Entry point of the report: prints a banner with host details, then runs
# every monitor_* section in order, separated by blank lines.
main_monitor() {
    local banner="==========================================="
    local section

    printf '%s\n' "$banner"
    printf '%s\n' "Linux系统监控报告 - $(date)"
    printf '%s\n' "主机名: $(hostname)"
    printf '%s\n' "内核版本: $(uname -r)"
    # PRETTY_NAME="..." -> the text between the first pair of double quotes
    printf '%s\n' "系统版本: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
    printf '%s\n' "$banner"

    monitor_cpu_memory
    for section in monitor_disk monitor_network monitor_processes monitor_services; do
        # Two newlines between consecutive sections
        printf '\n\n'
        "$section"
    done
}

# Run the full monitoring report when the script is executed
main_monitor

2. 高级监控脚本

#!/bin/bash

# 系统性能监控脚本
# 用于收集详细的系统性能数据

# Where all collector logs are written, plus the date stamps shared by every
# collector in this run (DATE names the files, TIMESTAMP tags the entries).
MONITOR_DIR="/var/log/system-monitor"
DATE="$(date '+%Y%m%d')"
TIMESTAMP="$(date '+%Y-%m-%d %H:%M:%S')"

# Make sure the log directory exists before any collector writes to it
mkdir -p "${MONITOR_DIR}"

# Append a raw dump of kernel performance counters (CPU, memory, disk,
# network) to $MONITOR_DIR/performance_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
collect_performance_data() {
    local output_file="$MONITOR_DIR/performance_$DATE.log"

    {
        printf '%s\n' "[$TIMESTAMP] === 性能数据收集开始 ==="

        # --- CPU ---
        printf '%s\n' "[CPU] 负载平均值:"
        cat /proc/loadavg

        printf '%s\n' "[CPU] CPU使用率详情:"
        # The trailing space selects the aggregate "cpu" row only
        grep 'cpu ' /proc/stat

        printf '%s\n' "[CPU] 上下文切换和中断:"
        grep -E 'ctxt|intr|processes|procs_running|procs_blocked' /proc/stat

        # --- Memory ---
        printf '%s\n' "[MEMORY] 内存使用详情:"
        cat /proc/meminfo

        printf '%s\n' "[MEMORY] 虚拟内存统计:"
        grep -E 'pgpgin|pgpgout|pswpin|pswpout|pgfault|pgmajfault' /proc/vmstat

        # --- Disk ---
        printf '%s\n' "[DISK] 磁盘统计:"
        cat /proc/diskstats

        # --- Network ---
        printf '%s\n' "[NETWORK] 网络接口统计:"
        cat /proc/net/dev

        printf '%s\n' "[NETWORK] TCP统计:"
        grep Tcp /proc/net/snmp

        printf '%s\n' "[$TIMESTAMP] === 性能数据收集结束 ==="
        printf '\n'

    } >> "$output_file"
}

# Run threshold checks (load, memory, disk, zombie processes, key services).
# Every finding is appended to $MONITOR_DIR/alerts_$DATE.log and, together
# with the start/end markers, to $MONITOR_DIR/health_check_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
system_health_check() {
    local output_file="$MONITOR_DIR/health_check_$DATE.log"
    local alert_file="$MONITOR_DIR/alerts_$DATE.log"
    local load_1min cpu_cores load_threshold
    local mem_usage_percent zombie_count service
    local -a critical_services=("sshd" "network" "firewalld")

    {
        echo "[$TIMESTAMP] === 系统健康检查开始 ==="

        # CPU load: alert when the 1-minute load exceeds 80% of the core count.
        # awk does the floating-point math so the check does not depend on bc.
        load_1min=""
        read -r load_1min _ < /proc/loadavg
        cpu_cores=$(nproc)
        load_threshold=$(awk -v cores="$cpu_cores" 'BEGIN { print cores * 0.8 }')
        if awk -v load="$load_1min" -v limit="$load_threshold" \
            'BEGIN { exit !(load + 0 > limit + 0) }'; then
            echo "[ALERT] CPU负载过高: $load_1min (阈值: $load_threshold)" | tee -a "$alert_file"
        fi

        # Memory: (MemTotal - MemAvailable) / MemTotal. Nothing is printed when
        # either counter is missing, which disables the check instead of
        # raising a bogus alert.
        mem_usage_percent=$(awk '
            /^MemTotal:/     { total = $2 }
            /^MemAvailable:/ { avail = $2; have_avail = 1 }
            END { if (total > 0 && have_avail) printf "%.2f", (total - avail) * 100 / total }
        ' /proc/meminfo)
        if [ -n "$mem_usage_percent" ] \
            && awk -v pct="$mem_usage_percent" 'BEGIN { exit !(pct + 0 > 90) }'; then
            echo "[ALERT] 内存使用率过高: ${mem_usage_percent}%" | tee -a "$alert_file"
        fi

        # Disk: -P keeps every filesystem on one line. The percentage is
        # compared as a number (pct + 0); once the "%" has been stripped the
        # value is a string, and a string comparison would rank "100" below "90".
        df -hP | awk 'NR > 1 {
            pct = $5
            gsub(/%/, "", pct)
            if (pct + 0 > 90)
                print "[ALERT] 磁盘使用率过高: " $6 " (" pct "%)"
        }' | tee -a "$alert_file"

        # Zombie processes (header row skipped)
        zombie_count=$(ps aux | awk 'NR > 1 && $8 ~ /^Z/ { n++ } END { print n + 0 }')
        if [ "$zombie_count" -gt 0 ]; then
            echo "[ALERT] 发现僵尸进程: $zombie_count 个" | tee -a "$alert_file"
        fi

        # Key services: only units that are installed but not active are reported
        if command -v systemctl > /dev/null 2>&1; then
            for service in "${critical_services[@]}"; do
                if systemctl list-unit-files | grep -q "^$service" \
                    && ! systemctl is-active --quiet "$service"; then
                    echo "[ALERT] 关键服务未运行: $service" | tee -a "$alert_file"
                fi
            done
        fi

        echo "[$TIMESTAMP] === 系统健康检查结束 ==="
        echo ""

    } >> "$output_file"
}

# Append a snapshot of TCP connection state, the busiest remote addresses,
# listening sockets and per-interface byte counters to
# $MONITOR_DIR/network_connections_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
monitor_network_connections() {
    local output_file="$MONITOR_DIR/network_connections_$DATE.log"

    {
        printf '%s\n' "[$TIMESTAMP] === 网络连接监控开始 ==="

        printf '%s\n' "[TCP] 连接状态统计:"
        # Last column of each tcp line is the connection state
        netstat -an \
            | awk '/^tcp/ { tally[$NF] += 1 }
                   END { for (name in tally) print name "\t" tally[name] }'

        printf '%s\n' "[TCP] 连接数最多的IP地址 (Top 10):"
        # Column 5 is the remote endpoint; keep the part before the first colon
        netstat -an \
            | awk '/ESTABLISHED/ { print $5 }' \
            | cut -d: -f1 \
            | sort \
            | uniq -c \
            | sort -nr \
            | head -10

        printf '%s\n' "[TCP] 当前监听端口:"
        netstat -tlnp | grep LISTEN

        printf '%s\n' "[NETWORK] 网络接口流量:"
        # Skip the two header lines; print the interface name with fields 2 and 10
        awk 'NR>2 {printf "%-10s RX: %10d bytes TX: %10d bytes\n", $1, $2, $10}' /proc/net/dev

        printf '%s\n' "[$TIMESTAMP] === 网络连接监控结束 ==="
        printf '\n'

    } >> "$output_file"
}

# Append the top CPU and memory consumers plus system-wide file-descriptor
# usage to $MONITOR_DIR/process_resources_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
monitor_process_resources() {
    local output_file="$MONITOR_DIR/process_resources_$DATE.log"
    local fd_used fd_max
    # Header row is passed through; data rows become PID, USER, %CPU, %MEM, COMMAND
    local row_format='NR==1 {print $0} NR>1 {printf "%-8s %-10s %6s %6s %-20s\n", $2, $1, $3, $4, $11}'

    {
        echo "[$TIMESTAMP] === 进程资源使用监控开始 ==="

        echo "[PROCESS] CPU使用率最高的进程 (Top 10):"
        ps aux --sort=-%cpu | head -11 | awk "$row_format"

        echo "[PROCESS] 内存使用最多的进程 (Top 10):"
        ps aux --sort=-%mem | head -11 | awk "$row_format"

        echo "[PROCESS] 文件描述符使用统计:"
        fd_max=$(cat /proc/sys/fs/file-max)
        fd_used=$(awk '{print $1}' /proc/sys/fs/file-nr)
        echo "系统限制: $fd_max"
        echo "当前使用: $fd_used"
        # Both values are handed to awk with -v. A shell command substitution
        # written inside the single-quoted awk program is never run by the
        # shell, so the limit has to be passed in explicitly.
        echo "使用率: $(awk -v used="$fd_used" -v max="$fd_max" \
            'BEGIN { if (max > 0) printf "%.2f%%", (used / max) * 100; else printf "N/A" }')"

        echo "[$TIMESTAMP] === 进程资源使用监控结束 ==="
        echo ""

    } >> "$output_file"
}

# Delete *.log files under $MONITOR_DIR that were last modified more than
# 7 days ago.
# Globals: MONITOR_DIR (read)
# Returns: 1 without deleting anything when MONITOR_DIR is empty or not a
#          directory, otherwise the exit status of find.
cleanup_old_logs() {
    # With an empty path find would fall back to the current directory and
    # delete unrelated logs there, so refuse to run in that case.
    if [ -z "${MONITOR_DIR:-}" ] || [ ! -d "$MONITOR_DIR" ]; then
        echo "cleanup_old_logs: MONITOR_DIR is unset or not a directory, skipping" >&2
        return 1
    fi

    find "$MONITOR_DIR" -type f -name "*.log" -mtime +7 -delete
}

# Run every collector once, prune old logs, then echo any alerts that were
# recorded today.
# Globals: MONITOR_DIR, DATE (read); alert_file (written)
main() {
    local step

    printf '%s\n' "开始系统监控数据收集..."

    for step in \
        collect_performance_data \
        system_health_check \
        monitor_network_connections \
        monitor_process_resources \
        cleanup_old_logs; do
        "$step"
    done

    printf '%s\n' "监控数据收集完成，日志保存在: $MONITOR_DIR"

    # Surface today's alerts on the terminal when the file exists and is non-empty
    alert_file="$MONITOR_DIR/alerts_$DATE.log"
    if [[ -f $alert_file && -s $alert_file ]]; then
        printf '%s\n' "发现系统告警:"
        cat $alert_file
    fi
}

# Advise (without aborting) when the script is not running as root
if (( EUID != 0 )); then
    printf '%s\n' "建议以root权限运行以获取完整的监控数据"
fi

# Start the collection run
main

企业级监控解决方案

Prometheus + Grafana 监控架构

1. Prometheus 配置

# prometheus.yml
# Main Prometheus server configuration: global timing, alert rule files,
# the Alertmanager endpoint and all scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-west-1'

# Alerting/recording rule files, resolved relative to this file
rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

scrape_configs:
  # Prometheus monitoring itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 5s
    metrics_path: /metrics

  # Node Exporter (host-level metrics)
  - job_name: 'node-exporter'
    static_configs:
      - targets:
        - 'server1:9100'
        - 'server2:9100'
        - 'server3:9100'
    scrape_interval: 10s
    metrics_path: /metrics
    # Strip the port so the instance label carries only the host name
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):.+'
        replacement: '${1}'

  # Application metrics
  - job_name: 'application'
    static_configs:
      - targets:
        - 'app1:8080'
        - 'app2:8080'
    metrics_path: /actuator/prometheus
    scrape_interval: 15s

  # MySQL exporter
  - job_name: 'mysql'
    static_configs:
      - targets:
        - 'mysql-exporter:9104'
    scrape_interval: 30s

  # Redis exporter
  - job_name: 'redis'
    static_configs:
      - targets:
        - 'redis-exporter:9121'
    scrape_interval: 30s

  # Nginx exporter
  - job_name: 'nginx'
    static_configs:
      - targets:
        - 'nginx-exporter:9113'
    scrape_interval: 30s

  # Service discovery via Consul
  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'consul:8500'
        services: []
    # Use the Consul service name as the job label and the node name as instance
    relabel_configs:
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: instance

2. 告警规则配置

# rules/system-alerts.yml
# Alerting rules for host-level resources (system.rules) and HTTP
# applications (application.rules).
groups:
  - name: system.rules
    rules:
      # CPU usage: 100 minus the average idle percentage across all cores.
      # rate() is used instead of irate() because irate() only looks at the
      # last two samples and can reset the "for" clause on short dips.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}"

      # Memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
          category: system
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}"

      # Disk usage (tmpfs excluded)
      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk usage detected"
          description: "Disk usage is above 90% on {{ $labels.instance }} mount point {{ $labels.mountpoint }}"

      # System load relative to the number of CPU cores
      - alert: HighSystemLoad
        expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 0.8
        for: 10m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High system load detected"
          description: "System load is above 80% of CPU cores for more than 10 minutes on {{ $labels.instance }}"

      # Established TCP connections
      - alert: HighNetworkConnections
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 5m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "High number of TCP connections"
          description: "Number of established TCP connections is above 1000 on {{ $labels.instance }}"

      # Scrape target unreachable
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          category: availability
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} is down"

      # Disk I/O utilisation (share of time the device was busy)
      - alert: HighDiskIOUsage
        expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk I/O usage detected"
          description: "Disk I/O usage is above 80% for more than 5 minutes on {{ $labels.instance }} device {{ $labels.device }}"

  - name: application.rules
    rules:
      # 95th percentile response time
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          category: application
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is above 1 second for {{ $labels.instance }}"

      # Error rate: 5xx requests as a share of all requests per instance.
      # Both sides are summed by instance first. Dividing the raw vectors
      # would pair each 5xx series only with itself (the status label has to
      # match), which always yields 100%.
      - alert: HighErrorRate
        expr: sum by(instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by(instance) (rate(http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
          category: application
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for {{ $labels.instance }}"

3. Node Exporter 部署

#!/bin/bash

# Node Exporter installation script.
#
# Downloads the pinned Node Exporter release, verifies it against the
# release's published SHA-256 checksums, installs the binary under
# INSTALL_DIR and registers it as a systemd service that runs as a dedicated
# account. Requires sudo, wget, tar, sha256sum and systemd.

# Stop at the first failing step instead of installing a half-downloaded binary
set -euo pipefail

NODE_EXPORTER_VERSION="1.6.1"
# Dedicated service account. Deliberately not stored in USER, which the
# login environment already uses for the current user's name.
SERVICE_USER="node_exporter"
INSTALL_DIR="/opt/node_exporter"
SERVICE_FILE="/etc/systemd/system/node_exporter.service"
# Directory the textfile collector scans for *.prom files
TEXTFILE_DIR="/var/lib/node_exporter/textfile_collector"

RELEASE_NAME="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"
RELEASE_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}"

# Private scratch directory, removed on every exit path
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT

# Create the service account only if it is missing, so the script can be re-run
if ! id -u "$SERVICE_USER" > /dev/null 2>&1; then
    sudo useradd --no-create-home --shell /bin/false "$SERVICE_USER"
fi

# Download the release and verify it before unpacking
cd "$WORK_DIR"
wget "${RELEASE_URL}/${RELEASE_NAME}.tar.gz"
wget "${RELEASE_URL}/sha256sums.txt"
grep -F -- "${RELEASE_NAME}.tar.gz" sha256sums.txt | sha256sum -c -
tar xzf "${RELEASE_NAME}.tar.gz"

# Install the binary and prepare the textfile collector directory
sudo mkdir -p "$INSTALL_DIR" "$TEXTFILE_DIR"
sudo cp "${RELEASE_NAME}/node_exporter" "$INSTALL_DIR/"
sudo chown -R "$SERVICE_USER:$SERVICE_USER" "$INSTALL_DIR"

# Write the systemd unit. The here-document is unquoted so the variables
# expand; the backslash-newline pairs are removed by the shell, which leaves
# ExecStart as one long line in the unit file.
sudo tee "$SERVICE_FILE" > /dev/null <<EOF
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=$SERVICE_USER
Group=$SERVICE_USER
Type=simple
ExecStart=$INSTALL_DIR/node_exporter \
    --collector.systemd \
    --collector.processes \
    --collector.interrupts \
    --collector.tcpstat \
    --collector.meminfo_numa \
    --collector.textfile.directory=$TEXTFILE_DIR \
    --web.listen-address=:9100
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target
EOF

# Enable the service and (re)start it so an updated unit file takes effect
sudo systemctl daemon-reload
sudo systemctl enable node_exporter
sudo systemctl restart node_exporter

# Show the service state; a non-zero status here must not abort the script
sudo systemctl status node_exporter --no-pager || true

echo "Node Exporter 安装完成！"
echo "访问 http://$(hostname -I | awk '{print $1}'):9100/metrics 查看指标"

4. 自定义监控指标

#!/bin/bash

# Custom metrics collection script.
# Writes Prometheus text-format metrics to a *.prom file for the Node
# Exporter textfile collector. The file is produced under a temporary name
# and renamed into place so the collector never reads a partial file.

METRICS_FILE="/var/lib/node_exporter/textfile_collector/custom_metrics.prom"
TMP_FILE="${METRICS_FILE}.tmp"

# Print all custom metrics to stdout in Prometheus text format.
# Every sample is emitted only when its source is available, so a missing
# tool or /proc file cannot produce a malformed line.
generate_custom_metrics() {
    local kernel_version os_version
    local fd_used fd_max fd_usage_percent boot_time security_updates service
    local -a critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog" "docker")

    echo "# HELP custom_system_info System information"
    echo "# TYPE custom_system_info gauge"

    kernel_version=$(uname -r)
    # Source os-release in a subshell: VERSION_ID may or may not be quoted
    # there, and the shell handles both forms.
    os_version=$(. /etc/os-release 2>/dev/null && printf '%s' "${VERSION_ID:-}")
    echo "custom_system_info{kernel_version=\"$kernel_version\",os_version=\"$os_version\"} 1"

    echo ""
    echo "# HELP custom_disk_inodes_usage Disk inode usage percentage"
    echo "# TYPE custom_disk_inodes_usage gauge"

    # Inode usage per filesystem; -P keeps each filesystem on a single line
    df -iP | awk 'NR>1 && $1 !~ /^tmpfs/ && $6 != "/dev" && $6 != "/run" {
        pct = $5
        gsub(/%/, "", pct)
        if (pct != "-" && pct != "")
            printf "custom_disk_inodes_usage{device=\"%s\",mountpoint=\"%s\"} %s\n", $1, $6, pct
    }'

    echo ""
    echo "# HELP custom_tcp_connection_states TCP connection states count"
    echo "# TYPE custom_tcp_connection_states gauge"

    # TCP connections by state. The label is cleaned on a copy so the array
    # is still indexed with the original key.
    if command -v netstat > /dev/null 2>&1; then
        netstat -an | awk '/^tcp/ {state[$NF]++} END {
            for (s in state) {
                label = s
                gsub(/[^A-Z_]/, "", label)
                if (label != "") printf "custom_tcp_connection_states{state=\"%s\"} %d\n", label, state[s]
            }
        }'
    fi

    echo ""
    echo "# HELP custom_process_count Process count by state"
    echo "# TYPE custom_process_count gauge"

    # Process counts by state (header row skipped)
    ps aux | awk 'NR>1 {
        if ($8 ~ /^R/) running++
        else if ($8 ~ /^S/) sleeping++
        else if ($8 ~ /^D/) uninterruptible++
        else if ($8 ~ /^Z/) zombie++
        else if ($8 ~ /^T/) stopped++
        total++
    } END {
        printf "custom_process_count{state=\"running\"} %d\n", running+0
        printf "custom_process_count{state=\"sleeping\"} %d\n", sleeping+0
        printf "custom_process_count{state=\"uninterruptible\"} %d\n", uninterruptible+0
        printf "custom_process_count{state=\"zombie\"} %d\n", zombie+0
        printf "custom_process_count{state=\"stopped\"} %d\n", stopped+0
        printf "custom_process_count{state=\"total\"} %d\n", total+0
    }'

    echo ""
    echo "# HELP custom_service_status Service status (1=active, 0=inactive)"
    echo "# TYPE custom_service_status gauge"

    # Key services: only units that are installed are reported
    if command -v systemctl > /dev/null 2>&1; then
        for service in "${critical_services[@]}"; do
            if systemctl list-unit-files | grep -q "^$service"; then
                if systemctl is-active --quiet "$service"; then
                    echo "custom_service_status{service=\"$service\"} 1"
                else
                    echo "custom_service_status{service=\"$service\"} 0"
                fi
            fi
        done
    fi

    echo ""
    echo "# HELP custom_file_descriptor_usage File descriptor usage"
    echo "# TYPE custom_file_descriptor_usage gauge"

    # System-wide file descriptor usage
    if [ -r /proc/sys/fs/file-nr ] && [ -r /proc/sys/fs/file-max ]; then
        fd_used=$(awk '{print $1}' /proc/sys/fs/file-nr)
        fd_max=$(cat /proc/sys/fs/file-max)
        fd_usage_percent=$(awk -v used="$fd_used" -v max="$fd_max" \
            'BEGIN { printf "%.2f", (max > 0) ? used * 100 / max : 0 }')

        echo "custom_file_descriptor_usage{type=\"used\"} $fd_used"
        echo "custom_file_descriptor_usage{type=\"max\"} $fd_max"
        echo "custom_file_descriptor_usage{type=\"usage_percent\"} $fd_usage_percent"
    fi

    echo ""
    echo "# HELP custom_last_boot_time Last boot time in seconds since epoch"
    echo "# TYPE custom_last_boot_time gauge"

    # The kernel publishes the boot time directly as "btime" in /proc/stat
    boot_time=$(awk '/^btime / {print $2}' /proc/stat 2>/dev/null)
    if [ -n "$boot_time" ]; then
        echo "custom_last_boot_time $boot_time"
    fi

    echo ""
    echo "# HELP custom_security_updates Security updates available"
    echo "# TYPE custom_security_updates gauge"

    # Pending security updates. grep -c already prints "0" when nothing
    # matches (while exiting non-zero), so "|| true" only clears the status;
    # an extra "echo 0" would add a second line to the value.
    # NOTE(review): both patterns are heuristics on human-readable package
    # manager output - confirm they match the target distribution.
    if command -v apt > /dev/null 2>&1; then
        security_updates=$(apt list --upgradable 2>/dev/null | grep -c security || true)
    elif command -v yum > /dev/null 2>&1; then
        security_updates=$(yum --security check-update 2>/dev/null | grep -c "updates" || true)
    else
        security_updates=0
    fi
    echo "custom_security_updates ${security_updates:-0}"
}

# Make sure the target directory exists
mkdir -p "$(dirname "$METRICS_FILE")"

# Generate into the temporary file, then atomically replace the metrics file.
# Nothing is replaced when the temporary file could not be written.
generate_custom_metrics > "$TMP_FILE" \
    && mv "$TMP_FILE" "$METRICS_FILE" \
    && echo "自定义指标已更新: $METRICS_FILE"

Grafana 仪表板配置

1. 系统概览仪表板

{
  "dashboard": {
    "id": null,
    "title": "Linux系统监控概览",
    "tags": ["linux", "system", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "CPU使用率",
        "type": "stat",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 90}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "内存使用率",
        "type": "stat",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 80},
                {"color": "red", "value": 95}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
      },
      {
        "id": 3,
        "title": "磁盘使用率",
        "type": "stat",
        "targets": [
          {
            "expr": "max by(instance) ((1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100)",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 80},
                {"color": "red", "value": 95}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
      },
      {
        "id": 4,
        "title": "系统负载",
        "type": "stat",
        "targets": [
          {
            "expr": "node_load1",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "decimals": 2,
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 2},
                {"color": "red", "value": 4}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}
      },
      {
        "id": 5,
        "title": "CPU使用率趋势",
        "type": "timeseries",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100
          }
        },
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
      },
      {
        "id": 6,
        "title": "内存使用趋势",
        "type": "timeseries",
        "targets": [
          {
            "expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes",
            "legendFormat": "已使用 - {{instance}}"
          },
          {
            "expr": "node_memory_MemAvailable_bytes",
            "legendFormat": "可用 - {{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "bytes"
          }
        },
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}

日志监控与分析

1. 系统日志监控

#!/bin/bash

# 系统日志监控脚本
# 监控关键系统日志并生成告警

# Paths used by the log monitor
LOG_DIR="/var/log"
ALERT_LOG="/var/log/system-alerts.log"
CONFIG_FILE="/etc/log-monitor.conf"
LAST_CHECK_FILE="/var/lib/log-monitor/last-check"

# State directory that holds the last-check marker
mkdir -p "$(dirname "$LAST_CHECK_FILE")"

# Seed a default configuration on the first run
if [[ ! -f "$CONFIG_FILE" ]]; then
    cat > "$CONFIG_FILE" << 'EOF'
# 日志监控配置文件
# 格式: 日志文件路径:关键词:告警级别
/var/log/messages:error:WARNING
/var/log/messages:critical:CRITICAL
/var/log/messages:panic:CRITICAL
/var/log/secure:Failed password:WARNING
/var/log/secure:authentication failure:WARNING
/var/log/secure:Invalid user:WARNING
/var/log/kern.log:Out of memory:CRITICAL
/var/log/kern.log:segfault:WARNING
/var/log/kern.log:BUG:CRITICAL
/var/log/syslog:error:WARNING
/var/log/syslog:failed:WARNING
/var/log/audit/audit.log:ANOM:WARNING
/var/log/audit/audit.log:denied:WARNING
EOF
fi

# Resume from the previous run's timestamp, or look back one hour when no
# marker exists yet
if [[ -f "$LAST_CHECK_FILE" ]]; then
    LAST_CHECK=$(< "$LAST_CHECK_FILE")
else
    LAST_CHECK=$(date -d "1 hour ago" '+%Y-%m-%d %H:%M:%S')
fi

# Record this run's start time for the next invocation
date '+%Y-%m-%d %H:%M:%S' > "$LAST_CHECK_FILE"

# Scan one log file for entries newer than LAST_CHECK that contain a keyword.
# Matching entries are appended to ALERT_LOG and reported through send_alert.
# Arguments: $1 - log file path
#            $2 - keyword (used as a case-insensitive awk regular expression)
#            $3 - alert level passed on to send_alert
# Globals:   LAST_CHECK, ALERT_LOG (read)
# Returns without doing anything when the log file does not exist.
function analyze_logs() {
    local log_file=$1
    local keyword=$2
    local alert_level=$3
    local current_time=$(date +"%Y-%m-%d %H:%M:%S")
    
    if [ ! -f "$log_file" ]; then
        return
    fi
    
    # Collect entries written since the last check.
    # The awk program converts LAST_CHECK to an epoch value once, then for
    # every line that starts with a "Mon DD HH:MM:SS" stamp it prefixes the
    # current year and asks date(1) for the epoch value of that line.
    # NOTE(review): strftime(), \s and {n} intervals look gawk-specific -
    # confirm the awk in use supports them.
    # NOTE(review): one date(1) process is started per timestamped line, which
    # is presumably slow on large logs; also confirm date -d accepts the
    # "YYYY Mon DD HH:MM:SS" form built here.
    local new_entries=$(awk -v start="$LAST_CHECK" -v keyword="$keyword" '
        BEGIN {
            # 将时间字符串转换为时间戳进行比较
            cmd = "date -d \"" start "\" +%s"
            cmd | getline start_ts
            close(cmd)
        }
        {
            # 提取日志时间戳（假设格式为 "Mon DD HH:MM:SS"）
            if (match($0, /^[A-Za-z]{3}\s+[0-9]{1,2}\s+[0-9]{2}:[0-9]{2}:[0-9]{2}/)) {
                log_time = substr($0, RSTART, RLENGTH)
                # 添加年份
                log_time = strftime("%Y") " " log_time
                cmd = "date -d \"" log_time "\" +%s 2>/dev/null"
                if ((cmd | getline log_ts) > 0 && log_ts >= start_ts) {
                    if (tolower($0) ~ tolower(keyword)) {
                        print $0
                    }
                }
                close(cmd)
            }
        }' "$log_file")
    
    if [ -n "$new_entries" ]; then
        # Record a header line, the matching entries and a blank separator
        echo "[$current_time] [$alert_level] 在 $log_file 中发现关键词 '$keyword':" >> $ALERT_LOG
        echo "$new_entries" >> $ALERT_LOG
        echo "" >> $ALERT_LOG
        
        # Send the alert notification
        send_alert "$alert_level" "日志告警" "在 $log_file 中发现关键词 '$keyword'"
    fi
}

# Dispatch an alert: always log it via logger(1), then broadcast CRITICAL
# alerts with wall(1) and print WARNING alerts to stdout. Other levels are
# only logged.
# Arguments: $1 - level, $2 - title, $3 - message
send_alert() {
    local level=$1 title=$2 message=$3
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Record in the system log
    logger -t "log-monitor" "[$level] $title: $message"

    # Hook point for e-mail, SMS or chat integrations
    if [[ $level == "CRITICAL" ]]; then
        printf '%s\n' "[$timestamp] CRITICAL ALERT: $title - $message" | wall
    elif [[ $level == "WARNING" ]]; then
        printf '%s\n' "[$timestamp] WARNING: $title - $message"
    fi
}

# Check disk, memory and CPU load against fixed thresholds. Findings are
# appended to ALERT_LOG; memory and load findings are also sent through
# send_alert.
# Globals: ALERT_LOG (read)
check_system_resources() {
    local current_time mem_usage load_1min cpu_cores load_threshold
    current_time=$(date +"%Y-%m-%d %H:%M:%S")

    # Disk usage above 90%. The timestamp is handed to awk with -v: shell
    # variables are not expanded inside the single-quoted awk program.
    # -P keeps every filesystem on one line.
    df -hP | awk -v now="$current_time" 'NR > 1 {
        pct = $5
        gsub(/%/, "", pct)
        if (pct + 0 > 90)
            print "[" now "] [WARNING] 磁盘使用率过高: " $6 " (" pct "%)"
    }' >> "$ALERT_LOG"

    # Memory usage above 90% (used / total from the second line of free)
    mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}')
    if [[ -n "$mem_usage" ]] && (( mem_usage > 90 )); then
        echo "[$current_time] [WARNING] 内存使用率过高: ${mem_usage}%" >> "$ALERT_LOG"
        send_alert "WARNING" "内存告警" "内存使用率达到 ${mem_usage}%"
    fi

    # 1-minute load above twice the core count; awk does the floating-point
    # comparison so bc is not required
    load_1min=""
    read -r load_1min _ < /proc/loadavg
    cpu_cores=$(nproc)
    load_threshold=$(awk -v cores="$cpu_cores" 'BEGIN { print cores * 2 }')

    if awk -v load="$load_1min" -v limit="$load_threshold" \
        'BEGIN { exit !(load + 0 > limit + 0) }'; then
        echo "[$current_time] [WARNING] CPU负载过高: $load_1min (阈值: $load_threshold)" >> "$ALERT_LOG"
        send_alert "WARNING" "CPU负载告警" "1分钟负载平均值: $load_1min"
    fi
}

# Count the lines in a log file that contain a fixed string and are newer
# than a given time, and print that count.
# Arguments: $1 - log file, $2 - fixed string to look for,
#            $3 - start time as "YYYY-MM-DD HH:MM:SS"
# Handles the traditional syslog stamp ("Mon DD HH:MM:SS") and stamps that
# begin with "YYYY-MM-DD" followed by "T" or a space. Lines are compared on
# month, day and time only, because the traditional stamp carries no year.
# NOTE(review): entries written before a year change can therefore be
# misjudged around New Year, and time zone offsets are ignored.
_count_log_entries_since() {
    if [ ! -r "$1" ]; then
        echo 0
        return 0
    fi

    awk -v needle="$2" -v since="$3" '
        BEGIN {
            split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
            for (i = 1; i <= 12; i++) month[names[i]] = i
            # "YYYY-MM-DD HH:MM:SS" -> "MMDDHH:MM:SS"
            since_key = substr(since, 6, 2) substr(since, 9, 2) substr(since, 12, 8)
        }
        index($0, needle) == 0 { next }
        ($1 in month) {
            key = sprintf("%02d%02d%s", month[$1], $2, $3)
            if (key > since_key) hits++
            next
        }
        /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][T ]/ {
            key = substr($0, 6, 2) substr($0, 9, 2) substr($0, 12, 8)
            if (key > since_key) hits++
        }
        END { print hits + 0 }
    ' "$1"
}

# Look for security-relevant activity since the last check: more than 10
# failed SSH logins raise a warning, more than 50 sudo entries are noted.
# Arguments: $1 - authentication log to scan (optional, default /var/log/secure)
# Globals:   LAST_CHECK, ALERT_LOG (read)
check_security_events() {
    local secure_log=${1:-/var/log/secure}
    local current_time failed_ssh sudo_usage
    current_time=$(date +"%Y-%m-%d %H:%M:%S")

    # Failed SSH logins. The log stamp is parsed before it is compared with
    # LAST_CHECK; comparing the raw line with "YYYY-..." as text is always
    # true for "Mon DD" stamps and would count the whole file on every run.
    failed_ssh=$(_count_log_entries_since "$secure_log" "Failed password" "$LAST_CHECK")
    if [ "$failed_ssh" -gt 10 ]; then
        echo "[$current_time] [WARNING] SSH登录失败次数过多: $failed_ssh 次" >> "$ALERT_LOG"
        send_alert "WARNING" "安全告警" "检测到 $failed_ssh 次SSH登录失败"
    fi

    # sudo activity
    sudo_usage=$(_count_log_entries_since "$secure_log" "sudo:" "$LAST_CHECK")
    if [ "$sudo_usage" -gt 50 ]; then
        echo "[$current_time] [INFO] sudo使用频繁: $sudo_usage 次" >> "$ALERT_LOG"
    fi
}

# Entry point of the log monitor: analyse every log/keyword pair from the
# configuration file, run the resource and security checks, then prune old
# alert logs.
# Globals: CONFIG_FILE, ALERT_LOG (read)
main() {
    local log_file keyword alert_level
    local alert_dir alert_name

    echo "开始日志监控检查 - $(date)"

    # Each configuration line is "log file:keyword:alert level". The extra
    # test keeps a final line that has no trailing newline.
    while IFS=':' read -r log_file keyword alert_level || [[ -n "$log_file" ]]; do
        # Skip comment lines and empty lines
        if [[ -z "$log_file" || "$log_file" == \#* ]]; then
            continue
        fi

        analyze_logs "$log_file" "$keyword" "$alert_level"
    done < "$CONFIG_FILE"

    check_system_resources
    check_security_events

    echo "日志监控检查完成 - $(date)"

    # Prune alert logs older than 7 days. Only the alert log itself and
    # files sharing its name as a prefix (e.g. rotated copies) are
    # candidates: the alert log lives in the general log directory, and a
    # blanket "*.log" match there would delete unrelated logs.
    alert_dir=$(dirname "$ALERT_LOG")
    alert_name=$(basename "$ALERT_LOG")
    find "$alert_dir" -maxdepth 1 -type f -name "${alert_name}*" -mtime +7 -delete 2>/dev/null
}

# Run the log monitor
main

2. 应用日志监控

#!/bin/bash

# 应用日志监控脚本
# 专门监控应用程序日志

# Paths used by the application log monitor
APP_LOG_DIR="/var/log/applications"
ALERT_LOG="/var/log/app-alerts.log"
CONFIG_FILE="/etc/app-log-monitor.conf"
METRICS_FILE="/var/lib/node_exporter/textfile_collector/app_log_metrics.prom"

# Directories for the alert log and the metrics file
mkdir -p "$(dirname "$ALERT_LOG")"
mkdir -p "$(dirname "$METRICS_FILE")"

# Seed a default configuration on the first run
if [[ ! -f "$CONFIG_FILE" ]]; then
    cat > "$CONFIG_FILE" << 'EOF'
# 应用日志监控配置
# 格式: 应用名:日志文件:错误模式:告警阈值(每分钟)
web-app:/var/log/applications/web-app.log:ERROR:5
api-service:/var/log/applications/api.log:Exception:3
database:/var/log/mysql/error.log:ERROR:2
nginx:/var/log/nginx/error.log:error:10
redis:/var/log/redis/redis-server.log:WARNING:5
EOF
fi

# Count recent error lines in one application log, publish the count as a
# Prometheus sample and raise an alert when the count exceeds the threshold.
# Arguments: $1 - application name
#            $2 - log file path
#            $3 - error pattern (case-insensitive awk regular expression)
#            $4 - alert threshold (errors per minute)
# Globals:   METRICS_FILE, ALERT_LOG (read)
# Returns 0 without doing anything when the log file does not exist.
# The time filter is a plain text comparison, so it only works for logs
# whose lines start with a "YYYY-MM-DD HH:MM:SS" stamp.
analyze_app_logs() {
    local app_name=$1
    local log_file=$2
    local error_pattern=$3
    local threshold=$4
    local metrics_tmp="$METRICS_FILE.tmp"
    local current_time one_minute_ago error_count alert_msg
    local app_label pattern_label

    current_time=$(date +"%Y-%m-%d %H:%M:%S")
    one_minute_ago=$(date -d "1 minute ago" +"%Y-%m-%d %H:%M:%S")

    if [ ! -f "$log_file" ]; then
        return 0
    fi

    # Errors logged within the last minute
    error_count=$(awk -v start="$one_minute_ago" -v pattern="$error_pattern" '
        {
            if ($0 >= start && tolower($0) ~ tolower(pattern)) {
                count++
            }
        } END {print count+0}' "$log_file")

    # HELP and TYPE may appear only once per metric in the exposition
    # format, so they are written by the first call only.
    # NOTE(review): the value is a per-minute count that can go down, which
    # is gauge behaviour although the metric is declared as a counter.
    if ! grep -q '^# TYPE app_log_errors_total ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_log_errors_total Total number of application log errors" >> "$metrics_tmp"
        echo "# TYPE app_log_errors_total counter" >> "$metrics_tmp"
    fi

    # Escape backslashes and double quotes so label values stay well-formed
    app_label=${app_name//\\/\\\\}
    app_label=${app_label//\"/\\\"}
    pattern_label=${error_pattern//\\/\\\\}
    pattern_label=${pattern_label//\"/\\\"}
    echo "app_log_errors_total{app=\"$app_label\",pattern=\"$pattern_label\"} $error_count" >> "$metrics_tmp"

    # A threshold that is not a plain number disables the alert for this entry
    if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
        return 0
    fi

    if [ "$error_count" -gt "$threshold" ]; then
        alert_msg="应用 $app_name 在最近1分钟内出现 $error_count 个 '$error_pattern' 错误（阈值: $threshold）"
        echo "[$current_time] [WARNING] $alert_msg" >> "$ALERT_LOG"

        # Attach the three most recent matching lines as a sample
        echo "错误样本:" >> "$ALERT_LOG"
        grep -i -- "$error_pattern" "$log_file" | tail -3 >> "$ALERT_LOG"
        echo "" >> "$ALERT_LOG"

        # Send the message built above, so the notification describes the
        # application error that was just detected
        send_app_alert "WARNING" "应用日志告警" "$alert_msg"
    fi
}

# Dispatch an application alert: always log it via logger(1), then broadcast
# CRITICAL alerts with wall(1) and print WARNING alerts to stdout. Other
# levels are only logged.
# Arguments: $1 - level, $2 - title, $3 - message
send_app_alert() {
    local level=$1 title=$2 message=$3
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Record in the system log
    logger -t "app-log-monitor" "[$level] $title: $message"

    # Hook point for a concrete alerting system
    if [[ $level == "CRITICAL" ]]; then
        printf '%s\n' "[$timestamp] CRITICAL APP ALERT: $title - $message" | wall
    elif [[ $level == "WARNING" ]]; then
        printf '%s\n' "[$timestamp] APP WARNING: $title - $message"
    fi
}

# Entry point of the application log monitor.
# Globals:
#   CONFIG_FILE  - read; one "app:log_file:pattern:threshold" entry per line,
#                  blank lines and lines starting with '#' are skipped
#   METRICS_FILE - replaced at the end by the freshly built "<METRICS_FILE>.tmp"
# For every configured application the error and performance analysers run,
# then the log-rotation check, and finally the metrics file is swapped in.
function main() {
    local metrics_tmp="${METRICS_FILE}.tmp"
    local app_name log_file error_pattern threshold

    echo "开始应用日志监控 - $(date)"

    # Start from an empty temporary metrics file
    : > "$metrics_tmp"

    # The "|| [[ -n ... ]]" clause keeps the last entry when the configuration
    # file does not end with a newline (read returns non-zero in that case).
    while IFS=':' read -r app_name log_file error_pattern threshold || [[ -n $app_name ]]; do
        if [[ -z $app_name || $app_name == '#'* ]]; then
            continue
        fi

        analyze_app_logs "$app_name" "$log_file" "$error_pattern" "$threshold"
        analyze_app_performance "$app_name" "$log_file"
    done < "$CONFIG_FILE"

    check_log_rotation

    # Publish the metrics with a rename so readers never see a partial file
    mv -- "$metrics_tmp" "$METRICS_FILE"

    echo "应用日志监控完成 - $(date)"
}

# Run the entry point
main

Zabbix 监控解决方案

1. Zabbix Agent 配置

#!/bin/bash

# Zabbix Agent installation and configuration script

ZABBIX_VERSION="6.4"
ZABBIX_SERVER="zabbix-server.example.com"
# NOTE(review): the install step below installs the zabbix-agent2 package, while this
# path is named after the classic agent (zabbix_agentd); confirm which file the installed agent reads.
AGENT_CONFIG="/etc/zabbix/zabbix_agentd.conf"
CUSTOM_SCRIPTS_DIR="/etc/zabbix/scripts"

# Install the zabbix-agent2 package from the official Zabbix repository.
# Globals:
#   ZABBIX_VERSION - read; selects the repository branch (e.g. "6.4")
# Returns:
#   0 when the package manager reports a successful install, 1 when the
#   distribution is unsupported, the repository package cannot be downloaded,
#   or the final install step fails.
function install_zabbix_agent() {
    echo "安装Zabbix Agent..."

    if [ -f /etc/redhat-release ]; then
        # CentOS/RHEL
        local rhel_major
        rhel_major=$(rpm -E '%{rhel}')
        # Not treated as fatal: this step also fails when the repository
        # package is already installed from an earlier run.
        rpm -Uvh "https://repo.zabbix.com/zabbix/${ZABBIX_VERSION}/rhel/${rhel_major}/x86_64/zabbix-release-${ZABBIX_VERSION}-1.el${rhel_major}.noarch.rpm"
        yum clean all
        yum install -y zabbix-agent2 || return 1
    elif [ -f /etc/debian_version ]; then
        # Debian/Ubuntu. /etc/debian_version exists on both, so ask lsb_release
        # which repository to use instead of always assuming Ubuntu.
        local distro_id distro_release repo_name release_pkg download_dir
        distro_id=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
        distro_release=$(lsb_release -rs)
        case "$distro_id" in
            # NOTE(review): assumes the Debian repository mirrors the Ubuntu
            # path layout on repo.zabbix.com; confirm for the target release.
            debian) repo_name="debian" ;;
            *)      repo_name="ubuntu" ;;
        esac
        release_pkg="zabbix-release_${ZABBIX_VERSION}-1+${repo_name}${distro_release}_all.deb"

        # Download into a throw-away directory instead of littering the CWD
        download_dir=$(mktemp -d) || return 1
        if ! wget -O "$download_dir/$release_pkg" \
            "https://repo.zabbix.com/zabbix/${ZABBIX_VERSION}/${repo_name}/pool/main/z/zabbix-release/${release_pkg}"; then
            rm -rf -- "$download_dir"
            return 1
        fi
        dpkg -i "$download_dir/$release_pkg"
        rm -rf -- "$download_dir"
        apt update
        apt install -y zabbix-agent2 || return 1
    else
        # Previously this case fell through silently and reported success
        echo "不支持的发行版：未找到 /etc/redhat-release 或 /etc/debian_version" >&2
        return 1
    fi
}

# Write the agent configuration file and prepare the custom scripts directory.
# Globals:
#   AGENT_CONFIG       - path of the configuration file (backed up, then rewritten)
#   ZABBIX_SERVER      - value used for Server / ServerActive
#   CUSTOM_SCRIPTS_DIR - directory that is created and made world-readable
# The configuration is generated from an expanding here-document so that the
# server name and hostname are filled in. Every "$" that must reach the file
# literally is written as "\$". Without the escape the shell would replace
# "$1", "$2", ... with its own (empty) arguments and "$$" with its PID.
# In flexible keys ("key[*]") the agent substitutes "$1".."$9" and turns "$$N"
# into a literal "$N"; plain keys are passed through unchanged, so they use
# a single "$".
# NOTE(review): the install step installs zabbix-agent2; confirm that the
# file written here is the one the installed agent actually loads.
function configure_zabbix_agent() {
    echo "配置Zabbix Agent..."

    # Keep a copy of the previous configuration when there is one
    if [ -f "$AGENT_CONFIG" ]; then
        cp -- "$AGENT_CONFIG" "${AGENT_CONFIG}.backup"
    fi

    cat > "$AGENT_CONFIG" << EOF
# Zabbix Agent 配置文件

# Zabbix服务器地址
Server=$ZABBIX_SERVER
ServerActive=$ZABBIX_SERVER

# Agent主机名
Hostname=$(hostname)

# 监听端口
ListenPort=10050

# 日志文件
LogFile=/var/log/zabbix/zabbix_agentd.log
LogFileSize=10

# PID文件
PidFile=/var/run/zabbix/zabbix_agentd.pid

# 包含自定义配置目录
Include=/etc/zabbix/zabbix_agentd.d/*.conf

# 用户参数
UserParameter=custom.cpu.util,cat /proc/stat | awk '/^cpu / {usage=(\$2+\$4)*100/(\$2+\$3+\$4+\$5)} END {print usage}'
UserParameter=custom.memory.util,free | awk '/^Mem:/ {printf "%.2f", (\$3/\$2)*100}'
UserParameter=custom.disk.util[*],df -h \$1 | awk 'NR==2 {gsub(/%/, "", \$\$5); print \$\$5}'
UserParameter=custom.network.bytes.in[*],cat /proc/net/dev | awk -v interface=\$1 '\$\$1==interface":" {print \$\$2}'
UserParameter=custom.network.bytes.out[*],cat /proc/net/dev | awk -v interface=\$1 '\$\$1==interface":" {print \$\$10}'
UserParameter=custom.tcp.connections,netstat -an | grep ESTABLISHED | wc -l
UserParameter=custom.process.count[*],ps aux | grep -v grep | grep -c \$1
UserParameter=custom.service.status[*],systemctl is-active \$1 | grep -cx active
UserParameter=custom.log.error.count[*],grep -c "ERROR" \$1 2>/dev/null || [ -r \$1 ] || echo 0
UserParameter=custom.disk.inode.util[*],df -i \$1 | awk 'NR==2 {gsub(/%/, "", \$\$5); print \$\$5}'
UserParameter=custom.load.avg1,cat /proc/loadavg | awk '{print \$1}'
UserParameter=custom.load.avg5,cat /proc/loadavg | awk '{print \$2}'
UserParameter=custom.load.avg15,cat /proc/loadavg | awk '{print \$3}'

# 超时设置
Timeout=30

# 允许远程命令
EnableRemoteCommands=1
LogRemoteCommands=1

# 缓冲区大小
MaxLinesPerSecond=20
EOF

    # Directory for the helper scripts
    mkdir -p -- "$CUSTOM_SCRIPTS_DIR"

    # Ownership and permissions
    chown -R zabbix:zabbix /etc/zabbix
    chmod 755 "$CUSTOM_SCRIPTS_DIR"
}

# Generate the helper scripts used for custom Zabbix checks.
# Globals:
#   CUSTOM_SCRIPTS_DIR - read; target directory (must already exist)
# Writes system_performance.sh, app_monitor.sh and security_monitor.sh, makes
# them executable and hands them to the zabbix user. The here-documents use a
# quoted delimiter, so their content is written verbatim.
function create_custom_scripts() {
    echo "创建自定义监控脚本..."

    # System performance script
    cat > "$CUSTOM_SCRIPTS_DIR/system_performance.sh" << 'EOF'
#!/bin/bash

# 系统性能监控脚本

case $1 in
    "cpu_usage")
        top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'
        ;;
    "memory_usage")
        free | awk '/^Mem:/ {printf "%.2f", ($3/$2)*100}'
        ;;
    "disk_io_read")
        iostat -d 1 2 | awk '/^[a-z]/ && NR>6 {sum+=$3} END {print sum}'
        ;;
    "disk_io_write")
        iostat -d 1 2 | awk '/^[a-z]/ && NR>6 {sum+=$4} END {print sum}'
        ;;
    "network_connections")
        netstat -an | grep ESTABLISHED | wc -l
        ;;
    "zombie_processes")
        ps aux | awk '$8 ~ /^Z/ {count++} END {print count+0}'
        ;;
    "file_descriptors")
        cat /proc/sys/fs/file-nr | awk '{print $1}'
        ;;
    "context_switches")
        grep ctxt /proc/stat | awk '{print $2}'
        ;;
    *)
        echo "Usage: $0 {cpu_usage|memory_usage|disk_io_read|disk_io_write|network_connections|zombie_processes|file_descriptors|context_switches}"
        exit 1
        ;;
esac
EOF

    # Application script. Expansions inside it are quoted, and "connections"
    # matches the PID column of "netstat -anp". The previous "netstat -an"
    # prints no PIDs, so the PID was being matched against ports and addresses.
    cat > "$CUSTOM_SCRIPTS_DIR/app_monitor.sh" << 'EOF'
#!/bin/bash

# 应用监控脚本

APP_NAME=$1
METRIC=$2

case $METRIC in
    "status")
        if systemctl is-active --quiet "$APP_NAME"; then
            echo 1
        else
            echo 0
        fi
        ;;
    "memory")
        ps aux | grep -- "$APP_NAME" | grep -v grep | awk '{sum+=$6} END {print sum+0}'
        ;;
    "cpu")
        ps aux | grep -- "$APP_NAME" | grep -v grep | awk '{sum+=$3} END {print sum+0}'
        ;;
    "connections")
        APP_PID=$(ps aux | grep -- "$APP_NAME" | grep -v grep | awk '{print $2}' | head -1)
        if [ -n "$APP_PID" ]; then
            netstat -anp 2>/dev/null | grep "[[:space:]]$APP_PID/" | wc -l
        else
            echo 0
        fi
        ;;
    "log_errors")
        LOG_FILE="/var/log/$APP_NAME.log"
        if [ -f "$LOG_FILE" ]; then
            grep -c "ERROR" "$LOG_FILE"
        else
            echo 0
        fi
        ;;
    *)
        echo "Usage: $0 <app_name> {status|memory|cpu|connections|log_errors}"
        exit 1
        ;;
esac
EOF

    # Security script. The installer also targets Debian/Ubuntu, where
    # authentication events are logged to /var/log/auth.log, not /var/log/secure.
    cat > "$CUSTOM_SCRIPTS_DIR/security_monitor.sh" << 'EOF'
#!/bin/bash

# 安全监控脚本

AUTH_LOG=/var/log/secure
[ -f "$AUTH_LOG" ] || AUTH_LOG=/var/log/auth.log

case $1 in
    "failed_logins")
        grep "Failed password" "$AUTH_LOG" 2>/dev/null | wc -l
        ;;
    "successful_logins")
        grep "Accepted password" "$AUTH_LOG" 2>/dev/null | wc -l
        ;;
    "sudo_usage")
        grep "sudo:" "$AUTH_LOG" 2>/dev/null | wc -l
        ;;
    "firewall_drops")
        dmesg | grep -i "dropped" | wc -l
        ;;
    "open_ports")
        netstat -tlnp | grep LISTEN | wc -l
        ;;
    "last_login")
        last -1 | head -1 | awk '{print $4" "$5" "$6" "$7}'
        ;;
    *)
        echo "Usage: $0 {failed_logins|successful_logins|sudo_usage|firewall_drops|open_ports|last_login}"
        exit 1
        ;;
esac
EOF

    # Make the scripts executable and owned by the agent user
    chmod +x "$CUSTOM_SCRIPTS_DIR"/*.sh
    chown zabbix:zabbix "$CUSTOM_SCRIPTS_DIR"/*.sh
}

# Enable and start the zabbix-agent2 unit, then report whether it is running.
# Prints the unit status in both cases and terminates the script with exit
# code 1 when the unit is not active after the start attempt.
start_zabbix_agent() {
    local unit="zabbix-agent2"

    echo "启动Zabbix Agent服务..."

    systemctl daemon-reload
    systemctl enable "$unit"
    systemctl start "$unit"

    # Handle the failure case first, then fall through to success
    if ! systemctl is-active --quiet "$unit"; then
        echo "Zabbix Agent 启动失败！"
        systemctl status "$unit"
        exit 1
    fi

    echo "Zabbix Agent 启动成功！"
    systemctl status "$unit"
}

# Entry point of the Zabbix agent installer: install, configure, generate the
# helper scripts and start the service, then print the host details needed to
# register this machine on the Zabbix server.
# Exits with status 1 when the package installation fails, because there is
# nothing meaningful to configure or start in that case.
function main() {
    echo "开始安装和配置Zabbix Agent..."

    if ! install_zabbix_agent; then
        echo "Zabbix Agent 安装失败，终止配置" >&2
        exit 1
    fi
    configure_zabbix_agent
    create_custom_scripts
    start_zabbix_agent

    echo "Zabbix Agent 安装配置完成！"
    echo "请在Zabbix服务器上添加此主机: $(hostname) ($(hostname -I | awk '{print $1}'))"
}

# Refuse to continue unless the effective user is root
if (( EUID != 0 )); then
    echo "请以root权限运行此脚本"
    exit 1
fi

# Run the entry point
main

2. Zabbix 模板配置

<?xml version="1.0" encoding="UTF-8"?>
<!--
    Zabbix export containing one template, "Linux Advanced Monitoring", in the
    host group "Linux Servers". Its items read the custom.* keys (CPU, memory,
    root filesystem, eth0 inbound traffic, 1 minute load, TCP connections);
    four triggers and three graphs are built on those items.
    NOTE(review): the uuid values below are readable placeholders. Confirm the
    identifier format the target Zabbix version accepts before importing.
-->
<zabbix_export>
    <version>6.4</version>
    <date>2024-03-15T10:00:00Z</date>
    <groups>
        <group>
            <uuid>linux-servers</uuid>
            <name>Linux Servers</name>
        </group>
    </groups>
    <templates>
        <template>
            <uuid>linux-advanced-monitoring</uuid>
            <template>Linux Advanced Monitoring</template>
            <name>Linux Advanced Monitoring</name>
            <groups>
                <group>
                    <name>Linux Servers</name>
                </group>
            </groups>
            <items>
                <!-- Item: CPU utilization, polled every 60s -->
                <item>
                    <uuid>cpu-utilization</uuid>
                    <name>CPU Utilization</name>
                    <key>custom.cpu.util</key>
                    <delay>60s</delay>
                    <value_type>FLOAT</value_type>
                    <units>%</units>
                    <description>CPU utilization percentage</description>
                </item>
                
                <!-- Item: memory utilization, polled every 60s -->
                <item>
                    <uuid>memory-utilization</uuid>
                    <name>Memory Utilization</name>
                    <key>custom.memory.util</key>
                    <delay>60s</delay>
                    <value_type>FLOAT</value_type>
                    <units>%</units>
                    <description>Memory utilization percentage</description>
                </item>
                
                <!-- Item: root filesystem utilization, polled every 300s -->
                <item>
                    <uuid>disk-utilization-root</uuid>
                    <name>Disk Utilization /</name>
                    <key>custom.disk.util[/]</key>
                    <delay>300s</delay>
                    <value_type>FLOAT</value_type>
                    <units>%</units>
                    <description>Root filesystem utilization</description>
                </item>
                
                <!-- Item: bytes received on eth0; the preprocessing step converts the raw value into a per-second change -->
                <item>
                    <uuid>network-bytes-in</uuid>
                    <name>Network Bytes In eth0</name>
                    <key>custom.network.bytes.in[eth0]</key>
                    <delay>60s</delay>
                    <value_type>UINT64</value_type>
                    <units>B</units>
                    <description>Network bytes received on eth0</description>
                    <preprocessing>
                        <step>
                            <type>CHANGE_PER_SECOND</type>
                        </step>
                    </preprocessing>
                </item>
                
                <!-- Item: 1 minute load average -->
                <item>
                    <uuid>load-average-1min</uuid>
                    <name>Load Average 1min</name>
                    <key>custom.load.avg1</key>
                    <delay>60s</delay>
                    <value_type>FLOAT</value_type>
                    <description>1 minute load average</description>
                </item>
                
                <!-- Item: number of established TCP connections -->
                <item>
                    <uuid>tcp-connections</uuid>
                    <name>TCP Connections</name>
                    <key>custom.tcp.connections</key>
                    <delay>60s</delay>
                    <value_type>UINT64</value_type>
                    <description>Number of established TCP connections</description>
                </item>
            </items>
            
            <triggers>
                <!-- Trigger: last CPU utilization value above 80 -->
                <trigger>
                    <uuid>cpu-high-trigger</uuid>
                    <expression>last(/Linux Advanced Monitoring/custom.cpu.util)&gt;80</expression>
                    <name>High CPU utilization on {HOST.NAME}</name>
                    <priority>WARNING</priority>
                    <description>CPU utilization is above 80%</description>
                </trigger>
                
                <!-- Trigger: last memory utilization value above 90 -->
                <trigger>
                    <uuid>memory-high-trigger</uuid>
                    <expression>last(/Linux Advanced Monitoring/custom.memory.util)&gt;90</expression>
                    <name>High memory utilization on {HOST.NAME}</name>
                    <priority>HIGH</priority>
                    <description>Memory utilization is above 90%</description>
                </trigger>
                
                <!-- Trigger: last root filesystem utilization value above 85 -->
                <trigger>
                    <uuid>disk-high-trigger</uuid>
                    <expression>last(/Linux Advanced Monitoring/custom.disk.util[/])&gt;85</expression>
                    <name>High disk utilization on {HOST.NAME}</name>
                    <priority>WARNING</priority>
                    <description>Root filesystem utilization is above 85%</description>
                </trigger>
                
                <!-- Trigger: last 1 minute load average above 4 (fixed value, not scaled by CPU count) -->
                <trigger>
                    <uuid>load-high-trigger</uuid>
                    <expression>last(/Linux Advanced Monitoring/custom.load.avg1)&gt;4</expression>
                    <name>High system load on {HOST.NAME}</name>
                    <priority>WARNING</priority>
                    <description>1 minute load average is above 4</description>
                </trigger>
            </triggers>
            
            <graphs>
                <!-- Graph: CPU utilization -->
                <graph>
                    <uuid>cpu-graph</uuid>
                    <name>CPU Utilization</name>
                    <graph_items>
                        <graph_item>
                            <color>FF0000</color>
                            <item>
                                <host>Linux Advanced Monitoring</host>
                                <key>custom.cpu.util</key>
                            </item>
                        </graph_item>
                    </graph_items>
                </graph>
                
                <!-- Graph: memory utilization -->
                <graph>
                    <uuid>memory-graph</uuid>
                    <name>Memory Utilization</name>
                    <graph_items>
                        <graph_item>
                            <color>00FF00</color>
                            <item>
                                <host>Linux Advanced Monitoring</host>
                                <key>custom.memory.util</key>
                            </item>
                        </graph_item>
                    </graph_items>
                </graph>
                
                <!-- Graph: inbound network traffic on eth0 -->
                <graph>
                    <uuid>network-graph</uuid>
                    <name>Network Traffic eth0</name>
                    <graph_items>
                        <graph_item>
                            <color>0000FF</color>
                            <item>
                                <host>Linux Advanced Monitoring</host>
                                <key>custom.network.bytes.in[eth0]</key>
                            </item>
                        </graph_item>
                    </graph_items>
                </graph>
            </graphs>
        </template>
    </templates>
</zabbix_export>

性能调优与优化

1. 监控系统性能优化

#!/bin/bash

# Monitoring stack performance review script

OPTIMIZATION_LOG="/var/log/monitoring-optimization.log"
CONFIG_BACKUP_DIR="/etc/monitoring/backups"

# Make sure the backup directory exists before any function copies into it
# (quoted so a directory name containing whitespace is created as one path)
mkdir -p -- "$CONFIG_BACKUP_DIR"

# Review the Prometheus installation: back up its configuration, append a
# list of tuning recommendations to the log and report the storage footprint.
# Globals:
#   OPTIMIZATION_LOG  - appended to
#   CONFIG_BACKUP_DIR - receives a timestamped copy of prometheus.yml
# Does nothing beyond the banner line when prometheus.yml is absent.
function optimize_prometheus() {
    local prometheus_config="/etc/prometheus/prometheus.yml"
    local data_dir="/var/lib/prometheus"
    local storage_size

    echo "优化Prometheus配置..." | tee -a "$OPTIMIZATION_LOG"

    if [ -f "$prometheus_config" ]; then
        # Timestamped backup of the current configuration
        cp -- "$prometheus_config" "$CONFIG_BACKUP_DIR/prometheus.yml.$(date +%Y%m%d_%H%M%S)"

        # Recommendations are only written to the log, not to the terminal
        cat >> "$OPTIMIZATION_LOG" << 'EOF'
Prometheus 性能优化建议:
1. 调整scrape_interval，减少不必要的频繁采集
2. 使用recording rules预计算复杂查询
3. 配置适当的retention时间
4. 启用压缩以减少存储空间
5. 使用federation分层部署
EOF

        # Report how much disk space the data directory occupies
        if [ -d "$data_dir" ]; then
            storage_size=$(du -sh "$data_dir" | cut -f1)
            echo "Prometheus存储使用: $storage_size" | tee -a "$OPTIMIZATION_LOG"
        fi
    fi
}

# Review the Node Exporter systemd unit: show its ExecStart line and append
# tuning recommendations to the log.
# Globals:
#   OPTIMIZATION_LOG - appended to
# Does nothing beyond the banner line when the unit file is absent.
function optimize_node_exporter() {
    local service_file="/etc/systemd/system/node_exporter.service"

    echo "优化Node Exporter配置..." | tee -a "$OPTIMIZATION_LOG"

    if [ -f "$service_file" ]; then
        # Show how the exporter is currently started
        echo "当前Node Exporter配置:" | tee -a "$OPTIMIZATION_LOG"
        grep ExecStart "$service_file" | tee -a "$OPTIMIZATION_LOG"

        # Recommendations are only written to the log, not to the terminal
        cat >> "$OPTIMIZATION_LOG" << 'EOF'
Node Exporter 性能优化建议:
1. 禁用不需要的collector以减少资源使用
2. 调整--collector.filesystem.ignored-mount-points
3. 使用--collector.textfile.directory启用自定义指标
4. 配置适当的--web.max-requests限制
EOF
    fi
}

# Check file-descriptor limits, memory usage and disk I/O of the host.
# Globals:
#   OPTIMIZATION_LOG        - appended to
#   MONITORING_LIMITS_CONF  - optional; limits file to extend
#                             (default /etc/security/limits.conf)
# When the current descriptor limit is below 65536, nofile entries for the
# prometheus and zabbix users are appended to the limits file once.
function optimize_system_resources() {
    local limits_conf="${MONITORING_LIMITS_CONF:-/etc/security/limits.conf}"
    local current_fd_limit mem_total mem_used mem_usage_percent

    echo "优化系统资源配置..." | tee -a "$OPTIMIZATION_LOG"

    # File descriptor limit of the current shell
    current_fd_limit=$(ulimit -n)
    echo "当前文件描述符限制: $current_fd_limit" | tee -a "$OPTIMIZATION_LOG"

    # "unlimited" is not a number and needs no raising
    if [[ $current_fd_limit =~ ^[0-9]+$ ]] && [ "$current_fd_limit" -lt 65536 ]; then
        echo "建议增加文件描述符限制到65536" | tee -a "$OPTIMIZATION_LOG"

        # Look for an entry this function writes. The previous check searched
        # for the word "monitoring", which the appended block never contains,
        # so the block was added again on every run.
        if ! grep -Eq '^prometheus[[:space:]]+soft[[:space:]]+nofile' "$limits_conf" 2>/dev/null; then
            cat >> "$limits_conf" << 'EOF'
# 监控系统优化
prometheus soft nofile 65536
prometheus hard nofile 65536
zabbix soft nofile 65536
zabbix hard nofile 65536
EOF
        fi
    fi

    # Memory usage, taken from the "Mem:" row of free(1)
    mem_total=$(free -m | awk 'NR==2{print $2}')
    mem_used=$(free -m | awk 'NR==2{print $3}')

    # Skip the percentage when free produced nothing usable (avoids a
    # division by zero or an arithmetic syntax error)
    if [[ $mem_total =~ ^[0-9]+$ && $mem_used =~ ^[0-9]+$ ]] && [ "$mem_total" -gt 0 ]; then
        mem_usage_percent=$((mem_used * 100 / mem_total))

        echo "内存使用情况: ${mem_used}MB / ${mem_total}MB (${mem_usage_percent}%)" | tee -a "$OPTIMIZATION_LOG"

        if [ "$mem_usage_percent" -gt 80 ]; then
            echo "警告: 内存使用率过高，建议优化监控配置或增加内存" | tee -a "$OPTIMIZATION_LOG"
        fi
    fi

    # Disk I/O statistics, when sysstat is installed
    if command -v iostat &> /dev/null; then
        echo "磁盘I/O统计:" | tee -a "$OPTIMIZATION_LOG"
        iostat -x 1 3 | tail -n +4 | tee -a "$OPTIMIZATION_LOG"
    fi
}

# Report the number of established TCP connections and the maximum socket
# buffer sizes, and recommend raising buffers that are below 16 MiB.
# Globals:
#   OPTIMIZATION_LOG - appended to
function optimize_network() {
    local tcp_connections
    local net_core_rmem_max=""
    local net_core_wmem_max=""

    echo "优化网络配置..." | tee -a "$OPTIMIZATION_LOG"

    # Established TCP connections
    tcp_connections=$(netstat -an | grep -c ESTABLISHED)
    echo "当前TCP连接数: $tcp_connections" | tee -a "$OPTIMIZATION_LOG"

    # Kernel socket buffer ceilings; the files may be missing in containers
    if [ -r /proc/sys/net/core/rmem_max ]; then
        net_core_rmem_max=$(cat /proc/sys/net/core/rmem_max)
    fi
    if [ -r /proc/sys/net/core/wmem_max ]; then
        net_core_wmem_max=$(cat /proc/sys/net/core/wmem_max)
    fi

    echo "网络接收缓冲区最大值: $net_core_rmem_max" | tee -a "$OPTIMIZATION_LOG"
    echo "网络发送缓冲区最大值: $net_core_wmem_max" | tee -a "$OPTIMIZATION_LOG"

    # Only compare values that are actually numeric
    if [[ $net_core_rmem_max =~ ^[0-9]+$ ]] && [ "$net_core_rmem_max" -lt 16777216 ]; then
        echo "建议增加网络接收缓冲区: echo 16777216 > /proc/sys/net/core/rmem_max" | tee -a "$OPTIMIZATION_LOG"
    fi

    if [[ $net_core_wmem_max =~ ^[0-9]+$ ]] && [ "$net_core_wmem_max" -lt 16777216 ]; then
        echo "建议增加网络发送缓冲区: echo 16777216 > /proc/sys/net/core/wmem_max" | tee -a "$OPTIMIZATION_LOG"
    fi
}

# Report oversized log files and make sure the monitoring logs are rotated.
# Globals:
#   OPTIMIZATION_LOG - appended to
# Lists every *.log file under /var/log larger than 100 MB and, when
# logrotate is configured on the host, creates /etc/logrotate.d/monitoring
# unless that file already exists.
function optimize_logging() {
    local monitoring_logrotate="/etc/logrotate.d/monitoring"
    local large_log log_size

    echo "优化日志配置..." | tee -a "$OPTIMIZATION_LOG"

    # "IFS= read -r" keeps backslashes and surrounding blanks in file names
    find /var/log -name "*.log" -size +100M 2>/dev/null | while IFS= read -r large_log; do
        log_size=$(du -h "$large_log" | cut -f1)
        echo "发现大日志文件: $large_log ($log_size)" | tee -a "$OPTIMIZATION_LOG"
    done

    # Rotation rules are only added on hosts that use logrotate
    if [ -f /etc/logrotate.conf ]; then
        echo "检查logrotate配置..." | tee -a "$OPTIMIZATION_LOG"

        if [ ! -f "$monitoring_logrotate" ]; then
            cat > "$monitoring_logrotate" << 'EOF'
/var/log/prometheus/*.log {
    daily
    missingok
    rotate 7
    compress
    delaycompress
    notifempty
    sharedscripts
    postrotate
        systemctl reload prometheus
    endscript
}

/var/log/zabbix/*.log {
    daily
    missingok
    rotate 7
    compress
    delaycompress
    notifempty
    sharedscripts
    postrotate
        systemctl reload zabbix-agent2
    endscript
}

/var/log/system-monitor/*.log {
    daily
    missingok
    rotate 30
    compress
    delaycompress
    notifempty
}
EOF
            echo "创建监控日志轮转配置: $monitoring_logrotate" | tee -a "$OPTIMIZATION_LOG"
        fi
    fi
}

# Write a plain-text summary of the host and monitoring-service state.
# Globals:
#   OPTIMIZATION_LOG - read; lines containing 建议, 警告 or 优化 are copied
#                      into the report's recommendation section
# Arguments:
#   $1 - optional report path (default /var/log/monitoring-optimization-report.txt)
# Returns:
#   0 after the report was written, 1 when the report file cannot be created.
function generate_optimization_report() {
    local report_file="${1:-/var/log/monitoring-optimization-report.txt}"
    local timestamp service status
    timestamp=$(date +"%Y-%m-%d %H:%M:%S")

    # Fail loudly instead of announcing a report that was never written
    if ! true > "$report_file"; then
        echo "无法写入优化报告: $report_file" >&2
        return 1
    fi

    {
        echo "==========================================="
        echo "监控系统优化报告 - $timestamp"
        echo "==========================================="
        echo ""

        echo "=== 系统资源使用情况 ==="
        echo "CPU核心数: $(nproc)"
        echo "内存总量: $(free -h | awk 'NR==2{print $2}')"
        echo "磁盘使用情况:"
        df -h | grep -E '^/dev/'
        echo ""

        # Only services that have a unit file on this host are listed
        echo "=== 监控服务状态 ==="
        for service in prometheus node_exporter grafana-server zabbix-agent2; do
            if systemctl list-unit-files | grep -q "^$service"; then
                status=$(systemctl is-active "$service" 2>/dev/null)
                echo "$service: $status"
            fi
        done
        echo ""

        echo "=== 性能指标 ==="
        echo "当前负载: $(cat /proc/loadavg)"
        echo "内存使用率: $(free | awk 'NR==2{printf "%.2f%%", $3*100/$2}')"
        echo "TCP连接数: $(netstat -an | grep ESTABLISHED | wc -l)"
        echo "文件描述符使用: $(cat /proc/sys/fs/file-nr | awk '{print $1}') / $(cat /proc/sys/fs/file-max)"
        echo ""

        echo "=== 优化建议 ==="
        grep -E "建议|警告|优化" "$OPTIMIZATION_LOG"

    } > "$report_file"

    echo "优化报告已生成: $report_file"
}

# Entry point of the optimization review: start a fresh log, run every check
# in turn and finish with the summary report.
# Globals:
#   OPTIMIZATION_LOG - truncated at the start, then appended to
function main() {
    # tee without -a deliberately starts a new log for this run
    echo "开始监控系统性能优化检查..." | tee "$OPTIMIZATION_LOG"

    optimize_prometheus
    optimize_node_exporter
    optimize_system_resources
    optimize_network
    optimize_logging
    generate_optimization_report

    echo "监控系统优化检查完成！" | tee -a "$OPTIMIZATION_LOG"
}

# Run the entry point
main

最佳实践与总结

监控系统设计原则

分层监控原则
- 基础设施层：CPU、内存、磁盘、网络
- 应用层：服务状态、性能指标、业务指标
- 用户体验层：响应时间、可用性、错误率
告警设计原则
- 避免告警疲劳，设置合理的阈值
- 实施告警分级，区分紧急和一般告警
- 建立告警升级机制
- 定期回顾和调整告警规则
数据保留策略
- 高精度短期数据（1分钟，保留7天）
- 中精度中期数据（5分钟，保留30天）
- 低精度长期数据（1小时，保留1年）
性能优化要点
- 合理配置采集频率
- 使用标签和维度进行数据分组
- 实施数据压缩和存储优化
- 定期清理历史数据

常见问题解决方案

1. 监控数据丢失

检查网络连接和防火墙设置
验证监控agent的运行状态
检查存储空间是否充足
查看监控系统的错误日志

2. 告警风暴

实施告警抑制和分组
调整告警阈值和持续时间
使用依赖关系减少重复告警
建立告警静默机制

3. 性能问题

优化查询语句和聚合规则
增加系统资源（CPU、内存、存储）
实施监控系统的水平扩展
使用缓存减少重复计算

技术选型建议

1. 小型环境（<50台服务器）

推荐方案：Prometheus + Grafana + AlertManager
优势：轻量级、易部署、社区活跃
适用场景：中小型企业、开发测试环境

2. 中型环境（50-500台服务器）

推荐方案：Zabbix 或 Prometheus集群
优势：功能完善、扩展性好、管理界面友好
适用场景：中型企业、生产环境

3. 大型环境（>500台服务器）

推荐方案：Prometheus联邦 + Thanos 或企业级解决方案
优势：高可用、高性能、长期存储
适用场景：大型企业、云环境

结语

Linux系统监控是现代IT运维的基石，一个完善的监控体系能够显著提升系统的可靠性和运维效率。通过本文的深入探讨，我们了解了从基础监控工具到企业级监控解决方案的完整技术栈。

在实施监控系统时，需要根据实际业务需求和技术环境选择合适的方案。同时，监控系统本身也需要持续优化和改进，以适应不断变化的业务需求和技术发展。

记住，监控不是目的，而是手段。最终目标是通过有效的监控来保障系统稳定运行，提升用户体验，支撑业务发展。希望本文能为读者在构建和优化Linux监控系统时提供有价值的参考和指导。

应用日志告警" "$alert_msg"
fi
}

# Derive performance metrics for one application from its log file and append
# them to the temporary Prometheus metrics file.
# Globals:
#   METRICS_FILE - read; metrics are appended to "<METRICS_FILE>.tmp"
# Arguments:
#   $1 - application name (used as the "app" label)
#   $2 - path of the log file to analyse
# Emits the average of all "response_time=<n>" / "response_time:<n>" values
# and the number of lines containing "request". Returns without writing
# anything when the log file does not exist.
function analyze_app_performance() {
    local app_name=$1
    local log_file=$2
    local metrics_tmp="${METRICS_FILE}.tmp"
    local avg_response_time request_count

    if [ ! -f "$log_file" ]; then
        return
    fi

    # Average response time. Two-argument match() with RSTART/RLENGTH works in
    # every awk, unlike the gawk-only three-argument form. The value starts 14
    # characters into the match: 13 for "response_time" plus the separator.
    avg_response_time=$(awk '
        match($0, /response_time[=:][0-9.]+/) {
            sum += substr($0, RSTART + 14, RLENGTH - 14)
            count++
        }
        END {
            if (count > 0) print sum/count
            else print 0
        }' "$log_file")
    avg_response_time=${avg_response_time:-0}

    # The exposition format allows only one HELP and one TYPE line per metric
    # name, so emit them only for the first application written to the file.
    if ! grep -q '^# TYPE app_response_time_seconds ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_response_time_seconds Average response time in seconds" >> "$metrics_tmp"
        echo "# TYPE app_response_time_seconds gauge" >> "$metrics_tmp"
    fi
    echo "app_response_time_seconds{app=\"$app_name\"} $avg_response_time" >> "$metrics_tmp"

    # Request volume. grep -c already prints "0" (with exit status 1) when
    # nothing matches, so only an empty result needs a fallback; appending
    # "|| echo 0" would produce a second, stray zero.
    request_count=$(grep -c "request" "$log_file" 2>/dev/null)
    request_count=${request_count:-0}

    if ! grep -q '^# TYPE app_requests_total ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_requests_total Total number of requests" >> "$metrics_tmp"
        echo "# TYPE app_requests_total counter" >> "$metrics_tmp"
    fi
    echo "app_requests_total{app=\"$app_name\"} $request_count" >> "$metrics_tmp"
}

# 日志轮转检查

function check_log_rotation() {
local current_time=$(date +”%Y-%m-%d %H:%M:%S”)

# 检查大文件
find /var/log -name "*.log" -size +100M 2>/dev/null | while read large_file; do
    local file_size=$(du -h "$large_file" | cut -f1)
    echo "[$current_time] [WARNING] 发现大日志文件: $large_file ($file_size)" >> $ALERT_LOG
done

# 检查磁盘空间
local log_disk_usage=$(df /var/log | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
if [ $log_disk_usage -gt 80 ]; then
    echo "[$current_time] [WARNING] 日志分区磁盘使用率过高: ${log_disk_usage}%" >> $ALERT_LOG
    send_app_alert "WARNING" "