Linux系统监控:构建全方位运维监控体系

在现代IT基础设施中,Linux系统监控是保障服务稳定运行的关键环节。一个完善的监控体系不仅能够及时发现问题,还能预测潜在风险,为运维决策提供数据支撑。本文将深入探讨Linux系统监控的各个方面,从基础监控指标到企业级监控架构的设计与实现。

系统监控基础理论

监控的核心价值

系统监控在现代运维中发挥着至关重要的作用:

  1. 故障预防:通过监控关键指标,在问题发生前进行预警
  2. 快速定位:当故障发生时,快速定位问题根源
  3. 性能优化:基于监控数据进行系统性能调优
  4. 容量规划:为系统扩容和资源规划提供数据依据
  5. SLA保障:确保服务水平协议的达成

监控指标体系

1. 系统资源监控

CPU监控指标

  • CPU使用率(user、system、idle、iowait)
  • 负载平均值(1分钟、5分钟、15分钟)
  • CPU核心数和频率
  • 进程和线程数量

内存监控指标

  • 内存使用率和可用内存
  • Swap使用情况
  • 缓存和缓冲区使用量
  • 内存分配和释放速率

磁盘监控指标

  • 磁盘使用率和可用空间
  • 磁盘I/O读写速率
  • 磁盘队列长度和响应时间
  • inode使用情况

网络监控指标

  • 网络接口流量(入站/出站)
  • 网络连接数和状态
  • 网络错误和丢包率
  • TCP连接状态分布

2. 应用层监控

进程监控

  • 进程状态和资源使用
  • 进程启动时间和运行时长
  • 进程文件描述符使用
  • 进程内存映射

服务监控

  • 服务可用性和响应时间
  • 服务端口监听状态
  • 服务日志错误统计
  • 服务依赖关系检查

基础监控工具与命令

系统内置监控命令

1. 实时监控命令

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash

# Print a real-time CPU and memory report: a batch-mode `top` snapshot,
# per-core CPU details, memory totals and the system load averages.
# Outputs: report on stdout.
monitor_cpu_memory() {
  printf '%s\n' "=== CPU和内存监控 ==="

  # One non-interactive top iteration; only its first 20 lines are kept.
  top -b -n 1 | head -n 20

  printf '\n%s\n' "=== 详细CPU信息 ==="
  # Core index, model name and frequency (first 10 matching lines only).
  grep -E "processor|model name|cpu MHz" /proc/cpuinfo | head -n 10

  printf '\n%s\n' "=== 内存详细信息 ==="
  free -h
  grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo

  printf '\n%s\n' "=== 负载平均值 ==="
  uptime
  cat /proc/loadavg
}

# Print a disk report: space usage, inode usage, extended I/O statistics
# and per-process I/O activity. The last two sections depend on optional
# tools and degrade to a hint when the tool is not installed.
# Outputs: report on stdout.
monitor_disk() {
  printf '%s\n' "=== 磁盘使用监控 ==="
  df -h

  printf '\n%s\n' "=== inode使用情况 ==="
  df -i

  printf '\n%s\n' "=== 磁盘I/O统计 ==="
  # iostat ships with the sysstat package; three samples, one second apart.
  if ! command -v iostat &> /dev/null; then
    printf '%s\n' "iostat命令未安装,请安装sysstat包"
  else
    iostat -x 1 3
  fi

  printf '\n%s\n' "=== 磁盘读写活动 ==="
  # Single batch-mode iotop iteration.
  if ! command -v iotop &> /dev/null; then
    printf '%s\n' "iotop命令未安装"
  else
    iotop -b -n 1
  fi
}

# Print a network report: raw interface counters, TCP connection counts
# grouped by state, listening sockets, interface addresses and routes.
# Outputs: report on stdout.
monitor_network() {
  printf '%s\n' "=== 网络监控 ==="
  # Kernel per-interface counters.
  cat /proc/net/dev

  printf '\n%s\n' "=== 网络连接统计 ==="
  # Tally TCP sockets by the last column of netstat (the connection state).
  netstat -an \
    | awk '/^tcp/ { tally[$NF] += 1 }
           END { for (st in tally) print st "\t" tally[st] }'

  printf '\n%s\n' "=== 监听端口 ==="
  netstat -tlnp

  printf '\n%s\n' "=== 网络接口信息 ==="
  ip addr show

  printf '\n%s\n' "=== 路由表 ==="
  ip route show
}

# Print a process report: counts by state, the top CPU and memory
# consumers, system-wide file-descriptor usage and the ten processes
# holding the most open file descriptors.
# Outputs: report on stdout.
monitor_processes() {
  echo "=== 进程监控 ==="

  # Derive every counter from a single `ps aux` snapshot so the numbers are
  # mutually consistent. NR > 1 skips the header row: it is not a process,
  # and its "STAT" column would otherwise match /^S/ and inflate the
  # sleeping count.
  ps aux | awk '
    NR > 1 {
      total++
      if ($8 ~ /^R/) running++
      else if ($8 ~ /^S/) sleeping++
      else if ($8 ~ /^Z/) zombie++
    }
    END {
      printf "总进程数: %d\n", total
      printf "运行中进程数: %d\n", running
      printf "睡眠进程数: %d\n", sleeping
      printf "僵尸进程数: %d\n", zombie
    }'

  # head -11 = header row + 10 processes.
  echo -e "\n=== CPU使用率最高的10个进程 ==="
  ps aux --sort=-%cpu | head -11

  echo -e "\n=== 内存使用率最高的10个进程 ==="
  ps aux --sort=-%mem | head -11

  echo -e "\n=== 文件描述符使用情况 ==="
  echo "系统文件描述符限制: $(cat /proc/sys/fs/file-max)"
  echo "当前使用的文件描述符: $(awk '{print $1}' /proc/sys/fs/file-nr)"

  echo -e "\n=== 文件描述符使用最多的10个进程 ==="
  # Walk every /proc/<pid>/fd directory (not just the first few PIDs), so
  # the ranking really is a top 10 over all readable processes. Directories
  # we may not read simply count as 0 and are skipped.
  local fd_dir pid fd_count cmd
  for fd_dir in /proc/[0-9]*/fd; do
    [[ -d "$fd_dir" ]] || continue
    pid=${fd_dir#/proc/}
    pid=${pid%/fd}
    # Entries are plain integers, so counting `ls` lines is safe here.
    fd_count=$(ls -1 "$fd_dir" 2>/dev/null | wc -l)
    (( fd_count > 0 )) || continue
    cmd=$(cat "/proc/$pid/comm" 2>/dev/null)
    # Count goes first so a command name containing spaces cannot shift
    # the sort key.
    echo "$fd_count $pid $cmd"
  done | sort -k1,1nr | head -10 | while read -r fd_count pid cmd; do
    echo "$pid $cmd $fd_count"
  done
}

# Print a service report: failed systemd units, the state of a fixed list
# of critical services, system uptime and the most recent logins.
# The systemd sections are skipped when systemctl is not available.
# Outputs: report on stdout.
monitor_services() {
  echo "=== 系统服务监控 ==="

  if command -v systemctl &> /dev/null; then
    echo "=== 失败的服务 ==="
    systemctl --failed

    echo -e "\n=== 关键服务状态 ==="
    local unit_files service status enabled
    local -a critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog")

    # Query the unit list once instead of once per service.
    unit_files=$(systemctl list-unit-files 2>/dev/null)
    for service in "${critical_services[@]}"; do
      # Match the exact "<name>.service" unit. A bare prefix match would
      # let "network" hit "network-online.target" and report a service
      # that is not installed.
      if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files"; then
        status=$(systemctl is-active "$service" 2>/dev/null)
        enabled=$(systemctl is-enabled "$service" 2>/dev/null)
        echo "$service: $status ($enabled)"
      fi
    done
  fi

  echo -e "\n=== 系统启动时间 ==="
  uptime

  echo -e "\n=== 最近登录用户 ==="
  last | head -10
}

# Entry point of the report: print a banner identifying the host, then run
# every monitor section in order, separated by blank lines.
# Outputs: full report on stdout.
main_monitor() {
  local bar="==========================================="
  local section
  local first_section=1

  echo "$bar"
  echo "Linux系统监控报告 - $(date)"
  echo "主机名: $(hostname)"
  echo "内核版本: $(uname -r)"
  # PRETTY_NAME is stored as KEY="value"; field 2 of a '"' split is the value.
  echo "系统版本: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
  echo "$bar"

  for section in \
    monitor_cpu_memory \
    monitor_disk \
    monitor_network \
    monitor_processes \
    monitor_services; do
    # Separator goes between sections, not before the first one.
    if (( first_section )); then
      first_section=0
    else
      echo -e "\n"
    fi
    "$section"
  done
}

# 执行监控
main_monitor

2. 高级监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/bin/bash

# System performance monitoring script.
# Collects detailed performance data into per-day log files.

# Directory that receives every log written by this script.
MONITOR_DIR='/var/log/system-monitor'
# Day stamp used in log file names; TIMESTAMP prefixes section markers.
DATE="$(date +'%Y%m%d')"
TIMESTAMP="$(date +'%Y-%m-%d %H:%M:%S')"

# Make sure the log directory exists before any collector writes to it.
mkdir -p "$MONITOR_DIR"

# Append one snapshot of raw kernel performance counters (CPU, memory,
# disk, network) to the daily performance log.
# Globals:  MONITOR_DIR, DATE, TIMESTAMP (read)
# Outputs:  appends to $MONITOR_DIR/performance_$DATE.log
collect_performance_data() {
  local output_file="$MONITOR_DIR/performance_$DATE.log"

  {
    printf '%s\n' "[$TIMESTAMP] === 性能数据收集开始 ==="

    # --- CPU ---
    printf '%s\n' "[CPU] 负载平均值:"
    cat /proc/loadavg

    printf '%s\n' "[CPU] CPU使用率详情:"
    # The aggregate line is "cpu" followed by a space; per-core lines are cpuN.
    grep 'cpu ' /proc/stat

    printf '%s\n' "[CPU] 上下文切换和中断:"
    grep -E 'ctxt|intr|processes|procs_running|procs_blocked' /proc/stat

    # --- Memory ---
    printf '%s\n' "[MEMORY] 内存使用详情:"
    cat /proc/meminfo

    printf '%s\n' "[MEMORY] 虚拟内存统计:"
    grep -E 'pgpgin|pgpgout|pswpin|pswpout|pgfault|pgmajfault' /proc/vmstat

    # --- Disk ---
    printf '%s\n' "[DISK] 磁盘统计:"
    cat /proc/diskstats

    # --- Network ---
    printf '%s\n' "[NETWORK] 网络接口统计:"
    cat /proc/net/dev

    printf '%s\n' "[NETWORK] TCP统计:"
    grep Tcp /proc/net/snmp

    printf '%s\n' "[$TIMESTAMP] === 性能数据收集结束 ==="
    printf '\n'
  } >> "$output_file"
}

# Run threshold checks (CPU load, memory, disk usage, zombie processes,
# critical services) and record the results.
# Globals:  MONITOR_DIR, DATE, TIMESTAMP (read)
# Outputs:  appends the check log to $MONITOR_DIR/health_check_$DATE.log;
#           every "[ALERT]" line is additionally appended to
#           $MONITOR_DIR/alerts_$DATE.log
system_health_check() {
  local output_file="$MONITOR_DIR/health_check_$DATE.log"
  local alert_file="$MONITOR_DIR/alerts_$DATE.log"
  local load_1min rest cpu_cores load_threshold
  local mem_total mem_available mem_usage_percent
  local zombie_count unit_files service
  local -a critical_services=("sshd" "network" "firewalld")

  {
    echo "[$TIMESTAMP] === 系统健康检查开始 ==="

    # CPU load: alert when the 1-minute load exceeds 80% of the core count.
    # Floating-point math is done in awk so the check does not depend on
    # bc being installed.
    read -r load_1min rest < /proc/loadavg
    cpu_cores=$(nproc 2>/dev/null) || cpu_cores=1
    load_threshold=$(awk -v c="$cpu_cores" 'BEGIN { printf "%.2f", c * 0.8 }')
    if awk -v l="$load_1min" -v t="$load_threshold" 'BEGIN { exit !(l + 0 > t + 0) }'; then
      echo "[ALERT] CPU负载过高: $load_1min (阈值: $load_threshold)" | tee -a "$alert_file"
    fi

    # Memory: alert above 90% used, where used = MemTotal - MemAvailable.
    # Skipped when either value is missing, to avoid a division by zero.
    mem_total=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
    mem_available=$(awk '/^MemAvailable:/ {print $2}' /proc/meminfo)
    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_available" =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
      mem_usage_percent=$(awk -v t="$mem_total" -v a="$mem_available" \
        'BEGIN { printf "%.2f", (t - a) * 100 / t }')
      if awk -v p="$mem_usage_percent" 'BEGIN { exit !(p + 0 > 90) }'; then
        echo "[ALERT] 内存使用率过高: ${mem_usage_percent}%" | tee -a "$alert_file"
      fi
    fi

    # Disk: alert above 90% used. "pct + 0" forces a numeric comparison;
    # comparing the stripped string directly would sort "100" below "90".
    # -P keeps each filesystem on a single line.
    df -P | awk 'NR > 1 {
      pct = $5
      sub(/%/, "", pct)
      if (pct + 0 > 90) print "[ALERT] 磁盘使用率过高: " $6 " (" pct "%)"
    }' | tee -a "$alert_file"

    # Zombies: any process whose STAT column starts with Z.
    zombie_count=$(ps aux | awk '$8 ~ /^Z/ {count++} END {print count+0}')
    if (( zombie_count > 0 )); then
      echo "[ALERT] 发现僵尸进程: $zombie_count 个" | tee -a "$alert_file"
    fi

    # Critical services: only units that are installed are checked. The
    # unit list is fetched once and matched on the exact "<name>.service"
    # so "network" does not match "network-online.target".
    unit_files=$(systemctl list-unit-files 2>/dev/null)
    for service in "${critical_services[@]}"; do
      if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files" \
        && ! systemctl is-active --quiet "$service"; then
        echo "[ALERT] 关键服务未运行: $service" | tee -a "$alert_file"
      fi
    done

    echo "[$TIMESTAMP] === 系统健康检查结束 ==="
    echo ""
  } >> "$output_file"
}

# Append a snapshot of TCP connection activity to the daily network log:
# connections per state, the ten remote addresses with the most
# established connections, listening sockets and per-interface byte
# counters.
# Globals:  MONITOR_DIR, DATE, TIMESTAMP (read)
# Outputs:  appends to $MONITOR_DIR/network_connections_$DATE.log
monitor_network_connections() {
  local output_file="$MONITOR_DIR/network_connections_$DATE.log"

  {
    printf '%s\n' "[$TIMESTAMP] === 网络连接监控开始 ==="

    printf '%s\n' "[TCP] 连接状态统计:"
    # The last netstat column is the connection state.
    netstat -an \
      | awk '/^tcp/ { tally[$NF] += 1 }
             END { for (st in tally) print st "\t" tally[st] }'

    printf '%s\n' "[TCP] 连接数最多的IP地址 (Top 10):"
    # Column 5 is the peer "addr:port"; keep the part before the first ':'.
    netstat -an \
      | grep ESTABLISHED \
      | awk '{print $5}' \
      | cut -d: -f1 \
      | sort \
      | uniq -c \
      | sort -nr \
      | head -n 10

    printf '%s\n' "[TCP] 当前监听端口:"
    netstat -tlnp | grep LISTEN

    printf '%s\n' "[NETWORK] 网络接口流量:"
    # The first two lines of /proc/net/dev are headers; field 2 is printed
    # as RX bytes and field 10 as TX bytes.
    awk 'NR>2 {printf "%-10s RX: %10d bytes TX: %10d bytes\n", $1, $2, $10}' /proc/net/dev

    printf '%s\n' "[$TIMESTAMP] === 网络连接监控结束 ==="
    printf '\n'
  } >> "$output_file"
}

# Append the top CPU and memory consumers plus system-wide
# file-descriptor usage to the daily process log.
# Globals:  MONITOR_DIR, DATE, TIMESTAMP (read)
# Outputs:  appends to $MONITOR_DIR/process_resources_$DATE.log
monitor_process_resources() {
  local output_file="$MONITOR_DIR/process_resources_$DATE.log"

  {
    echo "[$TIMESTAMP] === 进程资源使用监控开始 ==="

    # head -11 = header row + 10 processes. The header is passed through
    # unchanged; data rows are reduced to PID, USER, %CPU, %MEM, COMMAND.
    echo "[PROCESS] CPU使用率最高的进程 (Top 10):"
    ps aux --sort=-%cpu | head -11 \
      | awk 'NR==1 {print $0} NR>1 {printf "%-8s %-10s %6s %6s %-20s\n", $2, $1, $3, $4, $11}'

    echo "[PROCESS] 内存使用最多的进程 (Top 10):"
    ps aux --sort=-%mem | head -11 \
      | awk 'NR==1 {print $0} NR>1 {printf "%-8s %-10s %6s %6s %-20s\n", $2, $1, $3, $4, $11}'

    echo "[PROCESS] 文件描述符使用统计:"
    echo "系统限制: $(cat /proc/sys/fs/file-max)"
    echo "当前使用: $(awk '{print $1}' /proc/sys/fs/file-nr)"
    # file-nr holds "<allocated> <unused> <max>", so the ratio can be
    # computed from that one file. A shell $(...) cannot be used inside the
    # single-quoted awk program: awk would see it as literal text.
    echo "使用率: $(awk '{ if ($3 > 0) printf "%.2f%%", ($1 / $3) * 100; else printf "N/A" }' /proc/sys/fs/file-nr)"

    echo "[$TIMESTAMP] === 进程资源使用监控结束 ==="
    echo ""
  } >> "$output_file"
}

# Delete *.log files older than 7 days from the monitoring directory.
# Globals:  MONITOR_DIR (read)
# Returns:  1 without deleting anything when MONITOR_DIR is empty or not a
#           directory; otherwise the exit status of find.
cleanup_old_logs() {
  # With an empty, unquoted path GNU find falls back to the current
  # directory, so refuse to run rather than delete unrelated logs.
  if [[ -z "${MONITOR_DIR:-}" || ! -d "$MONITOR_DIR" ]]; then
    echo "cleanup_old_logs: MONITOR_DIR is unset or not a directory" >&2
    return 1
  fi

  find "$MONITOR_DIR" -type f -name "*.log" -mtime +7 -delete
}

# Entry point: run every collector once, prune old logs, then print any
# alerts recorded in today's alert file.
# Globals:  MONITOR_DIR, DATE (read); alert_file (written)
main() {
  local step

  echo "开始系统监控数据收集..."

  for step in \
    collect_performance_data \
    system_health_check \
    monitor_network_connections \
    monitor_process_resources \
    cleanup_old_logs; do
    "$step"
  done

  echo "监控数据收集完成,日志保存在: $MONITOR_DIR"

  # Show today's alerts, but only when the file exists and is non-empty.
  alert_file="$MONITOR_DIR/alerts_$DATE.log"
  if [[ -f $alert_file && -s $alert_file ]]; then
    echo "发现系统告警:"
    cat "$alert_file"
  fi
}

# Running unprivileged is allowed; only print a hint that the collected
# data may be incomplete.
if (( EUID != 0 )); then
  echo "建议以root权限运行以获取完整的监控数据"
fi

# Start the collection run.
main

企业级监控解决方案

Prometheus + Grafana 监控架构

1. Prometheus 配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# prometheus.yml
#
# Main Prometheus server configuration: global defaults, rule files,
# Alertmanager endpoint and the list of scrape jobs. Nesting is expressed
# through indentation, so the indentation below is significant.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels attached to every series and alert leaving this server.
  external_labels:
    cluster: 'production'
    region: 'us-west-1'

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 5s
    metrics_path: /metrics

  # Node Exporter (host metrics)
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'server1:9100'
          - 'server2:9100'
          - 'server3:9100'
    scrape_interval: 10s
    metrics_path: /metrics
    # Strip the port so the instance label is just the host name.
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):.+'
        replacement: '${1}'

  # Application metrics
  - job_name: 'application'
    static_configs:
      - targets:
          - 'app1:8080'
          - 'app2:8080'
    metrics_path: /actuator/prometheus
    scrape_interval: 15s

  # MySQL
  - job_name: 'mysql'
    static_configs:
      - targets:
          - 'mysql-exporter:9104'
    scrape_interval: 30s

  # Redis
  - job_name: 'redis'
    static_configs:
      - targets:
          - 'redis-exporter:9121'
    scrape_interval: 30s

  # Nginx
  - job_name: 'nginx'
    static_configs:
      - targets:
          - 'nginx-exporter:9113'
    scrape_interval: 30s

  # Service discovery via Consul; an empty services list means "all services".
  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'consul:8500'
        services: []
    relabel_configs:
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: instance

2. 告警规则配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# rules/system-alerts.yml
#
# Alerting rules for host-level and application-level metrics. Nesting is
# expressed through indentation, so the indentation below is significant.
groups:
  - name: system.rules
    rules:
      # CPU usage. rate() is used instead of irate(): irate() only looks at
      # the last two samples, which makes it too spiky for alert conditions
      # combined with a "for" clause.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}"

      # Memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
          category: system
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}"

      # Disk usage (tmpfs excluded)
      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk usage detected"
          description: "Disk usage is above 90% on {{ $labels.instance }} mount point {{ $labels.mountpoint }}"

      # System load relative to the number of CPU cores (one idle-mode
      # series exists per core, so counting them yields the core count).
      - alert: HighSystemLoad
        expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 0.8
        for: 10m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High system load detected"
          description: "System load is above 80% of CPU cores for more than 10 minutes on {{ $labels.instance }}"

      # Established TCP connections
      - alert: HighNetworkConnections
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 5m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "High number of TCP connections"
          description: "Number of established TCP connections is above 1000 on {{ $labels.instance }}"

      # Scrape target unreachable
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          category: availability
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} is down"

      # Disk I/O utilisation (rate() for the same reason as HighCPUUsage)
      - alert: HighDiskIOUsage
        expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk I/O usage detected"
          description: "Disk I/O usage is above 80% for more than 5 minutes on {{ $labels.instance }} device {{ $labels.device }}"

  - name: application.rules
    rules:
      # 95th percentile response time
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          category: application
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is above 1 second for {{ $labels.instance }}"

      # Error rate. Both sides are aggregated per instance before dividing:
      # without aggregation the division matches series on identical label
      # sets, so each 5xx series would be divided by itself and the ratio
      # would always be 100%.
      - alert: HighErrorRate
        expr: sum by(instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by(instance) (rate(http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
          category: application
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for {{ $labels.instance }}"

3. Node Exporter 部署

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
#
# Node Exporter installation script.
#
# Downloads a pinned Node Exporter release, verifies its checksum, installs
# the binary under INSTALL_DIR, registers a systemd service running as a
# dedicated unprivileged account and starts it. Safe to re-run.
# Requires: sudo, wget, sha256sum, tar, systemd.

# Abort on the first failed step instead of installing a half-downloaded or
# missing binary.
set -euo pipefail

NODE_EXPORTER_VERSION="1.6.1"
# Dedicated service account. Deliberately not stored in USER, which the
# shell environment already uses for the invoking user's login name.
NODE_EXPORTER_USER="node_exporter"
INSTALL_DIR="/opt/node_exporter"
SERVICE_FILE="/etc/systemd/system/node_exporter.service"

RELEASE_NAME="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"
RELEASE_URL="https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}"

# Create the service account only when it does not exist yet, so a second
# run does not fail on useradd.
if ! id -u "$NODE_EXPORTER_USER" &> /dev/null; then
  sudo useradd --no-create-home --shell /bin/false "$NODE_EXPORTER_USER"
fi

# Work in a private temporary directory that is removed on any exit path.
work_dir=$(mktemp -d)
trap 'rm -rf -- "$work_dir"' EXIT
cd "$work_dir"

# Download the release together with the published checksum list and
# verify the archive before unpacking it.
wget "${RELEASE_URL}/${RELEASE_NAME}.tar.gz"
wget "${RELEASE_URL}/sha256sums.txt"
sha256sum --check --ignore-missing sha256sums.txt
tar xzf "${RELEASE_NAME}.tar.gz"

# Install the binary
sudo mkdir -p "$INSTALL_DIR"
sudo cp "${RELEASE_NAME}/node_exporter" "$INSTALL_DIR/"
sudo chown -R "$NODE_EXPORTER_USER:$NODE_EXPORTER_USER" "$INSTALL_DIR"

# Write the systemd unit. The here-document is unquoted on purpose so the
# variables expand; the shell also joins the backslash-continued ExecStart
# lines into a single line in the written file.
sudo tee "$SERVICE_FILE" > /dev/null <<EOF
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=$NODE_EXPORTER_USER
Group=$NODE_EXPORTER_USER
Type=simple
ExecStart=$INSTALL_DIR/node_exporter \
--collector.systemd \
--collector.processes \
--collector.interrupts \
--collector.tcpstat \
--collector.meminfo_numa \
--web.listen-address=:9100
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target
EOF

# Start the service
sudo systemctl daemon-reload
sudo systemctl enable node_exporter
sudo systemctl start node_exporter

# Show the service state. `status` exits non-zero for a unit that is not
# active, which must not abort the script before the final hint is printed.
sudo systemctl --no-pager status node_exporter || true

echo "Node Exporter 安装完成!"
echo "访问 http://$(hostname -I | awk '{print $1}'):9100/metrics 查看指标"

4. 自定义监控指标

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
#
# Custom metrics collector.
# Writes metrics in the Prometheus text format to a *.prom file that the
# node_exporter textfile collector can read.

METRICS_FILE="/var/lib/node_exporter/textfile_collector/custom_metrics.prom"
TMP_FILE="${METRICS_FILE}.tmp"

# Print all custom metrics to stdout.
# Every sample value must be a single number on the same line as the metric
# name; a stray extra line makes the whole file unparsable.
generate_metrics() {
  local kernel_version os_version unit_files service boot_time
  local security_updates=0
  local -a critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog" "docker")

  echo "# HELP custom_system_info System information"
  echo "# TYPE custom_system_info gauge"
  kernel_version=$(uname -r)
  # Source os-release in a subshell: this works whether or not VERSION_ID
  # is quoted in the file and does not leak its variables into this shell.
  os_version=$(. /etc/os-release 2>/dev/null && printf '%s' "${VERSION_ID:-}")
  echo "custom_system_info{kernel_version=\"$kernel_version\",os_version=\"$os_version\"} 1"

  echo ""
  echo "# HELP custom_disk_inodes_usage Disk inode usage percentage"
  echo "# TYPE custom_disk_inodes_usage gauge"
  # tmpfs, /dev and /run are skipped; "-" means the filesystem reports no
  # inode figures. -P keeps each filesystem on a single line.
  df -iP 2>/dev/null | awk 'NR > 1 && $1 !~ /^tmpfs/ && $6 != "/dev" && $6 != "/run" {
    pct = $5
    sub(/%/, "", pct)
    if (pct != "-" && pct != "")
      printf "custom_disk_inodes_usage{device=\"%s\",mountpoint=\"%s\"} %s\n", $1, $6, pct
  }'

  echo ""
  echo "# HELP custom_tcp_connection_states TCP connection states count"
  echo "# TYPE custom_tcp_connection_states gauge"
  # The state name is validated, never rewritten: editing the loop key
  # would look up a different array element (count 0) and merge distinct
  # states such as FIN_WAIT1 and FIN_WAIT2 into duplicate series.
  netstat -an 2>/dev/null | awk '/^tcp/ { count[$NF]++ } END {
    for (name in count)
      if (name ~ /^[A-Z_0-9]+$/)
        printf "custom_tcp_connection_states{state=\"%s\"} %d\n", name, count[name]
  }'

  echo ""
  echo "# HELP custom_process_count Process count by state"
  echo "# TYPE custom_process_count gauge"
  # Column 8 of `ps aux` is STAT; NR > 1 skips the header row.
  ps aux | awk 'NR > 1 {
    if ($8 ~ /^R/) running++
    else if ($8 ~ /^S/) sleeping++
    else if ($8 ~ /^D/) uninterruptible++
    else if ($8 ~ /^Z/) zombie++
    else if ($8 ~ /^T/) stopped++
    total++
  } END {
    printf "custom_process_count{state=\"running\"} %d\n", running + 0
    printf "custom_process_count{state=\"sleeping\"} %d\n", sleeping + 0
    printf "custom_process_count{state=\"uninterruptible\"} %d\n", uninterruptible + 0
    printf "custom_process_count{state=\"zombie\"} %d\n", zombie + 0
    printf "custom_process_count{state=\"stopped\"} %d\n", stopped + 0
    printf "custom_process_count{state=\"total\"} %d\n", total + 0
  }'

  echo ""
  echo "# HELP custom_service_status Service status (1=active, 0=inactive)"
  echo "# TYPE custom_service_status gauge"
  # Only installed units are reported. The unit list is fetched once and
  # matched on the exact "<name>.service" so "network" does not match
  # "network-online.target".
  if command -v systemctl &> /dev/null; then
    unit_files=$(systemctl list-unit-files 2>/dev/null)
    for service in "${critical_services[@]}"; do
      if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files"; then
        if systemctl is-active --quiet "$service"; then
          echo "custom_service_status{service=\"$service\"} 1"
        else
          echo "custom_service_status{service=\"$service\"} 0"
        fi
      fi
    done
  fi

  echo ""
  echo "# HELP custom_file_descriptor_usage File descriptor usage"
  echo "# TYPE custom_file_descriptor_usage gauge"
  # file-nr holds "<allocated> <unused> <max>". The counts are printed as
  # strings (%s) so very large limits are not rounded, and the percentage
  # is computed in awk so bc is not required.
  awk '{
    printf "custom_file_descriptor_usage{type=\"used\"} %s\n", $1
    printf "custom_file_descriptor_usage{type=\"max\"} %s\n", $3
    printf "custom_file_descriptor_usage{type=\"usage_percent\"} %.2f\n", ($3 > 0 ? $1 * 100 / $3 : 0)
  }' /proc/sys/fs/file-nr

  echo ""
  echo "# HELP custom_last_boot_time Last boot time in seconds since epoch"
  echo "# TYPE custom_last_boot_time gauge"
  # The "btime" line of /proc/stat is the boot time in epoch seconds.
  boot_time=$(awk '/^btime/ {print $2}' /proc/stat)
  echo "custom_last_boot_time ${boot_time:-0}"

  echo ""
  echo "# HELP custom_security_updates Security updates available"
  echo "# TYPE custom_security_updates gauge"
  # grep -c always prints a count, including "0" when nothing matches (it
  # merely exits non-zero), so no "|| echo 0" fallback is added: that would
  # append a second "0" line and corrupt the sample.
  # NOTE(review): counting lines containing "security"/"updates" is a
  # heuristic carried over unchanged; confirm it fits your distribution.
  if command -v apt &> /dev/null; then
    security_updates=$(apt list --upgradable 2>/dev/null | grep -c security)
  elif command -v yum &> /dev/null; then
    security_updates=$(yum --security check-update 2>/dev/null | grep -c "updates")
  fi
  echo "custom_security_updates ${security_updates:-0}"
}

# Generate the metrics into a temporary file next to the target and rename
# it into place, so a reader never sees a half-written file.
# Returns: non-zero when the directory or the file cannot be written.
main() {
  mkdir -p "$(dirname "$METRICS_FILE")" || return 1

  if ! generate_metrics > "$TMP_FILE"; then
    rm -f -- "$TMP_FILE"
    return 1
  fi

  mv "$TMP_FILE" "$METRICS_FILE" || return 1
  echo "自定义指标已更新: $METRICS_FILE"
}

main "$@"

Grafana 仪表板配置

1. 系统概览仪表板

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
{
"dashboard": {
"id": null,
"title": "Linux系统监控概览",
"tags": ["linux", "system", "monitoring"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "CPU使用率",
"type": "stat",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
},
{
"id": 2,
"title": "内存使用率",
"type": "stat",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
},
{
"id": 3,
"title": "磁盘使用率",
"type": "stat",
"targets": [
{
"expr": "max by(instance) ((1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
},
{
"id": 4,
"title": "系统负载",
"type": "stat",
"targets": [
{
"expr": "node_load1",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 2,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 2},
{"color": "red", "value": 4}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}
},
{
"id": 5,
"title": "CPU使用率趋势",
"type": "timeseries",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 6,
"title": "内存使用趋势",
"type": "timeseries",
"targets": [
{
"expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes",
"legendFormat": "已使用 - {{instance}}"
},
{
"expr": "node_memory_MemAvailable_bytes",
"legendFormat": "可用 - {{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}

日志监控与分析

1. 系统日志监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/bin/bash

# System log monitoring script.
# Watches key system logs for configured keywords and raises alerts.

LOG_DIR='/var/log'
ALERT_LOG='/var/log/system-alerts.log'
CONFIG_FILE='/etc/log-monitor.conf'
LAST_CHECK_FILE='/var/lib/log-monitor/last-check'

# The state directory must exist before the last-check stamp is written.
mkdir -p "$(dirname "$LAST_CHECK_FILE")"

# Write a default rule file on first run. The quoted delimiter keeps the
# here-document literal (no expansion).
if [[ ! -f $CONFIG_FILE ]]; then
  cat > "$CONFIG_FILE" << 'EOF'
# 日志监控配置文件
# 格式: 日志文件路径:关键词:告警级别
/var/log/messages:error:WARNING
/var/log/messages:critical:CRITICAL
/var/log/messages:panic:CRITICAL
/var/log/secure:Failed password:WARNING
/var/log/secure:authentication failure:WARNING
/var/log/secure:Invalid user:WARNING
/var/log/kern.log:Out of memory:CRITICAL
/var/log/kern.log:segfault:WARNING
/var/log/kern.log:BUG:CRITICAL
/var/log/syslog:error:WARNING
/var/log/syslog:failed:WARNING
/var/log/audit/audit.log:ANOM:WARNING
/var/log/audit/audit.log:denied:WARNING
EOF
fi

# Start of the analysis window: the stamp left by the previous run, or one
# hour ago when no stamp exists yet.
if [[ -f $LAST_CHECK_FILE ]]; then
  LAST_CHECK=$(< "$LAST_CHECK_FILE")
else
  LAST_CHECK=$(date -d "1 hour ago" +"%Y-%m-%d %H:%M:%S")
fi

# Record the current time as the start of the next run's window.
date +"%Y-%m-%d %H:%M:%S" > "$LAST_CHECK_FILE"

# Print the lines of a syslog-style file whose leading "Mon DD HH:MM:SS"
# timestamp is at or after a given moment. Lines without such a timestamp
# are dropped.
#
# Syslog timestamps carry no year. A stamp that would lie in the future
# relative to "now" is therefore treated as belonging to the previous
# year, so December entries read in January are not mistaken for new ones.
# The whole comparison runs inside one awk process (no per-line `date`
# fork) and uses only POSIX awk features.
#
# Arguments: $1 - start of the window, "YYYY-MM-DD HH:MM:SS"
#            $2 - log file to read
# Outputs:   matching lines on stdout
_syslog_entries_since() {
  local since=$1
  local log_file=$2
  local now
  now=$(date +"%Y-%m-%d %H:%M:%S")

  LC_ALL=C awk -v since="$since" -v now="$now" '
    # Fold month/day/time into one sortable number: MMDDhhmmss.
    function stamp(mon, day, hh, mm, ss) {
      return (((mon * 100 + day) * 100 + hh) * 100 + mm) * 100 + ss
    }
    # Same key for an ISO "YYYY-MM-DD hh:mm:ss" string (year ignored).
    function iso_stamp(text,    p) {
      if (split(text, p, /[- :]/) < 6) return -1
      return stamp(p[2], p[3], p[4], p[5], p[6])
    }
    BEGIN {
      n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= n; i++) month[names[i]] = i
      now_key = iso_stamp(now)
      since_key = iso_stamp(since)
      # -1 = previous year, 0 = current year.
      since_era = (since_key > now_key) ? -1 : 0
    }
    ($1 in month) && $2 ~ /^[0-9][0-9]?$/ && $3 ~ /^[0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/ {
      split($3, t, ":")
      key = stamp(month[$1], $2, t[1], t[2], t[3])
      era = (key > now_key) ? -1 : 0
      if (era > since_era || (era == since_era && key >= since_key)) print
    }' "$log_file"
}

# Scan one log file for a keyword among the entries written since
# LAST_CHECK; record matches in the alert log and send a notification.
# The keyword is matched case-insensitively as an awk regular expression.
# Globals:   LAST_CHECK, ALERT_LOG (read)
# Arguments: $1 - log file, $2 - keyword, $3 - alert level
analyze_logs() {
  local log_file=$1
  local keyword=$2
  local alert_level=$3
  local current_time new_entries
  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  # A missing log file is not an error: rule files list logs of several
  # distributions.
  [[ -f "$log_file" ]] || return 0

  new_entries=$(_syslog_entries_since "$LAST_CHECK" "$log_file" \
    | awk -v keyword="$keyword" 'tolower($0) ~ tolower(keyword)')

  if [[ -n "$new_entries" ]]; then
    {
      echo "[$current_time] [$alert_level] 在 $log_file 中发现关键词 '$keyword':"
      echo "$new_entries"
      echo ""
    } >> "$ALERT_LOG"

    send_alert "$alert_level" "日志告警" "在 $log_file 中发现关键词 '$keyword'"
  fi
}

# Deliver one alert: always to syslog, additionally to all terminals for
# CRITICAL and to stdout for WARNING. Other levels are only logged.
# Arguments: $1 - level, $2 - title, $3 - message
send_alert() {
  local level=$1
  local title=$2
  local message=$3
  local timestamp
  timestamp=$(date +"%Y-%m-%d %H:%M:%S")

  logger -t "log-monitor" "[$level] $title: $message"

  # Hook point for mail, SMS or chat integrations.
  case "$level" in
    "CRITICAL")
      echo "[$timestamp] CRITICAL ALERT: $title - $message" | wall
      ;;
    "WARNING")
      echo "[$timestamp] WARNING: $title - $message"
      ;;
  esac
}

# Check disk usage, memory usage and CPU load against fixed thresholds and
# record violations in the alert log.
# Globals: ALERT_LOG (read)
check_system_resources() {
  local current_time mem_usage load_1min rest cpu_cores load_threshold
  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  # Disk: above 90% used. The timestamp is handed to awk with -v because
  # shell variables are not expanded inside the single-quoted program.
  df -P | awk -v now="$current_time" 'NR > 1 {
    pct = $5
    sub(/%/, "", pct)
    if (pct + 0 > 90)
      print "[" now "] [WARNING] 磁盘使用率过高: " $6 " (" pct "%)"
  }' >> "$ALERT_LOG"

  # Memory: above 90% used, taken from the second line of `free`
  # (column 3 / column 2). Skipped when no number could be computed.
  mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}')
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && (( mem_usage > 90 )); then
    echo "[$current_time] [WARNING] 内存使用率过高: ${mem_usage}%" >> "$ALERT_LOG"
    send_alert "WARNING" "内存告警" "内存使用率达到 ${mem_usage}%"
  fi

  # CPU: 1-minute load above twice the core count. The fractional
  # comparison is done in awk so bc is not required.
  read -r load_1min rest < /proc/loadavg
  cpu_cores=$(nproc 2>/dev/null) || cpu_cores=1
  load_threshold=$(( cpu_cores * 2 ))
  if awk -v l="$load_1min" -v t="$load_threshold" 'BEGIN { exit !(l + 0 > t + 0) }'; then
    echo "[$current_time] [WARNING] CPU负载过高: $load_1min (阈值: $load_threshold)" >> "$ALERT_LOG"
    send_alert "WARNING" "CPU负载告警" "1分钟负载平均值: $load_1min"
  fi
}

# Count failed SSH logins and sudo invocations logged since LAST_CHECK and
# record an entry when they exceed 10 and 50 respectively.
# Globals:   LAST_CHECK, ALERT_LOG (read)
# Arguments: $1 - auth log to read (optional, default /var/log/secure)
check_security_events() {
  local secure_log=${1:-/var/log/secure}
  local current_time recent
  local failed_ssh=0
  local sudo_usage=0
  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  if [[ -r "$secure_log" ]]; then
    # Restrict to the current window first. Comparing a whole syslog line
    # with an ISO date string is always true (letters sort after digits)
    # and would count every entry in the file on every run.
    recent=$(_syslog_entries_since "$LAST_CHECK" "$secure_log")
    # grep -c prints 0 itself when nothing matches.
    failed_ssh=$(grep -c "Failed password" <<<"$recent")
    sudo_usage=$(grep -c "sudo:" <<<"$recent")
  fi

  if (( failed_ssh > 10 )); then
    echo "[$current_time] [WARNING] SSH登录失败次数过多: $failed_ssh 次" >> "$ALERT_LOG"
    send_alert "WARNING" "安全告警" "检测到 $failed_ssh 次SSH登录失败"
  fi

  if (( sudo_usage > 50 )); then
    echo "[$current_time] [INFO] sudo使用频繁: $sudo_usage 次" >> "$ALERT_LOG"
  fi
}

# Entry point: apply every rule of the configuration file, run the
# resource and security checks, then prune stale alert logs.
# Globals: CONFIG_FILE, ALERT_LOG (read)
main() {
  local log_file keyword alert_level
  local alert_dir alert_name

  echo "开始日志监控检查 - $(date)"

  # Rule format: <log file>:<keyword>:<alert level>. The "|| [[ -n ... ]]"
  # part also processes a final line that lacks a trailing newline.
  while IFS=':' read -r log_file keyword alert_level || [[ -n "$log_file" ]]; do
    # Skip comment lines and empty lines.
    if [[ -z "$log_file" || "$log_file" == \#* ]]; then
      continue
    fi

    # stdin is detached so nothing called here can swallow the remaining
    # rules from the configuration file.
    analyze_logs "$log_file" "$keyword" "$alert_level" < /dev/null
  done < "$CONFIG_FILE"

  check_system_resources
  check_security_events

  echo "日志监控检查完成 - $(date)"

  # Prune alert logs older than 7 days. Only files named after the alert
  # log itself (including rotated copies such as <name>.1), directly in its
  # directory, are candidates. Matching "*.log" under the whole directory
  # tree would delete unrelated logs that share the directory.
  alert_dir=$(dirname "$ALERT_LOG")
  alert_name=$(basename "$ALERT_LOG")
  find "$alert_dir" -maxdepth 1 -type f -name "${alert_name}*" -mtime +7 -delete 2>/dev/null
}

# 执行主函数
main

2. 应用日志监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash

# Application log monitoring script.
# Dedicated to watching application logs for error patterns.

APP_LOG_DIR='/var/log/applications'
ALERT_LOG='/var/log/app-alerts.log'
CONFIG_FILE='/etc/app-log-monitor.conf'
METRICS_FILE='/var/lib/node_exporter/textfile_collector/app_log_metrics.prom'

# Both output locations must exist before anything is written.
mkdir -p "$(dirname "$ALERT_LOG")" "$(dirname "$METRICS_FILE")"

# Write a default rule file on first run. The quoted delimiter keeps the
# here-document literal (no expansion).
if [[ ! -f $CONFIG_FILE ]]; then
  cat > "$CONFIG_FILE" << 'EOF'
# 应用日志监控配置
# 格式: 应用名:日志文件:错误模式:告警阈值(每分钟)
web-app:/var/log/applications/web-app.log:ERROR:5
api-service:/var/log/applications/api.log:Exception:3
database:/var/log/mysql/error.log:ERROR:2
nginx:/var/log/nginx/error.log:error:10
redis:/var/log/redis/redis-server.log:WARNING:5
EOF
fi

# Count the lines of one application log that match an error pattern within
# the last minute, export the count as a Prometheus sample and raise an
# alert when it exceeds the threshold.
#
# The time filter is a plain string comparison against
# "YYYY-MM-DD HH:MM:SS", so it assumes each log line starts with a
# timestamp in exactly that format.
# NOTE(review): the sample is declared as a counter but holds a per-minute
# count; a gauge may describe it better. Left unchanged here.
#
# Globals:   METRICS_FILE, ALERT_LOG (read)
# Arguments: $1 - application name
#            $2 - log file
#            $3 - error pattern (case-insensitive awk regular expression)
#            $4 - alert threshold (matches per minute)
analyze_app_logs() {
  local app_name=$1
  local log_file=$2
  local error_pattern=$3
  local threshold=$4
  local metrics_tmp="$METRICS_FILE.tmp"
  local current_time one_minute_ago error_count alert_msg label_pattern
  current_time=$(date +"%Y-%m-%d %H:%M:%S")
  one_minute_ago=$(date -d "1 minute ago" +"%Y-%m-%d %H:%M:%S")

  # A missing log file is not an error: not every application is installed
  # on every host.
  [[ -f "$log_file" ]] || return 0

  error_count=$(awk -v start="$one_minute_ago" -v pattern="$error_pattern" '
    $0 >= start && tolower($0) ~ tolower(pattern) { count++ }
    END { print count + 0 }' "$log_file")

  # Escape backslashes and double quotes so the pattern is a valid label
  # value.
  label_pattern=${error_pattern//\\/\\\\}
  label_pattern=${label_pattern//\"/\\\"}

  {
    # HELP/TYPE may appear only once per metric in a metrics file, so they
    # are written only by the first application that reports.
    if ! grep -q '^# TYPE app_log_errors_total ' "$metrics_tmp" 2>/dev/null; then
      echo "# HELP app_log_errors_total Total number of application log errors"
      echo "# TYPE app_log_errors_total counter"
    fi
    echo "app_log_errors_total{app=\"$app_name\",pattern=\"$label_pattern\"} $error_count"
  } >> "$metrics_tmp"

  # A non-numeric threshold cannot be compared; report it and skip the
  # alert instead of failing inside the arithmetic test.
  if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
    echo "analyze_app_logs: invalid threshold '$threshold' for $app_name" >&2
    return 0
  fi

  if (( error_count > threshold )); then
    alert_msg="应用 $app_name 在最近1分钟内出现 $error_count 个 '$error_pattern' 错误(阈值: $threshold)"
    {
      echo "[$current_time] [WARNING] $alert_msg"
      # Attach the three most recent matching lines as a sample.
      echo "错误样本:"
      grep -i -- "$error_pattern" "$log_file" | tail -3
      echo ""
    } >> "$ALERT_LOG"

    # Send the alert that was just logged, not an unrelated message.
    send_app_alert "WARNING" "应用日志告警" "$alert_msg"
  fi
}

# Deliver one application alert: always to syslog, additionally to all
# terminals for CRITICAL and to stdout for WARNING. Other levels are only
# logged.
# Arguments: $1 - level, $2 - title, $3 - message
send_app_alert() {
  local severity=$1
  local subject=$2
  local body=$3
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")

  logger -t "app-log-monitor" "[$severity] $subject: $body"

  # Hook point for a real alerting backend.
  if [[ "$severity" == "CRITICAL" ]]; then
    echo "[$stamp] CRITICAL APP ALERT: $subject - $body" | wall
  elif [[ "$severity" == "WARNING" ]]; then
    echo "[$stamp] APP WARNING: $subject - $body"
  fi
}

# Entry point: analyse every application listed in the configuration file
# and publish the collected samples as the Prometheus metrics file.
# Globals: CONFIG_FILE, METRICS_FILE (read)
main() {
  local app_name log_file error_pattern threshold
  local metrics_tmp="$METRICS_FILE.tmp"

  echo "开始应用日志监控 - $(date)"

  # Start from an empty temporary metrics file.
  : > "$metrics_tmp"

  # Rule format: <app>:<log file>:<error pattern>:<threshold per minute>.
  while IFS=':' read -r app_name log_file error_pattern threshold || [[ -n "$app_name" ]]; do
    # Skip comment lines and empty lines.
    if [[ -z "$app_name" || "$app_name" == \#* ]]; then
      continue
    fi

    analyze_app_logs "$app_name" "$log_file" "$error_pattern" "$threshold"

    # analyze_app_performance is not defined in this script; call it only
    # when something else has provided it, instead of failing with
    # "command not found" for every application.
    if declare -F analyze_app_performance > /dev/null; then
      analyze_app_performance "$app_name" "$log_file"
    fi
  done < "$CONFIG_FILE"

  # Same for the optional log-rotation check.
  if declare -F check_log_rotation > /dev/null; then
    check_log_rotation
  fi

  # Publish by renaming the temporary file over the metrics file.
  mv "$metrics_tmp" "$METRICS_FILE"

  echo "应用日志监控完成 - $(date)"
}

# 执行主函数
main

Zabbix 监控解决方案

1. Zabbix Agent 配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#!/bin/bash

# Zabbix Agent installation and configuration script.

# Release series used to build the repository URLs.
ZABBIX_VERSION='6.4'
# Server the agent reports to.
ZABBIX_SERVER='zabbix-server.example.com'
# Agent configuration file and directory for custom check scripts.
AGENT_CONFIG='/etc/zabbix/zabbix_agentd.conf'
CUSTOM_SCRIPTS_DIR='/etc/zabbix/scripts'

# Install zabbix-agent2 from the official Zabbix repository.
# Supports RHEL-family systems (rpm/yum) and Debian-family systems
# (dpkg/apt); anything else is reported as unsupported.
# Globals: ZABBIX_VERSION (read)
# Returns: non-zero when the distribution is unsupported, the repository
#          package cannot be downloaded or installed, or the final package
#          installation fails.
install_zabbix_agent() {
  local rhel_major distro_id distro_release release_pkg

  echo "安装Zabbix Agent..."

  if [[ -f /etc/redhat-release ]]; then
    # CentOS/RHEL: the repository path and package name embed the major
    # release number.
    rhel_major=$(rpm -E '%{rhel}')
    # Not treated as fatal: rpm also exits non-zero when the release
    # package is already installed.
    rpm -Uvh "https://repo.zabbix.com/zabbix/${ZABBIX_VERSION}/rhel/${rhel_major}/x86_64/zabbix-release-${ZABBIX_VERSION}-1.el${rhel_major}.noarch.rpm"
    yum clean all
    yum install -y zabbix-agent2
  elif [[ -f /etc/debian_version ]]; then
    # Debian and Ubuntu use separate repository trees. Read the
    # distribution ID in a subshell so os-release variables do not leak;
    # any other Debian derivative falls back to the Ubuntu tree.
    distro_id=$(. /etc/os-release 2>/dev/null && printf '%s' "${ID:-}")
    [[ "$distro_id" == "debian" ]] || distro_id="ubuntu"
    distro_release=$(lsb_release -rs)
    release_pkg="zabbix-release_${ZABBIX_VERSION}-1+${distro_id}${distro_release}_all.deb"

    wget "https://repo.zabbix.com/zabbix/${ZABBIX_VERSION}/${distro_id}/pool/main/z/zabbix-release/${release_pkg}" || return 1
    dpkg -i "$release_pkg" || return 1
    apt update
    apt install -y zabbix-agent2
  else
    echo "不支持的发行版: 无法安装Zabbix Agent" >&2
    return 1
  fi
}

# Write the agent configuration file and prepare the custom scripts directory.
# Globals:   AGENT_CONFIG (backed up to "${AGENT_CONFIG}.backup", then
#            overwritten), ZABBIX_SERVER (read), CUSTOM_SCRIPTS_DIR (created)
# Returns:   1 when the configuration cannot be written, otherwise the status
#            of the final chmod.
function configure_zabbix_agent() {
  echo "配置Zabbix Agent..."

  # Back up the previous configuration, if there is one.
  if [[ -f $AGENT_CONFIG ]]; then
    cp -- "$AGENT_CONFIG" "${AGENT_CONFIG}.backup"
  fi

  # The file is generated from two here-documents:
  #   1. an expanding one for the few values taken from the shell
  #      (server address, host name);
  #   2. a quoted one for everything else, so that the awk field references
  #      ($1, $2, $$5, ...) reach the file literally instead of being expanded
  #      by the shell ("$$" would otherwise turn into the shell's PID).
  # In UserParameter lines only flexible keys (those declared with [*]) have
  # $1..$9 substituted by the agent and therefore need "$$N" for awk fields;
  # plain keys are passed through unchanged and use a single "$".
  # NOTE(review): EnableRemoteCommands=1 allows the server to run commands on
  # this host - confirm this is intended and supported by the installed agent.
  {
    cat << EOF
# Zabbix Agent 配置文件

# Zabbix服务器地址
Server=$ZABBIX_SERVER
ServerActive=$ZABBIX_SERVER

# Agent主机名
Hostname=$(hostname)

EOF
    cat << 'EOF'
# 监听端口
ListenPort=10050

# 日志文件
LogFile=/var/log/zabbix/zabbix_agentd.log
LogFileSize=10

# PID文件
PidFile=/var/run/zabbix/zabbix_agentd.pid

# 包含自定义配置目录
Include=/etc/zabbix/zabbix_agentd.d/*.conf

# 用户参数
UserParameter=custom.cpu.util,cat /proc/stat | awk '/^cpu / {usage=($2+$4)*100/($2+$3+$4+$5)} END {print usage}'
UserParameter=custom.memory.util,free | awk '/^Mem:/ {printf "%.2f", ($3/$2)*100}'
UserParameter=custom.disk.util[*],df -h $1 | awk 'NR==2 {gsub(/%/, "", $$5); print $$5}'
UserParameter=custom.network.bytes.in[*],cat /proc/net/dev | awk -v interface=$1 '$$1==interface":" {print $$2}'
UserParameter=custom.network.bytes.out[*],cat /proc/net/dev | awk -v interface=$1 '$$1==interface":" {print $$10}'
UserParameter=custom.tcp.connections,netstat -an | grep ESTABLISHED | wc -l
UserParameter=custom.process.count[*],ps aux | grep -v grep | grep -c $1
UserParameter=custom.service.status[*],systemctl is-active $1 | grep -c active
UserParameter=custom.log.error.count[*],grep -c "ERROR" $1 2>/dev/null || [ -f $1 ] || echo 0
UserParameter=custom.disk.inode.util[*],df -i $1 | awk 'NR==2 {gsub(/%/, "", $$5); print $$5}'
UserParameter=custom.load.avg1,cat /proc/loadavg | awk '{print $1}'
UserParameter=custom.load.avg5,cat /proc/loadavg | awk '{print $2}'
UserParameter=custom.load.avg15,cat /proc/loadavg | awk '{print $3}'

# 超时设置
Timeout=30

# 允许远程命令
EnableRemoteCommands=1
LogRemoteCommands=1

# 缓冲区大小
MaxLinesPerSecond=20
EOF
  } > "$AGENT_CONFIG" || return 1

  # Create the directory for custom scripts.
  mkdir -p -- "$CUSTOM_SCRIPTS_DIR"

  # Set ownership and permissions.
  chown -R zabbix:zabbix /etc/zabbix
  chmod 755 "$CUSTOM_SCRIPTS_DIR"
}

# Generate the helper scripts used by custom Zabbix items.
# Writes system_performance.sh, app_monitor.sh and security_monitor.sh into
# $CUSTOM_SCRIPTS_DIR, makes them executable and hands them to the zabbix user.
# Globals:   CUSTOM_SCRIPTS_DIR (read; created when missing)
# Returns:   1 when the directory cannot be created, otherwise the status of
#            the final chown.
function create_custom_scripts() {
  echo "创建自定义监控脚本..."

  # Do not depend on another step having created the directory already.
  mkdir -p -- "$CUSTOM_SCRIPTS_DIR" || return 1

  # System performance script (quoted here-document: content is literal).
  cat > "$CUSTOM_SCRIPTS_DIR/system_performance.sh" << 'EOF'
#!/bin/bash

# 系统性能监控脚本

case $1 in
"cpu_usage")
top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//'
;;
"memory_usage")
free | awk '/^Mem:/ {printf "%.2f", ($3/$2)*100}'
;;
"disk_io_read")
iostat -d 1 2 | awk '/^[a-z]/ && NR>6 {sum+=$3} END {print sum}'
;;
"disk_io_write")
iostat -d 1 2 | awk '/^[a-z]/ && NR>6 {sum+=$4} END {print sum}'
;;
"network_connections")
netstat -an | grep ESTABLISHED | wc -l
;;
"zombie_processes")
ps aux | awk '$8 ~ /^Z/ {count++} END {print count+0}'
;;
"file_descriptors")
cat /proc/sys/fs/file-nr | awk '{print $1}'
;;
"context_switches")
grep ctxt /proc/stat | awk '{print $2}'
;;
*)
echo "Usage: $0 {cpu_usage|memory_usage|disk_io_read|disk_io_write|network_connections|zombie_processes|file_descriptors|context_switches}"
exit 1
;;
esac
EOF

  # Application monitoring script. Compared with a plain "ps | grep NAME" it
  # excludes its own process (whose command line contains NAME as an
  # argument), quotes the application name and never greps for an empty PID.
  cat > "$CUSTOM_SCRIPTS_DIR/app_monitor.sh" << 'EOF'
#!/bin/bash

# 应用监控脚本

APP_NAME=$1
METRIC=$2
SELF_NAME=${0##*/}

if [ -z "$APP_NAME" ]; then
echo "Usage: $0 <app_name> {status|memory|cpu|connections|log_errors}"
exit 1
fi

# ps lines of the application, without grep and without this script itself
app_processes() {
ps aux | grep -- "$APP_NAME" | grep -v -e grep -e "$SELF_NAME"
}

case $METRIC in
"status")
if systemctl is-active --quiet "$APP_NAME"; then
echo 1
else
echo 0
fi
;;
"memory")
app_processes | awk '{sum+=$6} END {print sum+0}'
;;
"cpu")
app_processes | awk '{sum+=$3} END {print sum+0}'
;;
"connections")
APP_PID=$(app_processes | awk '{print $2}' | head -1)
if [ -n "$APP_PID" ]; then
# match the PID/Program column; seeing other users' PIDs needs privileges
netstat -anp 2>/dev/null | grep -c "[[:space:]]$APP_PID/"
else
echo 0
fi
;;
"log_errors")
LOG_FILE="/var/log/$APP_NAME.log"
if [ -f "$LOG_FILE" ]; then
grep -c "ERROR" "$LOG_FILE"
else
echo 0
fi
;;
*)
echo "Usage: $0 <app_name> {status|memory|cpu|connections|log_errors}"
exit 1
;;
esac
EOF

  # Security monitoring script (quoted here-document: content is literal).
  cat > "$CUSTOM_SCRIPTS_DIR/security_monitor.sh" << 'EOF'
#!/bin/bash

# 安全监控脚本

case $1 in
"failed_logins")
grep "Failed password" /var/log/secure 2>/dev/null | wc -l
;;
"successful_logins")
grep "Accepted password" /var/log/secure 2>/dev/null | wc -l
;;
"sudo_usage")
grep "sudo:" /var/log/secure 2>/dev/null | wc -l
;;
"firewall_drops")
dmesg | grep -i "dropped" | wc -l
;;
"open_ports")
netstat -tlnp | grep LISTEN | wc -l
;;
"last_login")
last -1 | head -1 | awk '{print $4" "$5" "$6" "$7}'
;;
*)
echo "Usage: $0 {failed_logins|successful_logins|sudo_usage|firewall_drops|open_ports|last_login}"
exit 1
;;
esac
EOF

  # Make the scripts executable and owned by the zabbix user. The directory is
  # quoted while the glob stays outside the quotes so it still expands.
  chmod +x "$CUSTOM_SCRIPTS_DIR"/*.sh
  chown zabbix:zabbix "$CUSTOM_SCRIPTS_DIR"/*.sh
}

# Enable and start the zabbix-agent2 service, then verify that it is active.
# Prints the unit status in both outcomes; terminates the script with exit
# status 1 when the service is not active after the start request.
function start_zabbix_agent() {
  local unit="zabbix-agent2"

  printf '%s\n' "启动Zabbix Agent服务..."

  systemctl daemon-reload
  systemctl enable "$unit"
  systemctl start "$unit"

  # Failure path first: report, show the unit status and abort.
  if ! systemctl is-active --quiet "$unit"; then
    printf '%s\n' "Zabbix Agent 启动失败!"
    systemctl status "$unit"
    exit 1
  fi

  printf '%s\n' "Zabbix Agent 启动成功!"
  systemctl status "$unit"
}

# Main entry point: install, configure and start the Zabbix agent.
# The steps run in order and the first failing step aborts the sequence, so a
# failed installation is never followed by configuration or service changes.
# Returns:   0 when every step succeeded, 1 otherwise.
function main() {
  echo "开始安装和配置Zabbix Agent..."

  local step
  for step in install_zabbix_agent configure_zabbix_agent \
    create_custom_scripts start_zabbix_agent; do
    if ! "$step"; then
      echo "步骤执行失败: $step" >&2
      return 1
    fi
  done

  # First address reported by "hostname -I", shown as a hint for the operator.
  local primary_ip
  primary_ip=$(hostname -I | awk '{print $1}')

  echo "Zabbix Agent 安装配置完成!"
  echo "请在Zabbix服务器上添加此主机: $(hostname) (${primary_ip})"
}

# Refuse to run unless the effective user ID is 0 (root); otherwise print a
# message and exit with status 1 before anything is installed or written.
if [ "$EUID" -ne 0 ]; then
echo "请以root权限运行此脚本"
exit 1
fi

# Run the main function
main

2. Zabbix 模板配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
<!-- Template "Linux Advanced Monitoring": items, triggers and graphs built on
     the custom.* keys used throughout this export. -->
<!-- NOTE(review): the uuid values in this file are readable placeholders;
     confirm the importer of the target Zabbix version accepts them. -->
<version>6.4</version>
<date>2024-03-15T10:00:00Z</date>
<!-- NOTE(review): the template is placed in a group declared under <groups>;
     confirm this is the element the target version expects for templates. -->
<groups>
<group>
<uuid>linux-servers</uuid>
<name>Linux Servers</name>
</group>
</groups>
<templates>
<template>
<uuid>linux-advanced-monitoring</uuid>
<template>Linux Advanced Monitoring</template>
<name>Linux Advanced Monitoring</name>
<groups>
<group>
<name>Linux Servers</name>
</group>
</groups>
<items>
<!-- CPU item: polled every 60s, percentage -->
<item>
<uuid>cpu-utilization</uuid>
<name>CPU Utilization</name>
<key>custom.cpu.util</key>
<delay>60s</delay>
<value_type>FLOAT</value_type>
<units>%</units>
<description>CPU utilization percentage</description>
</item>

<!-- Memory item: polled every 60s, percentage -->
<item>
<uuid>memory-utilization</uuid>
<name>Memory Utilization</name>
<key>custom.memory.util</key>
<delay>60s</delay>
<value_type>FLOAT</value_type>
<units>%</units>
<description>Memory utilization percentage</description>
</item>

<!-- Disk item: root filesystem only, polled every 300s -->
<item>
<uuid>disk-utilization-root</uuid>
<name>Disk Utilization /</name>
<key>custom.disk.util[/]</key>
<delay>300s</delay>
<value_type>FLOAT</value_type>
<units>%</units>
<description>Root filesystem utilization</description>
</item>

<!-- Network item: received bytes on eth0; the CHANGE_PER_SECOND
     preprocessing step converts the raw value into a per-second rate -->
<item>
<uuid>network-bytes-in</uuid>
<name>Network Bytes In eth0</name>
<key>custom.network.bytes.in[eth0]</key>
<delay>60s</delay>
<value_type>UINT64</value_type>
<units>B</units>
<description>Network bytes received on eth0</description>
<preprocessing>
<step>
<type>CHANGE_PER_SECOND</type>
</step>
</preprocessing>
</item>

<!-- System load item: 1 minute load average -->
<item>
<uuid>load-average-1min</uuid>
<name>Load Average 1min</name>
<key>custom.load.avg1</key>
<delay>60s</delay>
<value_type>FLOAT</value_type>
<description>1 minute load average</description>
</item>

<!-- TCP connection item (the original comment labelled it as a process
     item): number of established connections -->
<item>
<uuid>tcp-connections</uuid>
<name>TCP Connections</name>
<key>custom.tcp.connections</key>
<delay>60s</delay>
<value_type>UINT64</value_type>
<description>Number of established TCP connections</description>
</item>
</items>

<triggers>
<!-- CPU alert trigger: last value above 80 -->
<trigger>
<uuid>cpu-high-trigger</uuid>
<expression>last(/Linux Advanced Monitoring/custom.cpu.util)&gt;80</expression>
<name>High CPU utilization on {HOST.NAME}</name>
<priority>WARNING</priority>
<description>CPU utilization is above 80%</description>
</trigger>

<!-- Memory alert trigger: last value above 90 -->
<trigger>
<uuid>memory-high-trigger</uuid>
<expression>last(/Linux Advanced Monitoring/custom.memory.util)&gt;90</expression>
<name>High memory utilization on {HOST.NAME}</name>
<priority>HIGH</priority>
<description>Memory utilization is above 90%</description>
</trigger>

<!-- Disk alert trigger: root filesystem above 85 -->
<trigger>
<uuid>disk-high-trigger</uuid>
<expression>last(/Linux Advanced Monitoring/custom.disk.util[/])&gt;85</expression>
<name>High disk utilization on {HOST.NAME}</name>
<priority>WARNING</priority>
<description>Root filesystem utilization is above 85%</description>
</trigger>

<!-- System load alert trigger: fixed threshold of 4, independent of the
     number of CPU cores -->
<trigger>
<uuid>load-high-trigger</uuid>
<expression>last(/Linux Advanced Monitoring/custom.load.avg1)&gt;4</expression>
<name>High system load on {HOST.NAME}</name>
<priority>WARNING</priority>
<description>1 minute load average is above 4</description>
</trigger>
</triggers>

<graphs>
<!-- CPU utilization graph -->
<graph>
<uuid>cpu-graph</uuid>
<name>CPU Utilization</name>
<graph_items>
<graph_item>
<color>FF0000</color>
<item>
<host>Linux Advanced Monitoring</host>
<key>custom.cpu.util</key>
</item>
</graph_item>
</graph_items>
</graph>

<!-- Memory utilization graph -->
<graph>
<uuid>memory-graph</uuid>
<name>Memory Utilization</name>
<graph_items>
<graph_item>
<color>00FF00</color>
<item>
<host>Linux Advanced Monitoring</host>
<key>custom.memory.util</key>
</item>
</graph_item>
</graph_items>
</graph>

<!-- Network traffic graph: inbound traffic on eth0 only -->
<graph>
<uuid>network-graph</uuid>
<name>Network Traffic eth0</name>
<graph_items>
<graph_item>
<color>0000FF</color>
<item>
<host>Linux Advanced Monitoring</host>
<key>custom.network.bytes.in[eth0]</key>
</item>
</graph_item>
</graph_items>
</graph>
</graphs>
</template>
</templates>
</zabbix_export>

性能调优与优化

1. 监控系统性能优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/bin/bash

# Monitoring system performance optimization script

# Log file to which the checks below write their findings and advice
OPTIMIZATION_LOG="/var/log/monitoring-optimization.log"
# Directory that receives timestamped copies of configuration files
CONFIG_BACKUP_DIR="/etc/monitoring/backups"

# Create the required directory
mkdir -p $CONFIG_BACKUP_DIR

# Prometheus performance optimization: back up the current configuration,
# record tuning advice and report the size of the data directory.
# Globals:   OPTIMIZATION_LOG (appended), CONFIG_BACKUP_DIR (backup target)
# Returns:   0 when Prometheus is not configured on this host, otherwise the
#            status of the last command.
function optimize_prometheus() {
  echo "优化Prometheus配置..." | tee -a "$OPTIMIZATION_LOG"

  local prometheus_config="/etc/prometheus/prometheus.yml"

  # Nothing else to do on hosts without a Prometheus configuration.
  if [[ ! -f $prometheus_config ]]; then
    return 0
  fi

  # Back up the current configuration under a timestamped name and make a
  # failed backup visible instead of ignoring it.
  local backup_target
  backup_target="$CONFIG_BACKUP_DIR/prometheus.yml.$(date +%Y%m%d_%H%M%S)"
  if ! cp -- "$prometheus_config" "$backup_target"; then
    echo "警告: 无法备份 $prometheus_config 到 $backup_target" | tee -a "$OPTIMIZATION_LOG"
  fi

  # Tuning advice (quoted here-document: written literally).
  cat >> "$OPTIMIZATION_LOG" << 'EOF'
Prometheus 性能优化建议:
1. 调整scrape_interval,减少不必要的频繁采集
2. 使用recording rules预计算复杂查询
3. 配置适当的retention时间
4. 启用压缩以减少存储空间
5. 使用federation分层部署
EOF

  # Report how much space the Prometheus data directory uses.
  local data_dir="/var/lib/prometheus"
  if [[ -d $data_dir ]]; then
    local storage_size
    storage_size=$(du -sh -- "$data_dir" | cut -f1)
    echo "Prometheus存储使用: $storage_size" | tee -a "$OPTIMIZATION_LOG"
  fi
}

# Node Exporter performance optimization: log the current ExecStart line of
# the service unit and record tuning advice.
# Globals:   OPTIMIZATION_LOG (appended)
# Returns:   0 when the unit file does not exist, otherwise the status of the
#            last command.
function optimize_node_exporter() {
  echo "优化Node Exporter配置..." | tee -a "$OPTIMIZATION_LOG"

  local service_file="/etc/systemd/system/node_exporter.service"

  # Hosts without the unit file get no further output.
  if [[ ! -f $service_file ]]; then
    return 0
  fi

  # Show how the exporter is currently started.
  echo "当前Node Exporter配置:" | tee -a "$OPTIMIZATION_LOG"
  grep ExecStart "$service_file" | tee -a "$OPTIMIZATION_LOG"

  # Tuning advice (quoted here-document: written literally).
  cat >> "$OPTIMIZATION_LOG" << 'EOF'
Node Exporter 性能优化建议:
1. 禁用不需要的collector以减少资源使用
2. 调整--collector.filesystem.ignored-mount-points
3. 使用--collector.textfile.directory启用自定义指标
4. 配置适当的--web.max-requests限制
EOF
}

# System resource optimization: check the file descriptor limit, memory usage
# and disk I/O, and raise the nofile limits of the monitoring accounts.
# Globals:   OPTIMIZATION_LOG (appended)
# Arguments: $1 - limits file to extend (optional, defaults to
#                 /etc/security/limits.conf)
# Returns:   status of the last command.
function optimize_system_resources() {
  local limits_conf=${1:-/etc/security/limits.conf}

  echo "优化系统资源配置..." | tee -a "$OPTIMIZATION_LOG"

  # Check the file descriptor limit.
  local current_fd_limit
  current_fd_limit=$(ulimit -n)
  echo "当前文件描述符限制: $current_fd_limit" | tee -a "$OPTIMIZATION_LOG"

  # "ulimit -n" may print "unlimited", which must not reach a numeric test.
  if [[ $current_fd_limit =~ ^[0-9]+$ ]] && (( current_fd_limit < 65536 )); then
    echo "建议增加文件描述符限制到65536" | tee -a "$OPTIMIZATION_LOG"

    # Append the entries only once. The guard looks for one of the entries
    # themselves, because the appended block contains no fixed ASCII marker
    # that could be searched for instead.
    if ! grep -qE '^prometheus[[:space:]]+soft[[:space:]]+nofile' "$limits_conf" 2>/dev/null; then
      cat >> "$limits_conf" << 'EOF'
# 监控系统优化
prometheus soft nofile 65536
prometheus hard nofile 65536
zabbix soft nofile 65536
zabbix hard nofile 65536
EOF
    fi
  fi

  # Check memory usage (second line of "free -m": total and used, in MB).
  local mem_total="" mem_used="" mem_usage_percent
  read -r mem_total mem_used < <(free -m | awk 'NR==2{print $2, $3}')

  # Only compute the percentage from sane numbers; this also avoids a
  # division by zero when the total is reported as 0.
  if [[ $mem_total =~ ^[0-9]+$ && $mem_used =~ ^[0-9]+$ ]] && (( mem_total > 0 )); then
    mem_usage_percent=$(( mem_used * 100 / mem_total ))
    echo "内存使用情况: ${mem_used}MB / ${mem_total}MB (${mem_usage_percent}%)" | tee -a "$OPTIMIZATION_LOG"

    if (( mem_usage_percent > 80 )); then
      echo "警告: 内存使用率过高,建议优化监控配置或增加内存" | tee -a "$OPTIMIZATION_LOG"
    fi
  else
    echo "警告: 无法获取内存使用情况" | tee -a "$OPTIMIZATION_LOG"
  fi

  # Check disk I/O (iostat comes from the sysstat package).
  if command -v iostat &> /dev/null; then
    echo "磁盘I/O统计:" | tee -a "$OPTIMIZATION_LOG"
    iostat -x 1 3 | tail -n +4 | tee -a "$OPTIMIZATION_LOG"
  fi
}

# Network optimization: log the number of established TCP connections and the
# maximum socket buffer sizes, and suggest larger buffers when they are small.
# Globals:   OPTIMIZATION_LOG (appended)
# Returns:   status of the last command.
function optimize_network() {
  echo "优化网络配置..." | tee -a "$OPTIMIZATION_LOG"

  # Count established connections.
  local tcp_connections
  tcp_connections=$(netstat -an | grep -c ESTABLISHED)
  echo "当前TCP连接数: $tcp_connections" | tee -a "$OPTIMIZATION_LOG"

  # Read the buffer limits. The files can be absent (for example inside some
  # containers), so an unreadable file yields an empty value instead of an
  # error in the numeric comparison further down.
  local net_core_rmem_max="" net_core_wmem_max=""
  if [[ -r /proc/sys/net/core/rmem_max ]]; then
    net_core_rmem_max=$(< /proc/sys/net/core/rmem_max)
  fi
  if [[ -r /proc/sys/net/core/wmem_max ]]; then
    net_core_wmem_max=$(< /proc/sys/net/core/wmem_max)
  fi

  echo "网络接收缓冲区最大值: ${net_core_rmem_max:-unknown}" | tee -a "$OPTIMIZATION_LOG"
  echo "网络发送缓冲区最大值: ${net_core_wmem_max:-unknown}" | tee -a "$OPTIMIZATION_LOG"

  # Advice is only given for values that are known and numeric.
  if [[ $net_core_rmem_max =~ ^[0-9]+$ ]] && (( net_core_rmem_max < 16777216 )); then
    echo "建议增加网络接收缓冲区: echo 16777216 > /proc/sys/net/core/rmem_max" | tee -a "$OPTIMIZATION_LOG"
  fi

  if [[ $net_core_wmem_max =~ ^[0-9]+$ ]] && (( net_core_wmem_max < 16777216 )); then
    echo "建议增加网络发送缓冲区: echo 16777216 > /proc/sys/net/core/wmem_max" | tee -a "$OPTIMIZATION_LOG"
  fi
}

# Logging optimization: report log files larger than 100M under /var/log and
# make sure a logrotate policy for the monitoring logs exists.
# Globals:   OPTIMIZATION_LOG (appended)
# Side effects: creates /etc/logrotate.d/monitoring when /etc/logrotate.conf
#            exists and that file is still missing.
# Returns:   status of the last command.
function optimize_logging() {
  echo "优化日志配置..." | tee -a "$OPTIMIZATION_LOG"

  # Report oversized log files. "IFS= read -r" keeps backslashes and
  # leading/trailing blanks in file names intact.
  local large_log log_size
  while IFS= read -r large_log; do
    log_size=$(du -h -- "$large_log" | cut -f1)
    echo "发现大日志文件: $large_log ($log_size)" | tee -a "$OPTIMIZATION_LOG"
  done < <(find /var/log -name "*.log" -size +100M 2>/dev/null)

  # Without a main logrotate configuration there is nothing to extend.
  if [[ ! -f /etc/logrotate.conf ]]; then
    return 0
  fi

  echo "检查logrotate配置..." | tee -a "$OPTIMIZATION_LOG"

  # Make sure the monitoring logs have a rotation policy; an existing policy
  # file is never overwritten.
  local monitoring_logrotate="/etc/logrotate.d/monitoring"
  if [[ ! -f $monitoring_logrotate ]]; then
    cat > "$monitoring_logrotate" << 'EOF'
/var/log/prometheus/*.log {
daily
missingok
rotate 7
compress
delaycompress
notifempty
sharedscripts
postrotate
systemctl reload prometheus
endscript
}

/var/log/zabbix/*.log {
daily
missingok
rotate 7
compress
delaycompress
notifempty
sharedscripts
postrotate
systemctl reload zabbix-agent2
endscript
}

/var/log/system-monitor/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
}
EOF
    echo "创建监控日志轮转配置: $monitoring_logrotate" | tee -a "$OPTIMIZATION_LOG"
  fi
}

# Generate the optimization report: system resources, state of the monitoring
# services, a few performance figures and the advice collected in the log.
# Globals:   OPTIMIZATION_LOG (read)
# Arguments: $1 - report file to write (optional, defaults to
#                 /var/log/monitoring-optimization-report.txt)
# Returns:   0 on success, 1 when the report file cannot be written.
function generate_optimization_report() {
  local report_file=${1:-/var/log/monitoring-optimization-report.txt}
  local timestamp
  timestamp=$(date +"%Y-%m-%d %H:%M:%S")
  # Loop variables are local so the caller's variables are not clobbered.
  local service status

  # Create/truncate the report first, so an unwritable location is reported
  # instead of being announced as a success.
  if ! : > "$report_file"; then
    echo "无法写入优化报告: $report_file" >&2
    return 1
  fi

  {
    echo "==========================================="
    echo "监控系统优化报告 - $timestamp"
    echo "==========================================="
    echo ""

    echo "=== 系统资源使用情况 ==="
    echo "CPU核心数: $(nproc)"
    echo "内存总量: $(free -h | awk 'NR==2{print $2}')"
    echo "磁盘使用情况:"
    df -h | grep -E '^/dev/'
    echo ""

    echo "=== 监控服务状态 ==="
    for service in prometheus node_exporter grafana-server zabbix-agent2; do
      # Match the whole unit name; a bare prefix would also hit units such as
      # "<service>-something.service".
      if systemctl list-unit-files | grep -q "^${service}\.service"; then
        status=$(systemctl is-active "$service" 2>/dev/null)
        echo "$service: $status"
      fi
    done
    echo ""

    echo "=== 性能指标 ==="
    echo "当前负载: $(cat /proc/loadavg)"
    echo "内存使用率: $(free | awk 'NR==2{printf "%.2f%%", $3*100/$2}')"
    echo "TCP连接数: $(netstat -an | grep -c ESTABLISHED)"
    echo "文件描述符使用: $(awk '{print $1}' /proc/sys/fs/file-nr) / $(cat /proc/sys/fs/file-max)"
    echo ""

    echo "=== 优化建议 ==="
    grep -E "建议|警告|优化" "$OPTIMIZATION_LOG"
  } >> "$report_file"

  echo "优化报告已生成: $report_file"
}

# Main entry point: run every optimization check in order and finish with the
# report.
# Globals:   OPTIMIZATION_LOG (truncated at the start, then appended)
function main() {
  # tee without -a deliberately starts a fresh log for this run.
  echo "开始监控系统性能优化检查..." | tee "$OPTIMIZATION_LOG"

  optimize_prometheus
  optimize_node_exporter
  optimize_system_resources
  optimize_network
  optimize_logging
  generate_optimization_report

  echo "监控系统优化检查完成!" | tee -a "$OPTIMIZATION_LOG"
}

# Run the main function
main

最佳实践与总结

监控系统设计原则

  1. 分层监控原则

    • 基础设施层:CPU、内存、磁盘、网络
    • 应用层:服务状态、性能指标、业务指标
    • 用户体验层:响应时间、可用性、错误率
  2. 告警设计原则

    • 避免告警疲劳,设置合理的阈值
    • 实施告警分级,区分紧急和一般告警
    • 建立告警升级机制
    • 定期回顾和调整告警规则
  3. 数据保留策略

    • 高精度短期数据(1分钟,保留7天)
    • 中精度中期数据(5分钟,保留30天)
    • 低精度长期数据(1小时,保留1年)
  4. 性能优化要点

    • 合理配置采集频率
    • 使用标签和维度进行数据分组
    • 实施数据压缩和存储优化
    • 定期清理历史数据

常见问题解决方案

1. 监控数据丢失

  • 检查网络连接和防火墙设置
  • 验证监控agent的运行状态
  • 检查存储空间是否充足
  • 查看监控系统的错误日志

2. 告警风暴

  • 实施告警抑制和分组
  • 调整告警阈值和持续时间
  • 使用依赖关系减少重复告警
  • 建立告警静默机制

3. 性能问题

  • 优化查询语句和聚合规则
  • 增加系统资源(CPU、内存、存储)
  • 实施监控系统的水平扩展
  • 使用缓存减少重复计算

技术选型建议

1. 小型环境(<50台服务器)

  • 推荐方案:Prometheus + Grafana + AlertManager
  • 优势:轻量级、易部署、社区活跃
  • 适用场景:中小型企业、开发测试环境

2. 中型环境(50-500台服务器)

  • 推荐方案:Zabbix 或 Prometheus集群
  • 优势:功能完善、扩展性好、管理界面友好
  • 适用场景:中型企业、生产环境

3. 大型环境(>500台服务器)

  • 推荐方案:Prometheus联邦 + Thanos 或企业级解决方案
  • 优势:高可用、高性能、长期存储
  • 适用场景:大型企业、云环境

结语

Linux系统监控是现代IT运维的基石,一个完善的监控体系能够显著提升系统的可靠性和运维效率。通过本文的深入探讨,我们了解了从基础监控工具到企业级监控解决方案的完整技术栈。

在实施监控系统时,需要根据实际业务需求和技术环境选择合适的方案。同时,监控系统本身也需要持续优化和改进,以适应不断变化的业务需求和技术发展。

记住,监控不是目的,而是手段。最终目标是通过有效的监控来保障系统稳定运行,提升用户体验,支撑业务发展。希望本文能为读者在构建和优化Linux监控系统时提供有价值的参考和指导。

应用日志告警" "$alert_msg"
fi
}

# Application performance metrics analysis.
# Appends the average response time and the request count of one application
# to "$METRICS_FILE.tmp" in the Prometheus text format.
# Globals:   METRICS_FILE (read; "$METRICS_FILE.tmp" is appended to)
# Arguments: $1 - application name, used as the "app" label
#            $2 - path of the log file to analyze
# Returns:   0; nothing is written when the log file does not exist.
function analyze_app_performance() {
    local app_name=$1
    local log_file=$2
    local metrics_tmp="$METRICS_FILE.tmp"

    if [ ! -f "$log_file" ]; then
        return
    fi

    # Average response time, assuming log lines carry "response_time=<n>" or
    # "response_time:<n>". Two-argument match() with RSTART/RLENGTH works in
    # every awk, unlike the gawk-only array form. 14 is the length of
    # "response_time" plus the separator character.
    local avg_response_time
    avg_response_time=$(awk '
        match($0, /response_time[=:][0-9.]+/) {
            sum += substr($0, RSTART + 14, RLENGTH - 14)
            count++
        }
        END {
            if (count > 0) print sum/count
            else print 0
        }' "$log_file")

    # Emit HELP/TYPE only once per metric: the function runs once per
    # application and repeated HELP lines make the file invalid.
    if ! grep -q '^# HELP app_response_time_seconds ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_response_time_seconds Average response time in seconds" >> "$metrics_tmp"
        echo "# TYPE app_response_time_seconds gauge" >> "$metrics_tmp"
    fi
    echo "app_response_time_seconds{app=\"$app_name\"} $avg_response_time" >> "$metrics_tmp"

    # Request volume. "grep -c" already prints 0 when nothing matches (while
    # exiting non-zero), so only an empty result needs a default.
    local request_count
    request_count=$(grep -c "request" "$log_file" 2>/dev/null) || true
    request_count=${request_count:-0}

    if ! grep -q '^# HELP app_requests_total ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_requests_total Total number of requests" >> "$metrics_tmp"
        echo "# TYPE app_requests_total counter" >> "$metrics_tmp"
    fi
    echo "app_requests_total{app=\"$app_name\"} $request_count" >> "$metrics_tmp"
}

# 日志轮转检查

function check_log_rotation() {
local current_time=$(date +”%Y-%m-%d %H:%M:%S”)

# 检查大文件
find /var/log -name "*.log" -size +100M 2>/dev/null | while read large_file; do
    local file_size=$(du -h "$large_file" | cut -f1)
    echo "[$current_time] [WARNING] 发现大日志文件: $large_file ($file_size)" >> $ALERT_LOG
done

# 检查磁盘空间
local log_disk_usage=$(df /var/log | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
if [ $log_disk_usage -gt 80 ]; then
    echo "[$current_time] [WARNING] 日志分区磁盘使用率过高: ${log_disk_usage}%" >> $ALERT_LOG
    send_app_alert "WARNING" "

版权所有,如有侵权请联系我