Linux系统监控:构建全方位运维监控体系

在现代IT基础设施中,Linux系统监控是保障服务稳定运行的关键环节。一个完善的监控体系不仅能够及时发现问题,还能预测潜在风险,为运维决策提供数据支撑。本文将深入探讨Linux系统监控的各个方面,从基础监控指标到企业级监控架构的设计与实现。

系统监控基础理论

监控的核心价值

系统监控在现代运维中发挥着至关重要的作用:

  1. 故障预防:通过监控关键指标,在问题发生前进行预警
  2. 快速定位:当故障发生时,快速定位问题根源
  3. 性能优化:基于监控数据进行系统性能调优
  4. 容量规划:为系统扩容和资源规划提供数据依据
  5. SLA保障:确保服务水平协议的达成

监控指标体系

1. 系统资源监控

CPU监控指标

  • CPU使用率(user、system、idle、iowait)
  • 负载平均值(1分钟、5分钟、15分钟)
  • CPU核心数和频率
  • 进程和线程数量

内存监控指标

  • 内存使用率和可用内存
  • Swap使用情况
  • 缓存和缓冲区使用量
  • 内存分配和释放速率

磁盘监控指标

  • 磁盘使用率和可用空间
  • 磁盘I/O读写速率
  • 磁盘队列长度和响应时间
  • inode使用情况

网络监控指标

  • 网络接口流量(入站/出站)
  • 网络连接数和状态
  • 网络错误和丢包率
  • TCP连接状态分布

2. 应用层监控

进程监控

  • 进程状态和资源使用
  • 进程启动时间和运行时长
  • 进程文件描述符使用
  • 进程内存映射

服务监控

  • 服务可用性和响应时间
  • 服务端口监听状态
  • 服务日志错误统计
  • 服务依赖关系检查

基础监控工具与命令

系统内置监控命令

1. 实时监控命令

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash

# Print a point-in-time CPU, memory and load report to stdout.
# Takes no arguments; reads top(1), free(1), uptime(1) and /proc.
monitor_cpu_memory() {
  printf '%s\n' "=== CPU和内存监控 ==="

  # System overview: first 20 lines of a single batch-mode top iteration
  top -bn1 | head -20

  # CPU identification lines, capped at 10
  printf '\n%s\n' "=== 详细CPU信息 ==="
  grep -E "processor|model name|cpu MHz" /proc/cpuinfo | head -10

  # Human-readable summary followed by the raw kernel counters
  printf '\n%s\n' "=== 内存详细信息 ==="
  free -h
  grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo

  # Load averages, both formatted and raw
  printf '\n%s\n' "=== 负载平均值 ==="
  uptime
  cat /proc/loadavg
}

# Print disk capacity, inode usage and I/O activity to stdout.
# iostat (sysstat package) and iotop are optional; a notice is printed
# instead when either tool is not installed.
monitor_disk() {
  printf '%s\n' "=== 磁盘使用监控 ==="
  df -h

  printf '\n%s\n' "=== inode使用情况 ==="
  df -i

  printf '\n%s\n' "=== 磁盘I/O统计 ==="
  if ! command -v iostat > /dev/null 2>&1; then
    echo "iostat命令未安装,请安装sysstat包"
  else
    # Extended per-device statistics: 3 reports, 1 second apart
    iostat -x 1 3
  fi

  printf '\n%s\n' "=== 磁盘读写活动 ==="
  if ! command -v iotop > /dev/null 2>&1; then
    echo "iotop命令未安装"
  else
    # One batch-mode iteration of per-process I/O
    iotop -b -n 1
  fi
}

# Print interface counters, TCP state distribution, listening ports,
# interface addresses and the routing table to stdout.
monitor_network() {
  printf '%s\n' "=== 网络监控 ==="
  cat /proc/net/dev

  # Tally TCP sockets by the last column of netstat output (the state)
  printf '\n%s\n' "=== 网络连接统计 ==="
  netstat -an | awk '
    /^tcp/ { ++state[$NF] }
    END    { for (key in state) print key "\t" state[key] }'

  printf '\n%s\n' "=== 监听端口 ==="
  netstat -tlnp

  printf '\n%s\n' "=== 网络接口信息 ==="
  ip addr show

  printf '\n%s\n' "=== 路由表 ==="
  ip route show
}

# Report process counts by state, the top CPU and memory consumers, and
# file descriptor usage to stdout. Takes no arguments.
monitor_processes() {
  local states fd_dir pid fd_count cmd

  echo "=== 进程监控 ==="

  # One header-less snapshot of every process state, so the total is not
  # inflated by the ps header row and all four counts describe the same instant.
  states=$(ps -eo stat --no-headers)
  echo "总进程数: $(awk 'NF {n++} END {print n+0}' <<<"$states")"
  echo "运行中进程数: $(awk '$1 ~ /^R/ {n++} END {print n+0}' <<<"$states")"
  echo "睡眠进程数: $(awk '$1 ~ /^S/ {n++} END {print n+0}' <<<"$states")"
  echo "僵尸进程数: $(awk '$1 ~ /^Z/ {n++} END {print n+0}' <<<"$states")"

  # Header row plus ten processes
  echo -e "\n=== CPU使用率最高的10个进程 ==="
  ps aux --sort=-%cpu | head -11

  echo -e "\n=== 内存使用率最高的10个进程 ==="
  ps aux --sort=-%mem | head -11

  echo -e "\n=== 文件描述符使用情况 ==="
  echo "系统文件描述符限制: $(cat /proc/sys/fs/file-max)"
  echo "当前使用的文件描述符: $(awk '{print $1}' /proc/sys/fs/file-nr)"

  # Rank ALL processes by open descriptors (not just the first 20 PIDs).
  # The count is emitted first and tab-separated so sorting is unaffected
  # by command names that contain spaces; awk restores "pid cmd count".
  echo -e "\n=== 文件描述符使用最多的10个进程 ==="
  for fd_dir in /proc/[0-9]*/fd; do
    [[ -d "$fd_dir" ]] || continue
    # Unreadable directories (other users' processes) yield 0 and are skipped
    fd_count=$(ls "$fd_dir" 2>/dev/null | wc -l)
    (( fd_count > 0 )) || continue
    pid=${fd_dir#/proc/}
    pid=${pid%/fd}
    # The process may exit between the glob and this read
    { IFS= read -r cmd < "/proc/$pid/comm"; } 2>/dev/null || cmd=
    printf '%s\t%s\t%s\n' "$fd_count" "$pid" "$cmd"
  done | sort -t $'\t' -k1,1nr | head -10 |
    awk -F '\t' '{print $2, $3, $1}'
}

# Report failed systemd units, the state of a fixed list of critical
# services, uptime and the ten most recent logins to stdout.
# The systemd sections are skipped when systemctl is not available.
monitor_services() {
  local unit_files service status enabled
  local -a critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog")

  echo "=== 系统服务监控 ==="

  if command -v systemctl &> /dev/null; then
    echo "=== 失败的服务 ==="
    systemctl --failed

    echo -e "\n=== 关键服务状态 ==="
    # Query the unit list once instead of once per service
    unit_files=$(systemctl list-unit-files 2>/dev/null)
    for service in "${critical_services[@]}"; do
      # Match the exact "<name>.service" unit; a bare prefix match would
      # treat e.g. network-online.target as the "network" service.
      if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files"; then
        status=$(systemctl is-active "$service" 2>/dev/null)
        enabled=$(systemctl is-enabled "$service" 2>/dev/null)
        echo "$service: $status ($enabled)"
      fi
    done
  fi

  echo -e "\n=== 系统启动时间 ==="
  uptime

  echo -e "\n=== 最近登录用户 ==="
  last | head -10
}

# Print the report banner (date, host, kernel, OS release) and then run
# every monitor_* section in order, separated by two blank lines.
main_monitor() {
  local rule="==========================================="
  local section

  echo "$rule"
  echo "Linux系统监控报告 - $(date)"
  echo "主机名: $(hostname)"
  echo "内核版本: $(uname -r)"
  echo "系统版本: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
  echo "$rule"

  # First section follows the banner directly; each later one is
  # preceded by the two-newline separator.
  monitor_cpu_memory
  for section in monitor_disk monitor_network monitor_processes monitor_services; do
    printf '\n\n'
    "$section"
  done
}

# Entry point: generate the full monitoring report
main_monitor

2. 高级监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/bin/bash

# System performance monitoring script
# Collects detailed system performance data

# Directory that receives every log file written by this script
MONITOR_DIR="/var/log/system-monitor"
# Day stamp used in the log file names
DATE=$(date +"%Y%m%d")
# Captured once at start-up and used as the prefix of every log entry
TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")

# Create the monitoring log directory
mkdir -p $MONITOR_DIR

# Append a raw performance snapshot (CPU, memory, disk, network counters
# from /proc) to $MONITOR_DIR/performance_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
collect_performance_data() {
  local output_file="$MONITOR_DIR/performance_$DATE.log"

  {
    echo "[$TIMESTAMP] === 性能数据收集开始 ==="

    # CPU
    echo "[CPU] 负载平均值:"
    cat /proc/loadavg

    echo "[CPU] CPU使用率详情:"
    grep 'cpu ' /proc/stat

    echo "[CPU] 上下文切换和中断:"
    grep -E 'ctxt|intr|processes|procs_running|procs_blocked' /proc/stat

    # Memory
    echo "[MEMORY] 内存使用详情:"
    cat /proc/meminfo

    echo "[MEMORY] 虚拟内存统计:"
    grep -E 'pgpgin|pgpgout|pswpin|pswpout|pgfault|pgmajfault' /proc/vmstat

    # Disk I/O
    echo "[DISK] 磁盘统计:"
    cat /proc/diskstats

    # Network
    echo "[NETWORK] 网络接口统计:"
    cat /proc/net/dev

    echo "[NETWORK] TCP统计:"
    grep Tcp /proc/net/snmp

    echo "[$TIMESTAMP] === 性能数据收集结束 ==="
    echo ""

  # Quoted so a MONITOR_DIR containing spaces is not an "ambiguous redirect"
  } >> "$output_file"
}

# Run threshold checks (load, memory, disk, zombies, critical services).
# Every finding is written to $MONITOR_DIR/health_check_$DATE.log and also
# appended to $MONITOR_DIR/alerts_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
system_health_check() {
  local output_file="$MONITOR_DIR/health_check_$DATE.log"
  local alert_file="$MONITOR_DIR/alerts_$DATE.log"
  local load_1min cpu_cores load_threshold
  local mem_total mem_available mem_usage_percent
  local zombie_count unit_files service
  local -a critical_services=("sshd" "network" "firewalld")

  {
    echo "[$TIMESTAMP] === 系统健康检查开始 ==="

    # Load: alert above 80% of the core count. awk does the floating-point
    # work so the check does not depend on bc being installed.
    read -r load_1min _ < /proc/loadavg
    cpu_cores=$(nproc)
    load_threshold=$(awk -v c="$cpu_cores" 'BEGIN {printf "%.1f", c * 0.8}')
    if awk -v l="$load_1min" -v t="$load_threshold" 'BEGIN {exit !(l > t)}'; then
      echo "[ALERT] CPU负载过高: $load_1min (阈值: $load_threshold)" | tee -a "$alert_file"
    fi

    # Memory: alert above 90% used (total minus MemAvailable)
    mem_total=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
    mem_available=$(awk '/^MemAvailable:/ {print $2}' /proc/meminfo)
    if [[ -n "$mem_total" && -n "$mem_available" ]] && (( mem_total > 0 )); then
      mem_usage_percent=$(awk -v t="$mem_total" -v a="$mem_available" \
        'BEGIN {printf "%.2f", (t - a) * 100 / t}')
      if awk -v u="$mem_usage_percent" 'BEGIN {exit !(u > 90)}'; then
        echo "[ALERT] 内存使用率过高: ${mem_usage_percent}%" | tee -a "$alert_file"
      fi
    fi

    # Disk: "$5 + 0" forces a numeric comparison. Once gsub has rewritten the
    # field it is a string, and as a string "100" sorts below "90".
    df -hP | awk 'NR>1 {gsub(/%/, "", $5); if ($5 + 0 > 90) print "[ALERT] 磁盘使用率过高: " $6 " (" $5 "%)";}' | tee -a "$alert_file"

    # Zombie processes
    zombie_count=$(ps aux | awk '$8 ~ /^Z/ {count++} END {print count+0}')
    if (( zombie_count > 0 )); then
      echo "[ALERT] 发现僵尸进程: $zombie_count 个" | tee -a "$alert_file"
    fi

    # Critical services: only units that exist as "<name>.service" are checked
    unit_files=$(systemctl list-unit-files 2>/dev/null)
    for service in "${critical_services[@]}"; do
      if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files" \
        && ! systemctl is-active --quiet "$service"; then
        echo "[ALERT] 关键服务未运行: $service" | tee -a "$alert_file"
      fi
    done

    echo "[$TIMESTAMP] === 系统健康检查结束 ==="
    echo ""

  } >> "$output_file"
}

# Append TCP state counts, the busiest remote peers, listening ports and
# per-interface byte counters to $MONITOR_DIR/network_connections_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
monitor_network_connections() {
  local output_file="$MONITOR_DIR/network_connections_$DATE.log"

  {
    echo "[$TIMESTAMP] === 网络连接监控开始 ==="

    # Tally sockets by the last netstat column (the TCP state)
    echo "[TCP] 连接状态统计:"
    netstat -an | awk '/^tcp/ {++state[$NF]} END {for(key in state) print key"\t"state[key]}'

    # Strip only the trailing ":port" from the foreign address. Cutting at
    # the first colon would truncate IPv6 peers such as ::ffff:10.0.0.1.
    echo "[TCP] 连接数最多的IP地址 (Top 10):"
    netstat -an \
      | awk '/ESTABLISHED/ {addr = $5; sub(/:[^:]*$/, "", addr); print addr}' \
      | sort | uniq -c | sort -nr | head -10

    echo "[TCP] 当前监听端口:"
    netstat -tlnp | grep LISTEN

    # Fields 2 and 10 of /proc/net/dev are received and transmitted bytes
    echo "[NETWORK] 网络接口流量:"
    awk 'NR>2 {printf "%-10s RX: %10d bytes TX: %10d bytes\n", $1, $2, $10}' /proc/net/dev

    echo "[$TIMESTAMP] === 网络连接监控结束 ==="
    echo ""

  } >> "$output_file"
}

# Append the top CPU and memory consumers plus system-wide file descriptor
# usage to $MONITOR_DIR/process_resources_$DATE.log.
# Globals: MONITOR_DIR, DATE, TIMESTAMP (read)
monitor_process_resources() {
  local output_file="$MONITOR_DIR/process_resources_$DATE.log"
  local fd_used fd_max

  {
    echo "[$TIMESTAMP] === 进程资源使用监控开始 ==="

    # Header row is passed through; data rows are reduced to
    # PID, USER, %CPU, %MEM and the first word of the command.
    echo "[PROCESS] CPU使用率最高的进程 (Top 10):"
    ps aux --sort=-%cpu | head -11 | awk 'NR==1 {print $0} NR>1 {printf "%-8s %-10s %6s %6s %-20s\n", $2, $1, $3, $4, $11}'

    echo "[PROCESS] 内存使用最多的进程 (Top 10):"
    ps aux --sort=-%mem | head -11 | awk 'NR==1 {print $0} NR>1 {printf "%-8s %-10s %6s %6s %-20s\n", $2, $1, $3, $4, $11}'

    # Read both kernel values in the shell and pass them to awk with -v.
    # A $(...) inside the single-quoted awk program is never expanded by
    # the shell, so awk cannot read file-max that way.
    echo "[PROCESS] 文件描述符使用统计:"
    fd_max=$(cat /proc/sys/fs/file-max)
    read -r fd_used _ < /proc/sys/fs/file-nr
    echo "系统限制: $fd_max"
    echo "当前使用: $fd_used"
    echo "使用率: $(awk -v used="$fd_used" -v max="$fd_max" \
      'BEGIN {if (max > 0) printf "%.2f%%", (used / max) * 100; else printf "N/A"}')"

    echo "[$TIMESTAMP] === 进程资源使用监控结束 ==="
    echo ""

  } >> "$output_file"
}

# Delete *.log files under MONITOR_DIR that were last modified more than
# 7 days ago.
# Globals: MONITOR_DIR (read)
# Returns: 1 without deleting anything when MONITOR_DIR is empty or not a
#          directory. An unquoted empty value would otherwise make find
#          start (and delete) from the current directory.
cleanup_old_logs() {
  if [[ -z "${MONITOR_DIR:-}" || ! -d "$MONITOR_DIR" ]]; then
    echo "cleanup_old_logs: MONITOR_DIR is unset or not a directory" >&2
    return 1
  fi
  find "$MONITOR_DIR" -type f -name "*.log" -mtime +7 -delete
}

# Run every collector in order, prune old logs, then print today's alerts
# (if any) to stdout.
# Globals: MONITOR_DIR, DATE (read)
main() {
  local alert_file="$MONITOR_DIR/alerts_$DATE.log"

  echo "开始系统监控数据收集..."

  collect_performance_data
  system_health_check
  monitor_network_connections
  monitor_process_resources
  cleanup_old_logs

  echo "监控数据收集完成,日志保存在: $MONITOR_DIR"

  # -s is true only for an existing, non-empty file; the path is quoted so
  # a MONITOR_DIR containing spaces still works.
  if [[ -s "$alert_file" ]]; then
    echo "发现系统告警:"
    cat "$alert_file"
  fi
}

# Print an advisory (without aborting) when not running as root
if [ "$EUID" -ne 0 ]; then
echo "建议以root权限运行以获取完整的监控数据"
fi

# Run the main function
main

企业级监控解决方案

Prometheus + Grafana 监控架构

1. Prometheus 配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-west-1'

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 5s
    metrics_path: /metrics

  # Node Exporter
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'server1:9100'
          - 'server2:9100'
          - 'server3:9100'
    scrape_interval: 10s
    metrics_path: /metrics
    relabel_configs:
      # Use the host part of the scrape address as the instance label
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):.+'
        replacement: '${1}'

  # Application metrics
  - job_name: 'application'
    static_configs:
      - targets:
          - 'app1:8080'
          - 'app2:8080'
    metrics_path: /actuator/prometheus
    scrape_interval: 15s

  # MySQL
  - job_name: 'mysql'
    static_configs:
      - targets:
          - 'mysql-exporter:9104'
    scrape_interval: 30s

  # Redis
  - job_name: 'redis'
    static_configs:
      - targets:
          - 'redis-exporter:9121'
    scrape_interval: 30s

  # Nginx
  - job_name: 'nginx'
    static_configs:
      - targets:
          - 'nginx-exporter:9113'
    scrape_interval: 30s

  # Service discovery (Consul)
  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'consul:8500'
        services: []
    relabel_configs:
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: [__meta_consul_node]
        target_label: instance

2. 告警规则配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# rules/system-alerts.yml
groups:
  - name: system.rules
    rules:
      # CPU usage
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}"

      # Memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
          category: system
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}"

      # Disk usage
      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk usage detected"
          description: "Disk usage is above 90% on {{ $labels.instance }} mount point {{ $labels.mountpoint }}"

      # System load relative to CPU core count
      - alert: HighSystemLoad
        expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 0.8
        for: 10m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High system load detected"
          description: "System load is above 80% of CPU cores for more than 10 minutes on {{ $labels.instance }}"

      # Established TCP connections
      - alert: HighNetworkConnections
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 5m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "High number of TCP connections"
          description: "Number of established TCP connections is above 1000 on {{ $labels.instance }}"

      # Scrape target unavailable
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          category: availability
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.job }} on {{ $labels.instance }} is down"

      # Disk I/O utilisation
      - alert: HighDiskIOUsage
        expr: irate(node_disk_io_time_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
          category: system
        annotations:
          summary: "High disk I/O usage detected"
          description: "Disk I/O usage is above 80% for more than 5 minutes on {{ $labels.instance }} device {{ $labels.device }}"

  - name: application.rules
    rules:
      # Application response time
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          category: application
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is above 1 second for {{ $labels.instance }}"

      # Application error rate
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) * 100 > 5
        for: 5m
        labels:
          severity: critical
          category: application
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for {{ $labels.instance }}"

3. Node Exporter 部署

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash

# Node Exporter installation script.
# Downloads the pinned release, copies the binary into INSTALL_DIR and
# registers a systemd unit that runs it under a dedicated account.

# Stop at the first failed step so a broken download is never installed
set -euo pipefail

NODE_EXPORTER_VERSION="1.6.1"
# Named SERVICE_USER rather than USER so the caller's own $USER is not overwritten
SERVICE_USER="node_exporter"
INSTALL_DIR="/opt/node_exporter"
SERVICE_FILE="/etc/systemd/system/node_exporter.service"
ARCHIVE="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"

# Private scratch directory, removed on every exit path
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT

# Create the service account only when it is missing, so re-runs succeed
if ! id -u "$SERVICE_USER" > /dev/null 2>&1; then
  sudo useradd --no-create-home --shell /bin/false "$SERVICE_USER"
fi

# Download and unpack Node Exporter
cd "$WORK_DIR"
wget "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/${ARCHIVE}.tar.gz"
tar xzf "${ARCHIVE}.tar.gz"

# Install the binary
sudo mkdir -p "$INSTALL_DIR"
sudo cp "${ARCHIVE}/node_exporter" "$INSTALL_DIR/"
sudo chown -R "$SERVICE_USER:$SERVICE_USER" "$INSTALL_DIR"

# Write the systemd unit. The heredoc delimiter is unquoted, so variables
# are expanded and each backslash-newline is consumed by the shell:
# ExecStart ends up on a single line in the unit file.
sudo tee "$SERVICE_FILE" > /dev/null <<EOF
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
User=$SERVICE_USER
Group=$SERVICE_USER
Type=simple
ExecStart=$INSTALL_DIR/node_exporter \
--collector.systemd \
--collector.processes \
--collector.interrupts \
--collector.tcpstat \
--collector.meminfo_numa \
--web.listen-address=:9100
Restart=always
RestartSec=3

[Install]
WantedBy=multi-user.target
EOF

# Start the service
sudo systemctl daemon-reload
sudo systemctl enable node_exporter
sudo systemctl start node_exporter

# Show the service state; --no-pager keeps the script non-interactive
sudo systemctl --no-pager status node_exporter

echo "Node Exporter 安装完成!"
echo "访问 http://$(hostname -I | awk '{print $1}'):9100/metrics 查看指标"

4. 自定义监控指标

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash

# Custom metrics collection script.
# Writes Prometheus text-format samples to a .prom file for the
# node_exporter textfile collector.

METRICS_FILE="/var/lib/node_exporter/textfile_collector/custom_metrics.prom"
TMP_FILE="${METRICS_FILE}.tmp"

# Make sure the collector directory exists
mkdir -p "$(dirname "$METRICS_FILE")"

# Everything printed inside this group is redirected into TMP_FILE
{
  echo "# HELP custom_system_info System information"
  echo "# TYPE custom_system_info gauge"

  # Kernel and OS release; VERSION_ID may be quoted or unquoted in os-release
  kernel_version=$(uname -r)
  os_version=$(grep '^VERSION_ID=' /etc/os-release 2>/dev/null | cut -d= -f2 | tr -d '"')
  echo "custom_system_info{kernel_version=\"$kernel_version\",os_version=\"$os_version\"} 1"

  echo ""
  echo "# HELP custom_disk_inodes_usage Disk inode usage percentage"
  echo "# TYPE custom_disk_inodes_usage gauge"

  # Inode usage per filesystem, skipping tmpfs, /dev and /run
  df -i | awk 'NR>1 && $1 !~ /^tmpfs/ && $6 != "/dev" && $6 != "/run" {
    gsub(/%/, "", $5)
    if ($5 != "-" && $5 != "")
      printf "custom_disk_inodes_usage{device=\"%s\",mountpoint=\"%s\"} %s\n", $1, $6, $5
  }'

  echo ""
  echo "# HELP custom_tcp_connection_states TCP connection states count"
  echo "# TYPE custom_tcp_connection_states gauge"

  # TCP sockets per state. The label is sanitised on a copy so the array
  # key used for the lookup is never modified while iterating.
  netstat -an | awk '/^tcp/ {state[$NF]++} END {
    for (s in state) {
      label = s
      gsub(/[^A-Z_]/, "", label)
      if (label != "") printf "custom_tcp_connection_states{state=\"%s\"} %d\n", label, state[s]
    }
  }'

  echo ""
  echo "# HELP custom_process_count Process count by state"
  echo "# TYPE custom_process_count gauge"

  # Processes per state, keyed on the first letter of the STAT column
  ps aux | awk 'NR>1 {
    if ($8 ~ /^R/) running++
    else if ($8 ~ /^S/) sleeping++
    else if ($8 ~ /^D/) uninterruptible++
    else if ($8 ~ /^Z/) zombie++
    else if ($8 ~ /^T/) stopped++
    total++
  } END {
    printf "custom_process_count{state=\"running\"} %d\n", running+0
    printf "custom_process_count{state=\"sleeping\"} %d\n", sleeping+0
    printf "custom_process_count{state=\"uninterruptible\"} %d\n", uninterruptible+0
    printf "custom_process_count{state=\"zombie\"} %d\n", zombie+0
    printf "custom_process_count{state=\"stopped\"} %d\n", stopped+0
    printf "custom_process_count{state=\"total\"} %d\n", total+0
  }'

  echo ""
  echo "# HELP custom_service_status Service status (1=active, 0=inactive)"
  echo "# TYPE custom_service_status gauge"

  # Critical services. The unit list is fetched once, and only an exact
  # "<name>.service" entry counts as installed.
  critical_services=("sshd" "network" "firewalld" "chronyd" "rsyslog" "docker")
  unit_files=$(systemctl list-unit-files 2>/dev/null)
  for service in "${critical_services[@]}"; do
    if grep -q "^${service}\.service[[:space:]]" <<<"$unit_files"; then
      if systemctl is-active --quiet "$service"; then
        echo "custom_service_status{service=\"$service\"} 1"
      else
        echo "custom_service_status{service=\"$service\"} 0"
      fi
    fi
  done

  echo ""
  echo "# HELP custom_file_descriptor_usage File descriptor usage"
  echo "# TYPE custom_file_descriptor_usage gauge"

  # System-wide file descriptors. awk computes the percentage so an
  # empty value is never emitted when bc is not installed.
  fd_used=$(awk '{print $1}' /proc/sys/fs/file-nr)
  fd_max=$(cat /proc/sys/fs/file-max)
  fd_usage_percent=$(awk -v used="$fd_used" -v max="$fd_max" \
    'BEGIN {if (max > 0) printf "%.2f", used * 100 / max; else printf "0"}')

  echo "custom_file_descriptor_usage{type=\"used\"} $fd_used"
  echo "custom_file_descriptor_usage{type=\"max\"} $fd_max"
  echo "custom_file_descriptor_usage{type=\"usage_percent\"} $fd_usage_percent"

  echo ""
  echo "# HELP custom_last_boot_time Last boot time in seconds since epoch"
  echo "# TYPE custom_last_boot_time gauge"

  # Modification time of /proc/1 is used as the boot time
  boot_time=$(stat -c %Y /proc/1)
  echo "custom_last_boot_time $boot_time"

  echo ""
  echo "# HELP custom_security_updates Security updates available"
  echo "# TYPE custom_security_updates gauge"

  # Pending security updates. grep -c already prints 0 when nothing matches
  # (while exiting 1), so no "|| echo 0" fallback is used: it would add a
  # second "0" line and corrupt the sample.
  if command -v apt &> /dev/null; then
    security_updates=$(apt list --upgradable 2>/dev/null | grep -c security)
    echo "custom_security_updates ${security_updates:-0}"
  elif command -v yum &> /dev/null; then
    security_updates=$(yum --security check-update 2>/dev/null | grep -c "updates")
    echo "custom_security_updates ${security_updates:-0}"
  else
    echo "custom_security_updates 0"
  fi

} > "$TMP_FILE"

# Publish atomically: rename the finished file over the live one
mv "$TMP_FILE" "$METRICS_FILE"

echo "自定义指标已更新: $METRICS_FILE"

Grafana 仪表板配置

1. 系统概览仪表板

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
{
"dashboard": {
"id": null,
"title": "Linux系统监控概览",
"tags": ["linux", "system", "monitoring"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "CPU使用率",
"type": "stat",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 70},
{"color": "red", "value": 90}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
},
{
"id": 2,
"title": "内存使用率",
"type": "stat",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
},
{
"id": 3,
"title": "磁盘使用率",
"type": "stat",
"targets": [
{
"expr": "max by(instance) ((1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 80},
{"color": "red", "value": 95}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
},
{
"id": 4,
"title": "系统负载",
"type": "stat",
"targets": [
{
"expr": "node_load1",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "short",
"decimals": 2,
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 2},
{"color": "red", "value": 4}
]
}
}
},
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}
},
{
"id": 5,
"title": "CPU使用率趋势",
"type": "timeseries",
"targets": [
{
"expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 6,
"title": "内存使用趋势",
"type": "timeseries",
"targets": [
{
"expr": "node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes",
"legendFormat": "已使用 - {{instance}}"
},
{
"expr": "node_memory_MemAvailable_bytes",
"legendFormat": "可用 - {{instance}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}

日志监控与分析

1. 系统日志监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/bin/bash

# System log monitoring script
# Watches key system logs and generates alerts

LOG_DIR="/var/log"
ALERT_LOG="/var/log/system-alerts.log"
CONFIG_FILE="/etc/log-monitor.conf"
LAST_CHECK_FILE="/var/lib/log-monitor/last-check"

# Create the directory that holds the last-check marker
mkdir -p $(dirname $LAST_CHECK_FILE)

# Write a default configuration when none exists. The heredoc body is the
# file content: one "log path:keyword:alert level" rule per line.
if [ ! -f $CONFIG_FILE ]; then
cat > $CONFIG_FILE << 'EOF'
# 日志监控配置文件
# 格式: 日志文件路径:关键词:告警级别
/var/log/messages:error:WARNING
/var/log/messages:critical:CRITICAL
/var/log/messages:panic:CRITICAL
/var/log/secure:Failed password:WARNING
/var/log/secure:authentication failure:WARNING
/var/log/secure:Invalid user:WARNING
/var/log/kern.log:Out of memory:CRITICAL
/var/log/kern.log:segfault:WARNING
/var/log/kern.log:BUG:CRITICAL
/var/log/syslog:error:WARNING
/var/log/syslog:failed:WARNING
/var/log/audit/audit.log:ANOM:WARNING
/var/log/audit/audit.log:denied:WARNING
EOF
fi

# Load the previous check time; fall back to one hour ago on the first run
if [ -f $LAST_CHECK_FILE ]; then
LAST_CHECK=$(cat $LAST_CHECK_FILE)
else
LAST_CHECK=$(date -d "1 hour ago" +"%Y-%m-%d %H:%M:%S")
fi

# Record the current time as the new checkpoint
date +"%Y-%m-%d %H:%M:%S" > $LAST_CHECK_FILE

# Scan one syslog-format file for lines newer than LAST_CHECK that match a
# keyword; matches are appended to ALERT_LOG and reported via send_alert.
# Globals:   LAST_CHECK, ALERT_LOG (read)
# Arguments: $1 - log file path
#            $2 - keyword (awk regex, matched case-insensitively)
#            $3 - alert level passed to send_alert
# Returns:   0 when the file is missing or was scanned, 1 when LAST_CHECK
#            cannot be parsed by date(1)
analyze_logs() {
  local log_file=$1
  local keyword=$2
  local alert_level=$3
  local current_time start_key start_year now_key this_year wrapped=0 new_entries

  [[ -f "$log_file" ]] || return 0

  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  # Syslog lines carry no year, so timestamps are compared as fixed-width
  # "MMDDHHMMSS" keys. This needs two date(1) calls in total instead of one
  # fork per log line, and only POSIX awk features.
  if ! start_key=$(date -d "$LAST_CHECK" +"%m%d%H%M%S" 2>/dev/null); then
    echo "analyze_logs: cannot parse LAST_CHECK='$LAST_CHECK'" >&2
    return 1
  fi
  start_year=$(date -d "$LAST_CHECK" +%Y)
  now_key=$(date +"%m%d%H%M%S")
  this_year=$(date +%Y)
  # Checkpoint taken in a previous calendar year: the window wraps past Dec 31
  if (( start_year < this_year )); then
    wrapped=1
  fi

  new_entries=$(awk -v start="$start_key" -v now="$now_key" \
    -v wrapped="$wrapped" -v keyword="$keyword" '
    BEGIN {
      split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= 12; i++) month[names[i]] = i
      start = start ""; now = now ""   # force string comparison of the keys
      kw = tolower(keyword)
    }
    {
      # Only lines that start with "Mon DD HH:MM:SS" are considered
      if (!($1 in month)) next
      if ($2 !~ /^[0-9][0-9]?$/) next
      if ($3 !~ /^[0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/) next

      t = $3
      gsub(/:/, "", t)
      key = sprintf("%02d%02d%s", month[$1], $2, t)

      # A key later than "now" can only belong to the previous year
      if (wrapped) is_new = (key >= start || key <= now)
      else         is_new = (key >= start && key <= now)

      if (is_new && tolower($0) ~ kw) print $0
    }' "$log_file")

  if [[ -n "$new_entries" ]]; then
    echo "[$current_time] [$alert_level] 在 $log_file 中发现关键词 '$keyword':" >> "$ALERT_LOG"
    echo "$new_entries" >> "$ALERT_LOG"
    echo "" >> "$ALERT_LOG"

    # Notify
    send_alert "$alert_level" "日志告警" "在 $log_file 中发现关键词 '$keyword'"
  fi
}

# Dispatch an alert: always log it through logger(1), then broadcast
# CRITICAL alerts with wall(1) and print WARNING alerts to stdout.
# Other levels are only logged.
# Arguments: $1 - level, $2 - title, $3 - message
send_alert() {
  local level=$1 title=$2 message=$3
  local timestamp=$(date +"%Y-%m-%d %H:%M:%S")

  # Record in the system log
  logger -t "log-monitor" "[$level] $title: $message"

  # Mail, SMS or chat-bot integrations can be added to these branches
  if [[ $level == "CRITICAL" ]]; then
    # Urgent: broadcast to every logged-in terminal
    wall <<<"[$timestamp] CRITICAL ALERT: $title - $message"
  elif [[ $level == "WARNING" ]]; then
    echo "[$timestamp] WARNING: $title - $message"
  fi
}

# Check disk, memory and load thresholds; findings are appended to
# ALERT_LOG, and memory and load findings are also sent via send_alert.
# Globals: ALERT_LOG (read)
check_system_resources() {
  local current_time mem_usage load_1min cpu_cores load_threshold
  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  # Disk usage above 90%. The timestamp is handed to awk with -v: inside
  # the single-quoted program a shell variable is never expanded and would
  # be written to the log as literal text.
  df -h | awk -v now="$current_time" 'NR>1 && $5+0 > 90 {
    gsub(/%/, "", $5)
    print "[" now "] [WARNING] 磁盘使用率过高: " $6 " (" $5 "%)"
  }' >> "$ALERT_LOG"

  # Memory usage above 90% (used / total from the second line of free)
  mem_usage=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')
  if [[ "$mem_usage" =~ ^[0-9]+$ ]] && (( mem_usage > 90 )); then
    echo "[$current_time] [WARNING] 内存使用率过高: ${mem_usage}%" >> "$ALERT_LOG"
    send_alert "WARNING" "内存告警" "内存使用率达到 ${mem_usage}%"
  fi

  # 1-minute load above twice the core count; awk handles the
  # floating-point comparison so bc is not required.
  read -r load_1min _ < /proc/loadavg
  cpu_cores=$(nproc)
  load_threshold=$(( cpu_cores * 2 ))

  if awk -v l="$load_1min" -v t="$load_threshold" 'BEGIN {exit !(l > t)}'; then
    echo "[$current_time] [WARNING] CPU负载过高: $load_1min (阈值: $load_threshold)" >> "$ALERT_LOG"
    send_alert "WARNING" "CPU负载告警" "1分钟负载平均值: $load_1min"
  fi
}

# Count the lines of a syslog-format file that contain a literal string and
# are timestamped inside the window LAST_CHECK..now. Prints the count.
# Arguments: $1 - literal text to look for, $2 - log file path
_count_since_last_check() {
  local needle=$1 log_file=$2
  local start_key now_key wrapped=0

  if [[ ! -r "$log_file" ]] \
    || ! start_key=$(date -d "$LAST_CHECK" +"%m%d%H%M%S" 2>/dev/null); then
    echo 0
    return 0
  fi
  now_key=$(date +"%m%d%H%M%S")
  # Checkpoint taken in a previous calendar year: the window wraps past Dec 31
  if (( $(date -d "$LAST_CHECK" +%Y) < $(date +%Y) )); then
    wrapped=1
  fi

  awk -v start="$start_key" -v now="$now_key" -v wrapped="$wrapped" \
    -v needle="$needle" '
    BEGIN {
      split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
      for (i = 1; i <= 12; i++) month[names[i]] = i
      start = start ""; now = now ""   # force string comparison of the keys
    }
    index($0, needle) && ($1 in month) && $2 ~ /^[0-9][0-9]?$/ &&
    $3 ~ /^[0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/ {
      t = $3
      gsub(/:/, "", t)
      key = sprintf("%02d%02d%s", month[$1], $2, t)
      if (wrapped ? (key >= start || key <= now) : (key >= start && key <= now)) count++
    }
    END { print count + 0 }' "$log_file"
}

# Alert on bursts of failed SSH logins and note heavy sudo use since the
# last check. Comparing a raw "Mon DD ..." line against a "YYYY-..." string
# is always true, so timestamps are parsed instead.
# Globals:   LAST_CHECK, ALERT_LOG (read)
# Arguments: $1 - auth log to scan (optional, default /var/log/secure)
check_security_events() {
  local secure_log=${1:-/var/log/secure}
  local current_time failed_ssh sudo_usage
  current_time=$(date +"%Y-%m-%d %H:%M:%S")

  # Failed SSH logins: more than 10 since the last check
  failed_ssh=$(_count_since_last_check "Failed password" "$secure_log")
  if (( failed_ssh > 10 )); then
    echo "[$current_time] [WARNING] SSH登录失败次数过多: $failed_ssh 次" >> "$ALERT_LOG"
    send_alert "WARNING" "安全告警" "检测到 $failed_ssh 次SSH登录失败"
  fi

  # sudo invocations: more than 50 since the last check (log only)
  sudo_usage=$(_count_since_last_check "sudo:" "$secure_log")
  if (( sudo_usage > 50 )); then
    echo "[$current_time] [INFO] sudo使用频繁: $sudo_usage 次" >> "$ALERT_LOG"
  fi
}

# Run one monitoring pass: apply every rule in CONFIG_FILE, check system
# resources and security events, then prune a stale alert log.
# Globals: CONFIG_FILE, ALERT_LOG (read)
main() {
  local log_file keyword alert_level alert_dir alert_name

  echo "开始日志监控检查 - $(date)"

  # One "path:keyword:level" rule per line. The "|| [[ -n ... ]]" clause
  # keeps a final line that has no trailing newline.
  while IFS=':' read -r log_file keyword alert_level || [[ -n "$log_file" ]]; do
    # Skip blank lines and comments
    if [[ -z "$log_file" || "$log_file" == \#* ]]; then
      continue
    fi

    analyze_logs "$log_file" "$keyword" "$alert_level"
  done < "$CONFIG_FILE"

  # System resource checks
  check_system_resources

  # Security event checks
  check_security_events

  echo "日志监控检查完成 - $(date)"

  # Remove only this script's own alert log (and files sharing its name as
  # a prefix) once untouched for 7 days. Matching every *.log under the
  # alert log's directory would wipe unrelated system logs.
  alert_dir=$(dirname "$ALERT_LOG")
  alert_name=$(basename "$ALERT_LOG")
  find "$alert_dir" -maxdepth 1 -type f -name "${alert_name}*" -mtime +7 -delete 2>/dev/null
}

# Run the main function
main

2. 应用日志监控

#!/bin/bash

# Application log monitoring script
# Dedicated to monitoring application logs

APP_LOG_DIR="/var/log/applications"
ALERT_LOG="/var/log/app-alerts.log"
CONFIG_FILE="/etc/app-log-monitor.conf"
METRICS_FILE="/var/lib/node_exporter/textfile_collector/app_log_metrics.prom"

# Create the directories that will hold the alert log and the metrics file
mkdir -p $(dirname $ALERT_LOG)
mkdir -p $(dirname $METRICS_FILE)

# Write a default configuration when none exists. The heredoc body is the
# file content: one "app:log file:error pattern:per-minute threshold" rule per line.
if [ ! -f $CONFIG_FILE ]; then
    cat > $CONFIG_FILE << 'EOF'
# 应用日志监控配置
# 格式: 应用名:日志文件:错误模式:告警阈值(每分钟)
web-app:/var/log/applications/web-app.log:ERROR:5
api-service:/var/log/applications/api.log:Exception:3
database:/var/log/mysql/error.log:ERROR:2
nginx:/var/log/nginx/error.log:error:10
redis:/var/log/redis/redis-server.log:WARNING:5
EOF
fi

# Count recent error lines in one application log, append the count to the
# pending metrics file ($METRICS_FILE.tmp) and raise an alert when the
# count exceeds the threshold.
# Globals:   METRICS_FILE, ALERT_LOG (read)
# Arguments: $1 - application name
#            $2 - log file path
#            $3 - error pattern (matched case-insensitively)
#            $4 - alert threshold (errors in the last minute)
analyze_app_logs() {
    local app_name=$1
    local log_file=$2
    local error_pattern=$3
    local threshold=$4
    local metrics_tmp="$METRICS_FILE.tmp"
    local current_time=$(date +"%Y-%m-%d %H:%M:%S")
    local one_minute_ago=$(date -d "1 minute ago" +"%Y-%m-%d %H:%M:%S")

    if [ ! -f "$log_file" ]; then
        return
    fi

    # Errors in the last minute. Simplified comparison: assumes each line
    # starts with a "YYYY-MM-DD HH:MM:SS" timestamp.
    local error_count=$(awk -v start="$one_minute_ago" -v pattern="$error_pattern" '
        {
            if ($0 >= start && tolower($0) ~ tolower(pattern)) {
                count++
            }
        } END {print count+0}' "$log_file")

    # This function runs once per application, so HELP/TYPE are written
    # only on the first call; repeating them for the same metric makes the
    # exposition file invalid.
    if ! grep -q '^# TYPE app_log_errors_total ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_log_errors_total Total number of application log errors" >> "$metrics_tmp"
        echo "# TYPE app_log_errors_total counter" >> "$metrics_tmp"
    fi
    echo "app_log_errors_total{app=\"$app_name\",pattern=\"$error_pattern\"} $error_count" >> "$metrics_tmp"

    # Threshold check
    if [ "$error_count" -gt "$threshold" ]; then
        local alert_msg="应用 $app_name 在最近1分钟内出现 $error_count 个 '$error_pattern' 错误(阈值: $threshold)"
        echo "[$current_time] [WARNING] $alert_msg" >> "$ALERT_LOG"

        # Attach the three most recent matching lines as a sample
        echo "错误样本:" >> "$ALERT_LOG"
        grep -i -- "$error_pattern" "$log_file" | tail -3 >> "$ALERT_LOG"
        echo "" >> "$ALERT_LOG"

        # Notify
        send_app_alert "WARNING" "应用日志告警" "$alert_msg"
    fi
}

# Derive the average response time and the request count from one
# application log and append both to the pending metrics file
# ($METRICS_FILE.tmp).
# Globals:   METRICS_FILE (read)
# Arguments: $1 - application name
#            $2 - log file path
analyze_app_performance() {
    local app_name=$1
    local log_file=$2
    local metrics_tmp="$METRICS_FILE.tmp"
    local avg_response_time request_count

    if [ ! -f "$log_file" ]; then
        return
    fi

    # Average of every "response_time=<n>" / "response_time:<n>" value.
    # Two-argument match() plus substr() keeps this portable: the
    # three-argument form with a capture array is a gawk extension.
    # 14 = length of "response_time" plus the one-character separator.
    avg_response_time=$(awk '
        match($0, /response_time[=:][0-9.]+/) {
            sum += substr($0, RSTART + 14, RLENGTH - 14)
            count++
        }
        END {
            if (count > 0) print sum/count
            else print 0
        }' "$log_file")

    # HELP/TYPE are written once per metric: this function runs once per
    # application, and repeating them makes the exposition file invalid.
    if ! grep -q '^# TYPE app_response_time_seconds ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_response_time_seconds Average response time in seconds" >> "$metrics_tmp"
        echo "# TYPE app_response_time_seconds gauge" >> "$metrics_tmp"
    fi
    echo "app_response_time_seconds{app=\"$app_name\"} $avg_response_time" >> "$metrics_tmp"

    # Request volume. grep -c already prints 0 when nothing matches (while
    # exiting 1), so no "|| echo 0" fallback is used: it would add a second
    # "0" line to the sample.
    request_count=$(grep -c "request" "$log_file" 2>/dev/null)
    if ! grep -q '^# TYPE app_requests_total ' "$metrics_tmp" 2>/dev/null; then
        echo "# HELP app_requests_total Total number of requests" >> "$metrics_tmp"
        echo "# TYPE app_requests_total counter" >> "$metrics_tmp"
    fi
    echo "app_requests_total{app=\"$app_name\"} ${request_count:-0}" >> "$metrics_tmp"
}

# 日志轮转检查
function check_log_rotation() {
    local current_time=$(date +"%Y-%m-%d %H:%M:%S")
    
    # 检查大文件
    find /var/log -name "*.log" -size +100M 2>/dev/null | while read large_file; do
        local file_size=$(du -h "$large_file" | cut -f1)
        echo "[$current_time] [WARNING] 发现大日志文件: $large_file ($file_size)" >> $ALERT_LOG
    done
    
    # 检查磁盘空间
    local log_disk_usage=$(df /var/log | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
    if [ $log_disk_usage -gt 80 ]; then
        echo "[$current_time] [WARNING] 日志分区磁盘使用率过高: ${log_disk_usage}%" >> $ALERT_LOG
        send_app_alert "WARNING" "

版权所有,如有侵权请联系我