node-exporter, prometheus, grafana
大约 5 分钟
参考网址
docker-compose 安装 node-exporter, prometheus, grafana
基于 Docker 搭建 node-exporter+Prometheus+Grafana 服务器监控平台
ubuntu 22.04 配置 Prometheus 和 Grafana 服务器监控
# Create a working directory for the Compose project and enter it.
cd /www/data/compose/
mkdir prometheus
cd prometheus
# Build (if needed) and start the stack in the background.
# NOTE(review): this expects a docker-compose.yml in the current directory; none is shown in this document.
docker-compose up -d --build
(1)访问URL:http://IP地址:9090/graph
prometheus
http://192.168.16.8:9090/graph
http://192.168.16.8:9090/targets
node_exporter
http://192.168.16.8:9100/metrics
NodeExporter
# Create the node_exporter working directory (spelling fixed: "prometheus")
# and download the release tarball.
mkdir -p /www/data/prometheus/node
cd /www/resource
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
ll
# Unpack under /usr/local and rename the directory to a version-independent
# path (this is a plain rename with mv, not a symlink).
tar xzvf /www/resource/node_exporter-1.7.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/node_exporter-1.7.0.linux-amd64/ /usr/local/node_exporter
# Create the systemd unit (its contents are the [Unit]/[Service]/[Install]
# listing that follows), then register and start the service.
vim /usr/lib/systemd/system/node_exporter.service
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter
systemctl status node_exporter
# Confirm the exporter is listening, then open its port (9102) in the firewall.
netstat -nptl
ufw allow 9102
http://192.168.6.208:9102
# systemd unit that runs node_exporter as a long-lived service.
[Unit]
Description=Node Exporter
# Pull in and order after network-online.target.
Wants=network-online.target
After=network-online.target

[Service]
User=root
# Listen on :9102 and pass the mount-point exclude regex to the filesystem collector.
ExecStart=/usr/local/node_exporter/node_exporter \
    --web.listen-address=:9102 \
    --collector.filesystem.mount-points-exclude="^/(dev|proc|run|boot|run/credentials/.+|sys|data/kubelet/.+|sys|data/docker.+|sys|var/lib/.+)($|/)"
# Log to the journal: the legacy "syslog" value is deprecated in current
# systemd releases and only produces a warning before falling back to journal.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=node_exporter
TimeoutStopSec=10
# Restart after a failure, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
prometheus
# Binary installation
# Create the TSDB data directory and download the release tarball.
# The path must match --storage.tsdb.path in prometheus.service
# (/www/data/prometheus/data); the original misspelled it as "promethues".
mkdir -p /www/data/prometheus/data
cd /www/resource
wget https://github.com/prometheus/prometheus/releases/download/v2.50.1/prometheus-2.50.1.linux-amd64.tar.gz
ll
# Unpack under /usr/local and rename the directory to a version-independent
# path (this is a plain rename with mv, not a symlink).
tar xzvf /www/resource/prometheus-2.50.1.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/prometheus-2.50.1.linux-amd64/ /usr/local/prometheus
# Edit the configuration file and validate it with promtool.
vim /usr/local/prometheus/prometheus.yml
/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml
# Create the systemd unit, then register and start the service.
vim /usr/lib/systemd/system/prometheus.service
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus
systemctl status prometheus
# Confirm Prometheus is listening, then open its port (9101) in the firewall.
netstat -nptl
ufw allow 9101
http://192.168.6.208:9101
# /usr/local/prometheus/prometheus.yml
# Nesting restored: without indentation the per-job keys end up at the top
# level and the file is not a valid Prometheus configuration.
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

scrape_configs:
  # Prometheus scraping itself; 9101 is the port set in prometheus.service.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9101"]

  # node_exporter on this host; the instance label is overridden explicitly.
  - job_name: "node"
    static_configs:
      - targets: ["localhost:9102"]
        labels:
          instance: microboot-producer-node

  # Application metrics served under /actuator/prometheus, with a tighter
  # interval and timeout than the global defaults.
  - job_name: "microboot"
    scrape_interval: 10s
    scrape_timeout: 5s
    metrics_path: "/actuator/prometheus"
    static_configs:
      - targets: ["localhost:9090"]
# systemd unit for the Prometheus server.
[Unit]
Description=prometheus server daemon

[Service]
User=root
# One flag per line: config file, TSDB directory, listen address, and the
# HTTP lifecycle endpoints.
ExecStart=/usr/local/prometheus/prometheus \
    --config.file=/usr/local/prometheus/prometheus.yml \
    --storage.tsdb.path=/www/data/prometheus/data \
    --web.listen-address=0.0.0.0:9101 \
    --web.enable-lifecycle
TimeoutStopSec=10
# Restart after a failure, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
grafana
# Binary installation
# Create the Grafana working directory (spelling fixed: "prometheus")
# and download the release tarball.
mkdir -p /www/data/prometheus/grafana
cd /www/resource
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-10.3.3.linux-amd64.tar.gz
ll
# Unpack under /usr/local and rename the directory to a version-independent
# path (this is a plain rename with mv, not a symlink).
tar xzvf /www/resource/grafana-enterprise-10.3.3.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/grafana-enterprise-10.3.3.linux-amd64/ /usr/local/grafana
# Change the listening port: set http_port to 9103.
vim /usr/local/grafana/conf/defaults.ini
# Create the systemd unit, then register and start the service.
vim /usr/lib/systemd/system/grafana.service
systemctl daemon-reload
systemctl enable grafana
systemctl start grafana
systemctl status grafana
# Confirm Grafana is listening, then open its port (9103) in the firewall.
netstat -nptl
ufw allow 9103
http://192.168.6.208:9103
# systemd unit for the Grafana server.
[Unit]
Description=Grafana Service

[Service]
User=root
# Start Grafana with an explicit config file and home path.
ExecStart=/usr/local/grafana/bin/grafana-server \
    -config /usr/local/grafana/conf/defaults.ini \
    -homepath /usr/local/grafana
TimeoutStopSec=10
# Restart after a failure, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
dashboard
8919 7991 11074 12633
alertmanager
# Binary installation
# Create the Alertmanager working directory (the original line was copied
# from the Grafana section and created the grafana directory again).
mkdir -p /www/data/prometheus/alertmanager
cd /www/resource
wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
ll
# Unpack under /usr/local and rename the directory to a version-independent
# path (this is a plain rename with mv, not a symlink).
tar xzvf /www/resource/alertmanager-0.27.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/alertmanager-0.27.0.linux-amd64/ /usr/local/alertmanager
# Edit the Alertmanager configuration and validate it with amtool.
vim /usr/local/alertmanager/alertmanager.yml
/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml
# Create the Prometheus rules directory and the first rule file.
mkdir -p /usr/local/prometheus/rules
vim /usr/local/prometheus/rules/microboot-acutator-rule.yml
# Create the systemd unit, then register and start the service.
vim /usr/lib/systemd/system/alertmanager.service
systemctl daemon-reload
systemctl enable alertmanager
systemctl start alertmanager
systemctl status alertmanager
# Confirm Alertmanager is listening, then open its port in the firewall.
# The unit below sets no --web.listen-address, so Alertmanager uses its
# default port 9093 (the same port Prometheus targets in the alerting
# section); 9103 belongs to Grafana.
netstat -nptl
ufw allow 9093
http://192.168.6.208:9093
# /usr/local/alertmanager/alertmanager.yml
# Nesting restored and the unterminated quote on the recipient address closed.
global:
  resolve_timeout: 5m
  # SMTP relay and sender used for e-mail notifications.
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '784420216@qq.com'
  smtp_auth_username: '784420216@qq.com'
  # Never publish a real credential: put your own SMTP authorization code here.
  smtp_auth_password: '<your-smtp-authorization-code>'
  # NOTE(review): with TLS disabled the credential is sent unencrypted to
  # port 25 - confirm this is acceptable for your network.
  smtp_require_tls: false

# Every alert is grouped by alert name and routed to the single receiver.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'mail'

receivers:
  - name: 'mail'
    email_configs:
      - to: '784420216@qq.com'
# systemd unit for Alertmanager.
[Unit]
Description=alertmanager Service

[Service]
User=root
# Only the config file is passed; no listen address is set here.
ExecStart=/usr/local/alertmanager/alertmanager \
    --config.file=/usr/local/alertmanager/alertmanager.yml
TimeoutStopSec=10
# Restart after a failure, waiting 5 seconds between attempts.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
# Additions to prometheus.yml: where to send alerts and which rule files to load.
# Nesting restored; the flattened form does not parse into this structure.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): this host name must resolve to the machine running
            # Alertmanager - confirm, or use its IP address.
            - microboot-prometheus:9093

# The path is relative (rules/*.yml); the rule files in this guide are
# created under /usr/local/prometheus/rules.
rule_files:
  - "rules/*.yml"
# File: /usr/local/prometheus/rules/microboot-acutator-rule.yml
# Nesting restored so the file parses as a Prometheus rule file.
groups:
  - name: microboot.actuator.rules
    rules:
      # Fires when the "microboot" scrape target has been down for 1 minute.
      - alert: MicrobootInstanceDown
        expr: up{job="microboot"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "微服务 {{ $labels.instance }} 关闭"
          summary: "运行在 {{ $labels.instance }} 主机中的 {{ $labels.job }} 微服务已经关闭了!"
# File: /usr/local/prometheus/rules/microboot-node.yml
# Nesting restored so the file parses as a Prometheus rule file.
groups:
  - name: microboot.node.rules
    rules:
      # CPU: 100 minus the average idle percentage per instance, above 80 for 2 minutes.
      - alert: NodeCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "微服务运行主机 {{ $labels.instance }} 中的CPU使用率过高"
          description: "微服务运行主机 {{ $labels.instance }} 中的CPU使用大于80%,当前值: “{{ $value }}”"

      # Memory: free + cached + buffers counted as unused; used share above 80 for 2 minutes.
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "微服务运行主机 {{ $labels.instance }} 中的内存使用率过高"
          description: "微服务运行主机 {{ $labels.instance }} 内存使用大于 80%,当前值: {{ $value }}"

      # Filesystem: ext4/xfs usage above 90 for 2 minutes. The description
      # previously said 80%, which contradicted the expression.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "微服务运行主机 {{ $labels.instance }}中的“{{ $labels.mountpoint }}” 分区使用过高"
          description: "微服务运行主机 {{ $labels.instance }} 中 {{ $labels.mountpoint }} 分区使用大于90%,当前值: {{ $value }}"
# Host-level alert rules for node_exporter metrics.
# Nesting restored so the file parses as a Prometheus rule file.
# NOTE(review): several selectors use job="node_exporter", while the scrape
# config in this guide names the job "node" - align one with the other,
# otherwise those alerts never match.
# NOTE(review): instance:node_cpu_utilization:ratio and instance:node_cpus:count
# are recording rules that are not defined anywhere in this guide, and the
# "hostname" label is not set by the scrape config shown - confirm they exist.
groups:
  - name: node_exporter
    rules:
      # Disk space: used share of xfs/ext4 filesystems above 90% for 10 minutes.
      - alert: 磁盘空间使用率告警
        expr: >-
          100
          - (node_filesystem_avail_bytes{fstype=~"xfs|ext4",job="node_exporter"}
          / node_filesystem_size_bytes{fstype=~"xfs|ext4",job="node_exporter"}) * 100
          > 90
        for: 10m
        labels:
          severity: critical
          opsalertname: 磁盘空间使用率告警
        annotations:
          summary: "磁盘使用率告警"
          description: |
            磁盘使用: {{ $labels.mountpoint }}分区磁盘使用率{{ $value | humanize }} %, 大于告警阈值90%

      # Host down: the scrape target has been unreachable for 30 seconds.
      - alert: 服务器宕机告警
        expr: up{job="node_exporter"} == 0
        for: 30s
        labels:
          severity: critical
          opsalertname: 服务器宕机告警
        annotations:
          summary: "服务器宕机"
          description: |
            主机: {{ $labels.instance }}服务器宕机

      # Inodes: used share above 80% for 15 minutes.
      - alert: 磁盘inode使用率告警
        expr: 100 - (node_filesystem_files_free{job="node_exporter",fstype=~"ext4|xfs"} / node_filesystem_files{job="node_exporter",fstype=~"ext4|xfs"}) * 100 > 80
        for: 15m
        labels:
          severity: critical
          opsalertname: 磁盘inode使用率告警
        annotations:
          summary: "磁盘Inode告警"
          description: |
            Inode使用: {{ $value | humanize }} %, 大于告警阈值80%

      # Memory: 100 minus the MemAvailable share, above 90% for 5 minutes.
      - alert: 内存使用率告警
        expr: 100 - ( node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes ) * 100 > 90
        for: 5m
        labels:
          severity: critical
          opsalertname: 内存使用率告警
        annotations:
          summary: "内存使用率告警"
          description: |
            内存使用率告警 {{ $value | humanize }} %, 大于告警阈值90%

      # CPU utilisation above 85% for 5 minutes (uses a recording rule, see note at top).
      - alert: cpu使用率告警
        expr: instance:node_cpu_utilization:ratio * 100 > 85
        for: 5m
        labels:
          severity: critical
          opsalertname: cpu使用率告警
        annotations:
          summary: "cpu使用率大于85%"
          description: |
            主机 {{ $labels.hostname }} 的cpu使用率为 {{ $value | humanize }}%

      # 5-minute load average above the CPU count (uses a recording rule, see note at top).
      - alert: cpu负载告警
        expr: node_load5 > instance:node_cpus:count
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: CPU负载告警
        annotations:
          summary: "cpu负载大于Cores"
          description: |
            "主机5min {{ $labels.hostname }} 的cpu负载为 {{ $value }}"

      # Inbound traffic: bytes RECEIVED by the NIC, converted to Mb/s.
      # The original used the transmit counter here, i.e. measured outbound traffic.
      # NOTE(review): this device list has ens18 while the outbound rule has
      # ens8 - confirm which interface name is intended.
      - alert: 主机入方向流量告警
        expr: avg(irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37|ens18|eno1|eno2|eno3|eno4"}[1m])) by (environment,instance,device,cluster,app) /1024/1024 * 8 > 500
        for: 15m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机入方向流量告警
        annotations:
          summary: "主机入方向15min流量大于 (> 500 Mb/s)"
          description: |
            "主机网卡:{{ $labels.device }}入方向15min流量大于 (> 500 Mb/s), 当前值: {{ $value | humanize }} Mb/s"

      # Outbound traffic: bytes TRANSMITTED by the NIC, converted to Mb/s.
      # The original used the receive counter here, and its description quoted
      # 180 Mb/s although the threshold is 500.
      - alert: 主机出方向流量告警
        expr: avg(irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37|ens8|eno1|eno2|eno3|eno4"}[1m]))by (environment,instance,device,cluster,app) /1024/1024 * 8 > 500
        for: 15m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机出方向流量告警
        annotations:
          summary: "主机出方向15min流量大于 (> 500 Mb/s)"
          description: |
            "主机网卡:{{ $labels.device }}出方向15min流量大于 (> 500 Mb/s), 当前值: {{ $value | humanize }} Mb/s"

      # Disk reads: summed read rate above 100 MB/s for 5 minutes.
      - alert: 主机磁盘读告警
        expr: sum by (instance,cluster,app) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机磁盘读告警
        annotations:
          summary: 主机磁盘读取速度大于100 MB/s告警
          description: |
            "主机磁盘读读取速度大于100 MB/s告警, 当前值 {{ $value | humanize }} MB/s"

      # Disk writes: summed write rate above 100 MB/s for 5 minutes.
      - alert: 主机磁盘写告警
        expr: sum by (instance,cluster,app) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机磁盘写告警
        annotations:
          summary: 主机磁盘写入速度大于 100 MB/s告警
          description: |
            "主机磁盘写入速度大于 100 MB/s, 当前值{{ $value | humanize }} MB/s"