node-exporter, prometheus, grafana

wangdx大约 5 分钟

# Create the compose project directory and start the stack.
# mkdir -p keeps the step re-runnable and also creates missing parents.
# NOTE(review): assumes a docker-compose.yml is placed in this directory;
# it is not shown in these notes.
mkdir -p /www/data/compose/prometheus
cd /www/data/compose/prometheus

docker-compose up -d --build

(1)访问URL：http://IP地址:9090/graph
prometheus
http://192.168.16.8:9090/graph
http://192.168.16.8:9090/targets
node_exporter
http://192.168.16.8:9100/metrics

NodeExporter

# Working directory for node_exporter, under the same /www/data/prometheus
# tree that the Prometheus unit's --storage.tsdb.path uses.
mkdir -p /www/data/prometheus/node
cd /www/resource
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
ll
# Unpack the release and move it to a version-independent path
# (this is a rename, not a symlink).
tar xzvf /www/resource/node_exporter-1.7.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/node_exporter-1.7.0.linux-amd64/ /usr/local/node_exporter
# Create the systemd unit, then enable and start the service.
vim /usr/lib/systemd/system/node_exporter.service
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter
systemctl status node_exporter
# Confirm the listener is up, then open its port in the firewall.
netstat -nptl
ufw allow 9102

http://192.168.6.208:9102

# /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target

[Service]
# NOTE(review): runs as root; confirm whether a dedicated unprivileged user
# would be sufficient for the collectors in use.
User=root
# Listen on :9102 and skip pseudo/ephemeral mount points in the filesystem
# collector ("sys" is listed once; the repeated alternatives were redundant).
ExecStart=/usr/local/node_exporter/node_exporter \
--web.listen-address=:9102 \
--collector.filesystem.mount-points-exclude="^/(dev|proc|run|boot|run/credentials/.+|sys|data/kubelet/.+|data/docker.+|var/lib/.+)($|/)"
# "syslog" is an obsolete value for these settings; current systemd warns and
# falls back to the journal, so request the journal explicitly.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=node_exporter
TimeoutStopSec=10
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target

prometheus

# Binary installation
# Create the TSDB directory and download the release tarball. The directory
# must match --storage.tsdb.path in prometheus.service
# (/www/data/prometheus/data).
mkdir -p /www/data/prometheus/data
cd /www/resource
wget https://github.com/prometheus/prometheus/releases/download/v2.50.1/prometheus-2.50.1.linux-amd64.tar.gz
ll
# Unpack the release and move it to a version-independent path
# (this is a rename, not a symlink).
tar xzvf /www/resource/prometheus-2.50.1.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/prometheus-2.50.1.linux-amd64/ /usr/local/prometheus
# Edit the configuration file, then validate it.
vim /usr/local/prometheus/prometheus.yml
/usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml
# Create the systemd unit, then enable and start the service.
vim /usr/lib/systemd/system/prometheus.service
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus
systemctl status prometheus
# Confirm the listener is up, then open its port in the firewall.
netstat -nptl
ufw allow 9101

http://192.168.6.208:9101

# /usr/local/prometheus/prometheus.yml
global:
  # Scrape targets every 15 seconds (the built-in default is 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the built-in default is 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

scrape_configs:
  # Prometheus scraping itself on the port it is configured to listen on.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9101"

  # node_exporter on this host; the instance label is overridden with a
  # fixed name.
  - job_name: "node"
    static_configs:
      - targets:
          - "localhost:9102"
        labels:
          instance: microboot-producer-node

  # Application metrics, scraped more often than the global interval.
  - job_name: "microboot"
    metrics_path: "/actuator/prometheus"
    scrape_interval: 10s
    scrape_timeout: 5s
    static_configs:
      - targets:
          - "localhost:9090"

# /usr/lib/systemd/system/prometheus.service
[Unit]
Description=prometheus server daemon

[Service]
User=root
Restart=on-failure
RestartSec=5
TimeoutStopSec=10
# One flag per line: config file, TSDB location, listen address, and the
# HTTP lifecycle endpoints.
# NOTE(review): --web.enable-lifecycle together with 0.0.0.0 exposes those
# endpoints to the network - confirm this is intended.
ExecStart=/usr/local/prometheus/prometheus \
    --config.file=/usr/local/prometheus/prometheus.yml \
    --storage.tsdb.path=/www/data/prometheus/data \
    --web.listen-address=0.0.0.0:9101 \
    --web.enable-lifecycle

[Install]
WantedBy=multi-user.target

grafana

# Binary installation
# Create a working directory under the /www/data/prometheus tree (the same
# spelling the Prometheus unit uses) and download the release tarball.
mkdir -p /www/data/prometheus/grafana
cd /www/resource
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-10.3.3.linux-amd64.tar.gz
ll
# Unpack the release and move it to a version-independent path
# (this is a rename, not a symlink).
tar xzvf /www/resource/grafana-enterprise-10.3.3.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/grafana-enterprise-10.3.3.linux-amd64/ /usr/local/grafana
# Change the listen port: set http_port to 9103.
vim /usr/local/grafana/conf/defaults.ini
# Create the systemd unit, then enable and start the service.
vim /usr/lib/systemd/system/grafana.service
systemctl daemon-reload
systemctl enable grafana
systemctl start grafana
systemctl status grafana
# Confirm the listener is up, then open its port in the firewall.
netstat -nptl
ufw allow 9103

http://192.168.6.208:9103

# /usr/lib/systemd/system/grafana.service
[Unit]
Description=Grafana Service

[Service]
User=root
Restart=on-failure
RestartSec=5
TimeoutStopSec=10
# Start the server with the edited defaults.ini and the install directory as
# its home path.
ExecStart=/usr/local/grafana/bin/grafana-server \
    -config /usr/local/grafana/conf/defaults.ini \
    -homepath /usr/local/grafana

[Install]
WantedBy=multi-user.target

dashboard

Grafana 仪表盘 ID（可在 Grafana 的 Import 页面按 ID 导入）：8919 7991 11074 12633

alertmanager

# Binary installation
# Create a working directory for Alertmanager (the original created the
# grafana directory again) and download the release tarball.
mkdir -p /www/data/prometheus/alertmanager
cd /www/resource
wget https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz
ll
# Unpack the release and move it to a version-independent path
# (this is a rename, not a symlink).
tar xzvf /www/resource/alertmanager-0.27.0.linux-amd64.tar.gz -C /usr/local/
mv /usr/local/alertmanager-0.27.0.linux-amd64/ /usr/local/alertmanager
# Edit the configuration, then validate it.
vim /usr/local/alertmanager/alertmanager.yml
/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml

# Alert rule files live in the Prometheus rules directory.
mkdir -p /usr/local/prometheus/rules
vim /usr/local/prometheus/rules/microboot-acutator-rule.yml
# Create the systemd unit, then enable and start the service.
vim /usr/lib/systemd/system/alertmanager.service
systemctl daemon-reload
systemctl enable alertmanager
systemctl start alertmanager
systemctl status alertmanager
# Confirm the listener is up, then open its port in the firewall.
# The unit sets no --web.listen-address, so Alertmanager listens on its
# default port 9093 (9103 belongs to Grafana).
netstat -nptl
ufw allow 9093

http://192.168.6.208:9093

# /usr/local/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  # NOTE(review): port 25 with smtp_require_tls: false sends the SMTP login
  # unencrypted - confirm whether the provider's TLS port can be used instead.
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '784420216@qq.com'
  smtp_auth_username: '784420216@qq.com'
  # The SMTP authorization code is read from a file so the secret is not
  # stored in this config. Create the file with mode 0600, containing only
  # the code.
  smtp_auth_password_file: '/usr/local/alertmanager/smtp_auth_password'
  smtp_require_tls: false

# All alerts go to the single e-mail receiver, grouped by alert name.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'mail'
receivers:
- name: 'mail'
  email_configs:
  # The closing quote was missing here, which made the file invalid YAML.
  - to: '784420216@qq.com'

# /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager Service

[Service]
User=root
# --storage.path is set explicitly: without it Alertmanager stores silences
# and the notification log in "data/" relative to the working directory,
# which for a system service is "/".
# No --web.listen-address is given, so the default port 9093 is used.
ExecStart=/usr/local/alertmanager/alertmanager \
	--config.file=/usr/local/alertmanager/alertmanager.yml \
	--storage.path=/www/data/prometheus/alertmanager
TimeoutStopSec=10
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target

# Additions to /usr/local/prometheus/prometheus.yml.
# Send fired alerts to the Alertmanager instance.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - microboot-prometheus:9093

# Load every rule file in the rules directory.
rule_files:
  - "rules/*.yml"

# /usr/local/prometheus/rules/microboot-acutator-rule.yml
# Availability alert for the "microboot" scrape job.
groups:
  - name: microboot.actuator.rules
    rules:
      - alert: MicrobootInstanceDown
        # Fires once the target has been reported down for a full minute.
        expr: up{job="microboot"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "运行在 {{ $labels.instance }} 主机中的 {{ $labels.job }} 微服务已经关闭了！"
          description: "微服务 {{ $labels.instance }} 关闭"
# /usr/local/prometheus/rules/microboot-node.yml
# Host resource alerts (CPU, memory, filesystem) built on node_exporter
# metrics.
groups:
- name: microboot.node.rules
  rules:
  # CPU: 100 minus the average idle percentage per instance, above 80%.
  - alert: NodeCPUUsage
    expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "微服务运行主机 {{ $labels.instance }} 中的CPU使用率过高"
      description: "微服务运行主机 {{ $labels.instance }} 中的CPU使用大于80%，当前值: “{{ $value }}”"
  # Memory: free + cached + buffers counted as available, usage above 80%.
  - alert: NodeMemoryUsage
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "微服务运行主机 {{ $labels.instance }} 中的内存使用率过高"
      description: "微服务运行主机 {{ $labels.instance }} 内存使用大于 80%，当前值: {{ $value }}"
  # Filesystem: ext4/xfs usage above 90%. The description states the same
  # 90% threshold as the expression (it previously said 80%).
  - alert: NodeFilesystemUsage
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 90
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "微服务运行主机 {{ $labels.instance }}中的“{{ $labels.mountpoint }}” 分区使用过高"
      description: "微服务运行主机 {{ $labels.instance }} 中 {{ $labels.mountpoint }} 分区使用大于90%，当前值: {{ $value }}"

# Host-level alert rules built on node_exporter metrics.
# - job matchers use "node", the job_name under which node_exporter is
#   scraped in prometheus.yml; "node_exporter" matched no series.
# - CPU and load alerts are written against raw metrics, because the
#   recording rules they referenced are not defined in any rule file here.
# - Messages use the "instance" label; these series carry no "hostname"
#   label.
groups:
  - name: node_exporter
    rules:
      # Filesystem space used above 90% on xfs/ext4.
      - alert: 磁盘空间使用率告警
        expr: >-
          100
          - (node_filesystem_avail_bytes{fstype=~"xfs|ext4",job="node"}
          / node_filesystem_size_bytes{fstype=~"xfs|ext4",job="node"}) * 100
          > 90
        for: 10m
        labels:
          severity: critical
          opsalertname: 磁盘空间使用率告警
        annotations:
          summary: "磁盘使用率告警"
          description: |
            磁盘使用: {{ $labels.mountpoint }}分区磁盘使用率{{ $value | humanize }} %, 大于告警阈值90%

      # node_exporter target unreachable.
      - alert: 服务器宕机告警
        expr: up{job="node"} == 0
        for: 30s
        labels:
          severity: critical
          opsalertname: 服务器宕机告警
        annotations:
          summary: "服务器宕机"
          description: |
            主机: {{ $labels.instance }}服务器宕机

      # Inode usage above 80% on ext4/xfs.
      - alert: 磁盘inode使用率告警
        expr: 100 - (node_filesystem_files_free{job="node",fstype=~"ext4|xfs"} / node_filesystem_files{job="node",fstype=~"ext4|xfs"}) * 100  > 80
        for: 15m
        labels:
          severity: critical
          opsalertname: 磁盘inode使用率告警
        annotations:
          summary: "磁盘Inode告警"
          description: |
            Inode使用: {{ $value | humanize }} %, 大于告警阈值80%

      # Memory usage above 90%, based on MemAvailable.
      - alert: 内存使用率告警
        expr: 100 - ( node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes ) * 100 > 90
        for: 5m
        labels:
          severity: critical
          opsalertname: 内存使用率告警
        annotations:
          summary: "内存使用率告警"
          description: |
            内存使用率告警 {{ $value | humanize }} %, 大于告警阈值90%

      # CPU utilisation above 85%: one minus the mean idle fraction across
      # all CPUs of an instance.
      - alert: cpu使用率告警
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 85
        for: 5m
        labels:
          severity: critical
          opsalertname: cpu使用率告警
        annotations:
          summary: "cpu使用率大于85%"
          description: |
            主机 {{ $labels.instance }} 的cpu使用率为 {{ $value | humanize }}%

      # 5-minute load above the CPU count; the count is derived from the
      # per-CPU idle series and matched to the load series on "instance".
      - alert: cpu负载告警
        expr: node_load5 > on (instance) count by (instance) (node_cpu_seconds_total{mode="idle"})
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: CPU负载告警
        annotations:
          summary: "cpu负载大于Cores"
          description: |
            主机5min {{ $labels.instance }} 的cpu负载为 {{ $value }}

      # Inbound traffic is what the host receives, so this uses the receive
      # counter (bytes/s converted to Mbit/s).
      - alert: 主机入方向流量告警
        expr: avg(irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37|ens18|eno1|eno2|eno3|eno4"}[1m])) by (environment,instance,device,cluster,app) /1024/1024 * 8 > 500
        for: 15m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机入方向流量告警
        annotations:
          summary: "主机入方向15min流量大于 (> 500 Mb/s)"
          description: |
            主机网卡：{{ $labels.device }}入方向15min流量大于 (> 500 Mb/s), 当前值: {{ $value | humanize }} Mb/s

      # Outbound traffic is what the host transmits, so this uses the
      # transmit counter; the description states the same 500 Mb/s threshold.
      - alert: 主机出方向流量告警
        expr: avg(irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37|ens8|eno1|eno2|eno3|eno4"}[1m])) by (environment,instance,device,cluster,app) /1024/1024 * 8 > 500
        for: 15m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机出方向流量告警
        annotations:
          summary: "主机出方向15min流量大于 (> 500 Mb/s)"
          description: |
            主机网卡：{{ $labels.device }}出方向15min流量大于 (> 500 Mb/s), 当前值: {{ $value | humanize }} Mb/s

      # Aggregate disk read throughput above 100 MB/s.
      - alert: 主机磁盘读告警
        expr: sum by (instance,cluster,app) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机磁盘读告警
        annotations:
          summary: 主机磁盘读取速度大于100 MB/s告警
          description: |
            主机磁盘读读取速度大于100 MB/s告警, 当前值 {{ $value | humanize }}  MB/s

      # Aggregate disk write throughput above 100 MB/s.
      - alert: 主机磁盘写告警
        expr: sum by (instance,cluster,app) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: critical
          level: "2"
          opsalertname: 主机磁盘写告警
        annotations:
          summary: 主机磁盘写入速度大于 100 MB/s告警
          description: |
            主机磁盘写入速度大于 100 MB/s, 当前值{{ $value | humanize }} MB/s

node-exporter, prometheus, grafana

参考网址

NodeExporter

prometheus

grafana

alertmanager