一.安装Prometheus(二进制)
1.1 环境准备
主机名 | WAN IP | 角色 | 应用 |
---|---|---|---|
master-1 | 10.0.0.201 | prometheus服务端 | prometheus、node_exporter、grafana、pushgateway、alertmanager |
node-1 | 10.0.0.202 | prometheus客户端 | node_exporter |
node-2 | 10.0.0.203 | prometheus客户端 | node_exporter |
#下载安装包
https://github.com/prometheus/
1.2 配置Prometheus(bin)
Prometheus
# 1.创建安装目录
[root@master-1 prometheus]# mkdir /app
# 2.解压prometheus
[root@master-1 prometheus]# tar xf prometheus-2.49.0-rc.1.linux-amd64.tar.gz -C /app/
# 3.改名
[root@master-1 prometheus]# mv /app/prometheus-2.49.0-rc.1.linux-amd64 /app/prometheus-2.49.0
# 4.软链接
[root@master-1 prometheus]# ln -s /app/prometheus-2.49.0 /app/prometheus
# 5.启动prometheus
[root@master-1 prometheus]# ./prometheus --config.file=./prometheus.yml
node_exporter
# 1.下载
# 2.解压
[root@master-1 prometheus]# tar xf node_exporter-1.7.0.linux-amd64.tar.gz -C /app/
# 3.改名
[root@master-1 prometheus]# mv /app/node_exporter-1.7.0.linux-amd64 /app/node_exporter-1.7.0
# 4.软链接
[root@master-1 prometheus]# ln -s /app/node_exporter-1.7.0 /app/node_exporter
# 5.启动
[root@master-1 node_exporter]# ./node_exporter
cadvisor
# Start cAdvisor in the background with read-only views of the host root,
# /sys and the Docker state directory, publishing its web/metrics port 8080.
# The google/cadvisor repository on Docker Hub is no longer updated; the
# project publishes images under gcr.io/cadvisor/cadvisor instead.
# NOTE(review): v0.47.2 is pinned for reproducibility - confirm the release
# you want to run.
docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:rw \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker:/var/lib/docker:ro \
  --publish=8080:8080 \
  --detach=true \
  --name=cadvisor \
  gcr.io/cadvisor/cadvisor:v0.47.2
使用动态发现关联到Prometheus
pushgateway
# 1.下载
# 2.解压
[root@master-1 prometheus]# tar xf pushgateway-1.6.2.linux-amd64.tar.gz -C /app/
# 3.改名
[root@master-1 prometheus]# mv /app/pushgateway-1.6.2.linux-amd64 /app/pushgateway-1.6.2
# 4.软链接
[root@master-1 prometheus]# ln -s /app/pushgateway-1.6.2 /app/pushgateway
#5.查看端口
[root@master-1 prometheus]# netstat -lntup|grep 9091
tcp6 0 0 :::9091 :::* LISTEN 54931/pushgateway
alertmanager告警
[root@master-1 prometheus]# tar xf alertmanager-0.26.0.linux-amd64.tar.gz -C /app/
[root@master-1 prometheus]# mv /app/alertmanager-0.26.0.linux-amd64 /app/alertmanager-0.26.0
[root@master-1 prometheus]# ln -s /app/alertmanager-0.26.0 /app/alertmanager
直接覆盖原有的配置文件
[root@master-1 alertmanager]# vim alertmanager.yml
# Alertmanager configuration: every alert is grouped by alert name and mailed
# to a single mailbox through the QQ SMTP service.
global:
  resolve_timeout: 5m
  smtp_from: '2825916659@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '2825916659@qq.com'
  # Placeholder value - replace with the real SMTP credential before use.
  smtp_auth_password: 'bpfsdvmnpxxxxxx'
  # STARTTLS is switched off.
  # NOTE(review): port 465 is normally the implicit-TLS SMTP port - confirm
  # with the mail provider.
  smtp_require_tls: false
  smtp_hello: 'qq.com'
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
  - name: 'email'
    email_configs:
      - to: '2825916659@qq.com'
        # Also send a mail when the alert resolves.
        send_resolved: true
inhibit_rules:
  # While a 'critical' alert is firing, mute 'warning' alerts that carry the
  # same alertname, dev and instance labels.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
[root@master-1 alertmanager]# supervisorctl restart alertmanager
alertmanager关联prometheus
[root@master-1 prometheus]# vim prometheus.yml
# Send firing alerts to the Alertmanager on master-1 (10.0.0.201:9093).
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 10.0.0.201:9093
# Alerting rules are loaded from this file; the path is relative.
rule_files:
  - "first_rules.yml"
[root@master-1 prometheus]# vim first_rules.yml
# Alerting rule: fire when the docker_runtime gauge reported for the container
# named "cadvisor" has been 0 for at least 15 seconds.
groups:
  - name: cadvisor-run
    rules:
      - alert: lyx-cadvisor-run-1
        expr: docker_runtime{name='cadvisor'} == 0
        for: 15s
        labels:
          # Label values are strings; quoted so the value is not read as a number.
          severity: "1"
          team: node
        annotations:
          summary: "{{ $labels.instance }} has stopped for more than 15s!"
[root@master-1 prometheus]# supervisorctl restart prometheus
1.3 Prometheus动态发现
使用动态发现,可以关联各应用到Prometheus,且改动配置自动应用,无需重启Prometheus
vim prometheus.yml
# Prometheus server configuration using file-based service discovery: targets
# listed in discovery.yaml are re-read every 5 seconds.
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
scrape_configs:
  # Prometheus scraping its own /metrics endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets: ["10.0.0.201:9090"]
  # Everything else comes from the discovery file below.
  - job_name: "file_sd"
    file_sd_configs:
      - files:
          - /app/prometheus/discovery.yaml
        refresh_interval: 5s
vim discovery.yaml
# Target groups read by the "file_sd" scrape job. Each group attaches a custom
# "job_name" label to its targets.

# node_exporter on all three hosts
- targets:
    - "10.0.0.201:9100"
    - "10.0.0.202:9100"
    - "10.0.0.203:9100"
  labels:
    job_name: "node_exporter"

# cAdvisor on all three hosts
- targets:
    - "10.0.0.201:8080"
    - "10.0.0.202:8080"
    - "10.0.0.203:8080"
  labels:
    job_name: "cAdVisor"

# Pushgateway on master-1
- targets:
    - "10.0.0.201:9091"
  labels:
    job_name: "pushgateway"
二.supervisor管理进程
使用supervisor可以一次性管理多个Prometheus应用
#1.安装supervisor
[root@master-1 ~]# yum install -y supervisor
# 2.启动并加入开机自启
[root@master-1 ~]# systemctl enable supervisord
[root@master-1 ~]# systemctl start supervisord
# 3.编写服务启动配置文件
[root@master-1 ~]# vim /etc/supervisord.d/prome.ini
; supervisord program definitions for the Prometheus stack on master-1.
; "directory" is the supervisor option that sets the working directory before
; the command is started (the previous key "direct" is not a supervisor
; option, so no working directory was being set).
; NOTE(review): relative paths used by a program (for example Prometheus'
; default data directory) will now resolve under the directory given here.

[program:prometheus]
directory=/app/prometheus/
command=/bin/bash -c "/app/prometheus/prometheus --config.file=/app/prometheus/prometheus.yml"
autostart=true
autorestart=true
stdout_logfile=/var/log/prome_stdout.log
stderr_logfile=/var/log/prome_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
; Signal the whole process group so children of the bash wrapper stop too.
stopasgroup=true
killasgroup=true

[program:node_exporter]
directory=/app/node_exporter/
command=/bin/bash -c "/app/node_exporter/node_exporter"
autostart=true
autorestart=true
stdout_logfile=/var/log/node_exporter_stdout.log
stderr_logfile=/var/log/node_exporter_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
stopasgroup=true
killasgroup=true

[program:pushgateway]
directory=/app/pushgateway/
command=/bin/bash -c "/app/pushgateway/pushgateway"
autostart=true
autorestart=true
stdout_logfile=/var/log/pushgateway_stdout.log
stderr_logfile=/var/log/pushgateway_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
stopasgroup=true
killasgroup=true

[program:alertmanager]
directory=/app/alertmanager
command=/bin/bash -c "/app/alertmanager/alertmanager --config.file=/app/alertmanager/alertmanager.yml"
autostart=true
autorestart=true
stdout_logfile=/var/log/alertmanager_stdout.log
stderr_logfile=/var/log/alertmanager_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
stopasgroup=true
killasgroup=true
# 4.更新配置文件(第一次写配置文件需要执行才能生效)
[root@master-1 ~]# supervisorctl update
#重启服务
[root@master-1 ~]# systemctl restart supervisord
#起/停止所有服务
[root@master-1 ~]# supervisorctl start all
[root@master-1 ~]# supervisorctl stop all
#起/停止指定服务 [program:prometheus] [program:node_exporter]
[root@master-1 ~]# supervisorctl start prometheus
[root@master-1 ~]# supervisorctl stop prometheus
[root@master-1 ~]# supervisorctl start node_exporter
[root@master-1 ~]# supervisorctl stop node_exporter
三.安装Prometheus(k8s)
# prometheus部署 K8S方式
# 注意修改prom-pv-pvc.yaml的matchExpressions的values值
#创建命名空间(namespace)
kubectl create namespace prom
## ConfigMap
# Write the ConfigMap that carries prometheus.yml for the in-cluster server.
# The delimiter is quoted so the shell copies the manifest verbatim, and the
# YAML nesting (including the embedded prometheus.yml under the "|" block
# scalar) is spelled out explicitly.
cat > prom-cm.yaml <<'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-config
  namespace: prom
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 15s
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
EOF
## PV和PVC
# Write a local PersistentVolume pinned to one node plus the claim that binds
# to it through the app=prometheus label selector.
# - Edit the hostname under nodeAffinity to match the node that owns
#   /data/k8s/prometheus.
# - PersistentVolumes are cluster-scoped, so the PV carries no namespace; only
#   the claim lives in "prom".
cat > prom-pv-pvc.yaml <<'EOF'
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prom-localhost
  labels:
    app: prometheus
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  local:
    path: /data/k8s/prometheus
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - node-2
  persistentVolumeReclaimPolicy: Retain
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prom-data
  namespace: prom
spec:
  selector:
    matchLabels:
      app: prometheus
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: local-storage
EOF
## RBAC
# Write the ServiceAccount, ClusterRole and ClusterRoleBinding that let the
# Prometheus pod discover and scrape cluster objects.
# The ingresses rule lists networking.k8s.io next to the legacy "extensions"
# group: the Ingress created later in this guide is networking.k8s.io/v1, and
# a rule naming only "extensions" does not grant access to it.
cat > prom-rbac.yml <<'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prom
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups: [""]
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: prom
EOF
## Deployment
# Write the Prometheus Deployment.
# - The init container chowns the data volume to nobody:nobody before the
#   server starts.
#   NOTE(review): presumably because the prom/prometheus image runs as that
#   user - confirm for the image tag in use.
# - The server reads its config from the prom-config ConfigMap and keeps
#   24h of TSDB data on the prom-data claim.
# - --web.enable-lifecycle and --web.enable-admin-api are both switched on.
cat > prom-dp.yaml <<'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prom
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prom-data
        - name: config-volume
          configMap:
            name: prom-config
      initContainers:
        - name: fix-permissions
          image: busybox
          command: [chown, -R, "nobody:nobody", /prometheus]
          volumeMounts:
            - name: data
              mountPath: /prometheus
      containers:
        - name: prometheus
          image: prom/prometheus:v2.24.1
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=24h"
            - "--web.enable-admin-api"
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config-volume
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: 100m
              memory: 512Mi
EOF
## Service
# Write the ClusterIP Service that exposes the Prometheus pods
# (selected by app=prometheus) on port 9090 inside the cluster.
cat > prom-svc.yaml <<'EOF'
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prom
  labels:
    app: prometheus
spec:
  selector:
    app: prometheus
  ports:
    - name: web
      port: 9090
      targetPort: 9090
  type: ClusterIP
EOF
## ingress
# Write the Ingress that routes http://prom.drz.com/ to the prometheus
# Service on port 9090.
cat > prom-ingress.yaml <<'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: prom
  labels:
    app: prometheus
spec:
  rules:
    - host: prom.drz.com
      http:
        paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
EOF
四.Prometheus查询
4.1 PromQL
## 运算符
+ - * / % ^ == != > < >= <=
### 正则匹配
=~ 匹配
!~ 不匹配
## 查看201和202两台主机上mode以i开头(如idle、iowait、irq)的CPU时间
node_cpu_seconds_total{instance!='10.0.0.203:9100',mode=~'i.*'}
4.2 prometheus Metric Types(指标类型 )
metrics是一种对采样数据的总称,其并不代表一种具体的数据格式,而是一种对于度量计算单位的抽象。
类型 | 作用 | 应用 |
---|---|---|
gauge | 只有一个简单的返回值,或者叫瞬时状态 | 衡量待处理任务的个数,监控硬盘容量或者内存的使用量 |
counter | 计数器,从数据量0开始积累计算 | 统计一小时,一周,一个月的用户的访问量 |
histogram | 按桶(bucket)统计观测值的分布情况 | 统计请求耗时、响应大小等数据的分布,并据此估算分位数(如中位数、P95) |
4.3 prometheus数据格式(key:value)
prometheus的数据类型就是依赖于上面提到的metrics的类型来计算的。而对于采集回来的数据类型,必须要以一种具体的数据格式供我们查看和使用。如下图所示,请不要忽略以"#"开头的行,每个metric都除了以空格分割的K/V数据外,其上面会最少存在两行注释信息,分别为HELP和TYPE,其中HELP是对该条metric作用的简单描述,而TYPE表示该metric的类型。
4.4 数据类型
类型 | 说明 |
---|---|
即时向量(Instant Vector) | 特定或全部的时间序列集合上,具有相同时间戳的一组样本称为即时向量。 |
范围向量(Range Vector) | 特定或全部的时间序列集合上,在指定的同一范围内的所有样本值。 |
标量(Scalar) | 一个浮点型的数据值 |
字符串(String) | 支持使用单引号,双引号或反引号进行引用,但反引号中不会对转义字符进行转义 |
4.5 Prometheus函数
increase(求增量)
在prometheus中是用来针对Counter这种持续增长的数值,截取其中的一段时间的增量
例如:increase(node_cpu_seconds_total[1m])
获取CPU总使用时间在1分钟内的增量,计算的是1分钟内增加的总量
sum(求和)
sum(increase(node_cpu_seconds_total[1m]))
套用一个sum函数就可以把所有CPU核心在1分钟内的增量做一个累加
by(分组)
by是分组子句(不是函数),通常会和sum等聚合函数搭配使用,比如(sum(increase(node_cpu_seconds_total[1m])) by (instance))
,表示把sum函数中累加和按照"instance"(机器名称)强行拆分成多组数据
rate(速率计算函数)
它的功能是按照设置的一个时间段,取counter在这个时间段中的平均每秒的增量。因此是专门搭配counter类型数据使用的函数
提示:在实际工作中,我们取监控频率为1分钟还是5分钟这取决于我们对于监控数据的敏感程度来挑选
rate(node_cpu_seconds_total[1m])
获取CPU总使用时间在1分钟内的增加的总量并除以60秒,计算的是每秒的增量。
topk
取前几位的最高值。实际使用的时候一般会用该函数进行瞬时报警,而不是为了观察曲线图。
topk(3,rate(node_cpu_seconds_total[1m]))
获取CPU总使用时间在1分钟内的增加的总量并除以60秒,计算的是每秒的增量。并只查看top3。
count(统计函数)
count(container_cpu_system_seconds_total{image!=''}) by(instance)
把数值符合条件的输出数目进行累计加和。一般用它进行一些模糊的监控判断
五.pushgateway自定义监控
pushgateway是另一种采用被动推送(push)的方式(而不像exporter需要pull)获取监控数据的prometheus插件。
它是可以单独运行在任何节点(不一定是在被监控客户端)上的插件,然后通过用户自定义开发脚本把需要监控的数据发送给pushgateway,再由pushgateway暴露http接口,而后由prometheus server去pull数据。
pushgateway组件本身是没有任何抓取监控数据功能的,它只能被动的等待监控数据被推送过来。
规范:在Prometheus的项目目录创建monitor_scripts 目录存放自定义监控脚本
mkdir /app/prometheus/monitor_scripts
监控TCP状态
vim tcp_monitor.sh
#!/bin/bash
#
# tcp_monitor.sh - push the number of TCP sockets in each connection state to
# the Pushgateway as the gauge TCP_STATUS{name="<STATE>"}.
#
# The script performs a single push and exits; run it periodically to keep the
# series fresh. The payload is also left in ${FILE_PATH} (relative to the
# current directory) for inspection.

INSTANCE_NAME=$(hostname -s)
METRICS_NAME="TCP_STATUS"
FILE_PATH="tcp_temp.log"
PROMETHEUS_HOST="http://10.0.0.201:9091"

# Refuse to push under the instance label "localhost": it would not identify
# which machine the numbers came from.
if [[ "${INSTANCE_NAME}" == "localhost" ]]; then
  echo "Must use FQDN hostname" >&2
  exit 1
fi

# TCP connection states to report; a state with no sockets is reported as 0.
tcp_status=(
  CLOSED
  LISTEN
  SYN_SENT
  SYN_RECV
  ESTABLISHED
  FIN_WAIT1
  FIN_WAIT2
  TIME_WAIT
  CLOSE_WAIT
  LAST_ACK
  CLOSING
)

# Header for the Prometheus exposition format.
printf '# HELP %s monitor\n# TYPE %s gauge\n' \
  "${METRICS_NAME}" "${METRICS_NAME}" > "${FILE_PATH}"

# Take ONE netstat snapshot and tally it by the state column (field 6 of a
# tcp/tcp6 line), so every state is counted from the same instant and a state
# name appearing elsewhere on a line cannot be miscounted.
netstat -ant | awk -v metric="${METRICS_NAME}" -v wanted="${tcp_status[*]}" '
  BEGIN { total = split(wanted, state, " ") }
  $1 ~ /^tcp/ { seen[$6]++ }
  END {
    for (i = 1; i <= total; i++)
      printf "%s{name=\"%s\"} %d\n", metric, state[i], seen[state[i]]
  }' >> "${FILE_PATH}"

# Push the payload; job and instance become the Pushgateway grouping key.
curl --data-binary "@${FILE_PATH}" \
  "${PROMETHEUS_HOST}/metrics/job/${METRICS_NAME}/instance/${INSTANCE_NAME}"
监控容器运行时间
[root@master-1 ~]# vim runtime.sh
#!/bin/bash
#
# runtime.sh - report how long each Docker container has been running and push
# the values to the Pushgateway as the gauge docker_runtime{name="<container>"}.

# Metric name; also used as the Pushgateway job name.
METRICS_NAME='docker_runtime'
# Base URL of the Pushgateway.
PROMETHEUS_HOST='http://10.0.0.201:9091'
# Scratch file holding the payload, relative to the current directory.
FILE_PATH='docker_runtime_temp.log'

# Short host name, used as the "instance" part of the grouping key.
INSTANCE_NAME="$(hostname -s)"
# Names of the containers currently listed by `docker ps`, one per line.
allname="$(docker ps --format '{{.Names}}')"

# Stop early if the host name is just "localhost".
case "${INSTANCE_NAME}" in
  localhost)
    echo "Must use FQDN hostname"
    exit 1
    ;;
esac
# Function to calculate Docker container runtime
#######################################
# Print how many seconds the given container has been up.
# Arguments: $1 - container name or ID, handed to `docker inspect`
# Outputs:   seconds elapsed since the container's .State.StartedAt, on
#            stdout; 0 when the start time cannot be read or parsed
# Returns:   0 on success, 1 when `docker inspect` or `date` fails
#######################################
function dockerruntime() {
  local started_at started_epoch now_epoch

  # Start time as reported by Docker.
  if ! started_at=$(docker inspect -f '{{.State.StartedAt}}' "$1"); then
    echo 0
    return 1
  fi

  # Convert it to epoch seconds; without this check an unparsable value would
  # turn into a meaningless runtime.
  if ! started_epoch=$(date +%s -d "${started_at}"); then
    echo 0
    return 1
  fi

  now_epoch=$(date +%s)
  echo $(( now_epoch - started_epoch ))

  # The exit status is only a success flag. The runtime itself must not be
  # returned here: exit statuses are truncated to 0-255 and any non-zero value
  # reads as a failure.
  return 0
}
# Start the payload with the HELP/TYPE header for the gauge.
printf '# HELP %s time sec\n# TYPE %s gauge\n' \
  "${METRICS_NAME}" "${METRICS_NAME}" > "${FILE_PATH}"

# Walk over EVERY container, stopped ones included (`docker ps -a`), instead
# of the running-only list in ${allname}. A container that is not running is
# reported with a runtime of 0; if it were simply left out, its series would
# vanish from the pushed group and an alert of the form
# `docker_runtime{name="..."} == 0` could never fire.
while IFS= read -r container_name; do
  [[ -n "${container_name}" ]] || continue

  if [[ "$(docker inspect -f '{{.State.Running}}' "${container_name}" 2>/dev/null)" == "true" ]]; then
    runtime=$(dockerruntime "${container_name}")
  else
    runtime=0
  fi

  # ${runtime:-0} keeps the line well-formed even if no value was produced.
  echo "${METRICS_NAME}{name=\"${container_name}\"} ${runtime:-0}" >> "${FILE_PATH}"
done < <(docker ps -a --format '{{.Names}}')

# Upload the payload; job and instance form the Pushgateway grouping key.
curl -s --data-binary "@${FILE_PATH}" \
  "${PROMETHEUS_HOST}/metrics/job/${METRICS_NAME}/instance/${INSTANCE_NAME}"
监控端口
获取当前所有的端口:
netstat -lntup | awk '/^tcp/ {split($4, a, ":"); print a[length(a)]}'|sort|uniq > port_temp.txt
vim port_monitor
#!/bin/bash
#
# port_monitor - every 5 seconds, count the sockets bound to each port listed
# in port_temp.txt (one port per line) and push the counts to the Pushgateway
# as the gauge port_status{port="<port>"}. A count of 0 means nothing is
# bound to that port any more.

instance_name=$(hostname -s)
metrics_name="port_status"
file_path="/app/prometheus/monitor_scripts/port_temp.log"
port_list="/app/prometheus/monitor_scripts/port_temp.txt"

while true; do
  # One netstat snapshot per cycle, shared by all ports.
  listeners=$(netstat -lntup)

  {
    printf '# HELP %s monitor\n# TYPE %s gauge\n' "${metrics_name}" "${metrics_name}"

    while read -r port || [[ -n "${port}" ]]; do
      [[ -n "${port}" ]] || continue

      # Compare against the port part of the local-address column only
      # (field 4, text after the last ':'). Matching the number anywhere on
      # the line would also count PIDs and IP octets that happen to equal it.
      port_count=$(awk -v want="${port}" '
        $1 ~ /^(tcp|udp)/ { n = split($4, part, ":"); if (part[n] == want) hits++ }
        END { print hits + 0 }' <<<"${listeners}")

      printf '%s{port="%s"} %s\n' "${metrics_name}" "${port}" "${port_count}"
    done < "${port_list}"
  } > "${file_path}"

  curl --data-binary "@${file_path}" \
    "http://10.0.0.201:9091/metrics/job/${metrics_name}/instance/${instance_name}"
  sleep 5
done
监控进程
获取当前所有进程:
#记得去除文件内无关信息
netstat -lntup|awk -F/ '{print $NF}'|sort|uniq > process_temp.txt
vim prome_process
#!/bin/bash
#
# prome_process - every 5 seconds, count the running processes whose command
# line contains each name listed in process_temp.txt (one name per line) and
# push the counts to the Pushgateway as the gauge
# process_status{process="<name>"}.

instance_name=$(hostname -s)
metrics_name="process_status"
file_path="/app/prometheus/monitor_scripts/process/process_temp.log"
process_list="/app/prometheus/monitor_scripts/process/process_temp.txt"

while true; do
  # Snapshot only the command column once per cycle. Because the snapshot is
  # taken before grep starts, grep never counts itself, and user names or
  # other ps columns cannot produce false matches.
  proc_table=$(ps -eo args=)

  {
    printf '# HELP %s monitor\n# TYPE %s gauge\n' "${metrics_name}" "${metrics_name}"

    # `read` trims surrounding blanks but keeps a name with inner spaces
    # (e.g. "nginx: master") as one entry.
    while read -r process || [[ -n "${process}" ]]; do
      [[ -n "${process}" ]] || continue

      # Fixed-string match; grep -c prints 0 when nothing matches.
      process_count=$(grep -Fc -- "${process}" <<<"${proc_table}")

      printf '%s{process="%s"} %s\n' "${metrics_name}" "${process}" "${process_count}"
    done < "${process_list}"
  } > "${file_path}"

  curl --data-binary "@${file_path}" \
    "http://10.0.0.201:9091/metrics/job/${metrics_name}/instance/${instance_name}"
  sleep 5
done
配置supervisor
vim /etc/supervisord.d/monitor.ini
; supervisord program definitions for the custom Pushgateway collectors.
; "directory" is the supervisor option that sets the working directory before
; the command is started (the previous key "direct" is not a supervisor
; option, so no working directory was being set).

[program:prome_port]
directory=/app/prometheus/monitor_scripts/port/
command=/bin/bash -c "/app/prometheus/monitor_scripts/port/port_monitor"
autostart=true
autorestart=true
stdout_logfile=/var/log/prome_port_stdout.log
stderr_logfile=/var/log/prome_port_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
; Signal the whole process group so children of the bash wrapper stop too.
stopasgroup=true
killasgroup=true

[program:prome_process]
directory=/app/prometheus/monitor_scripts/process/
command=/bin/bash -c "/app/prometheus/monitor_scripts/process/prome_process"
autostart=true
autorestart=true
stdout_logfile=/var/log/prome_process_stdout.log
stderr_logfile=/var/log/prome_process_stderr.log
user=root
stopsignal=TERM
startsecs=5
startretries=3
stopasgroup=true
killasgroup=true