由于公司需要监控的目标类型较多,每次都手动修改prometheus规则再reload并不现实,所以用python写了一个程序来自动更新prometheus配置。
基本环境准备
- python 3.10.10
- flask 2.3.2
- prometheus 2.52.0
基本流程
- 将接口传来的prometheus规则信息保存到数据表中
- 取数据表中所有prometheus规则生成规则文件保存到本地临时文件夹内
- 获取需要修改prometheus机器ip
- 根据第三步获取的ip读取之前的prometheus规则备用
- 根据第三步获取的ip删除之前的prometheus规则文件
- 把第二步生成的规则文件上传到第三步获取ip机器上
- 通过调用http://{ip}:9090/-/reload接口让配置文件重新生效
- 如果新的规则文件未生效,把第四步备份的旧规则文件重新上传并再次reload
- 清除第二步中的临时文件夹
以上为开发流程,在基本环境准备好的前提下开始开发。本文中导入的kevin模块均为本人自行开发的功能模块。
1.数据表创建及模型开发
-- Table holding one Prometheus alerting rule per row.
-- `expr`, `duration` and `severity` are the rule expression, the "for" duration
-- and the severity label; `summary` / `description` are free-text annotations.
-- `datasource`, `group` and `heal` are integer ids (no foreign keys are declared here).
-- NOTE(review): DROP TABLE destroys existing rules; only run this on a fresh setup.
-- NOTE(review): CHARSET=utf8 is MySQL's 3-byte utf8; consider utf8mb4 if
-- summary/description may contain 4-byte characters -- confirm before changing.
DROP TABLE IF EXISTS `prom_ruler`;
CREATE TABLE `prom_ruler` (`id` int(11) NOT NULL AUTO_INCREMENT,`expr` varchar(1023) DEFAULT NULL,`duration` varchar(16) DEFAULT NULL,`severity` varchar(15) DEFAULT NULL,`summary` text,`description` text,`datasource` int(11) DEFAULT NULL,`group` int(11) DEFAULT NULL,`updated_on` datetime DEFAULT NULL,`created_on` datetime DEFAULT NULL,`heal` int(11) DEFAULT NULL,PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from sqlalchemy import Column, Integer, String, TEXT

from kevin.sqlalchemy_utils.model import Model


class PromRulerModel(Model):
    """ORM mapping for the ``prom_ruler`` table (one Prometheus alert rule per row).

    Column lengths mirror the ``CREATE TABLE prom_ruler`` statement so that a
    schema generated from this model matches the hand-written DDL.

    NOTE(review): ``id``, ``created_on`` and ``updated_on`` exist in the table
    but are not declared here; presumably the project ``Model`` base class
    provides them -- confirm.
    """

    __tablename__ = "prom_ruler"

    # Full alert expression (metric + comparison symbol + threshold).
    expr = Column(String(1023))
    # Value used for the rule's "for" clause.
    duration = Column(String(16))
    # Value used for the rule's "severity" label.
    severity = Column(String(15))
    summary = Column(TEXT)
    description = Column(TEXT)
    # Integer ids referencing the datasource, group and alarm-heal records.
    datasource = Column(Integer)
    group = Column(Integer)
    heal = Column(Integer)
2.业务逻辑处理
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import traceback
from typing import List

import yaml
import requests
from flask import request

from kevin.common.http_code import HttpCode
from kevin.log import logger
from kevin.sqlalchemy_utils.model_utils import list_to_dictlist
from kevin.utils import pretty_result

from config import Config
from extensions.ssh_ext import read_remote_file, exec_ssh_one_cmd, upload_file_to_linux
from models.alarm_heal import AlarmHeal
from models.common import GroupModel, DataSourceModel
from models.prometheus import PromRulerModel, PromRuleClassifyModel
from resource.v1.dc2_monitor_mixin import MonitorMixin


class PrometheusDbView(MonitorMixin):
    """Database-side helpers: create, delete, list and replace rule rows."""

    @staticmethod
    def generate_prometheus_rule_model(prometheus_rule: dict) -> PromRulerModel:
        """Build an unsaved ``PromRulerModel`` from the request payload.

        The expression is the concatenation of the ``monitorMetrics``,
        ``symbols`` and ``alarmThreshold`` fields. ``heal`` falls back to 0
        when it is missing, falsy or the literal string ``"None"``.
        """
        prom_rule_model = PromRulerModel(
            expr=(
                prometheus_rule["monitorMetrics"]
                + prometheus_rule["symbols"]
                + prometheus_rule["alarmThreshold"]
            ),
            duration=prometheus_rule["duration"],
            severity=prometheus_rule["severity"],
            summary=prometheus_rule["summary"],
            description=prometheus_rule["description"],
            datasource=prometheus_rule["datasource"],
            group=prometheus_rule["group"],
        )
        heal = prometheus_rule.get("heal")
        prom_rule_model.heal = heal if heal and heal != "None" else 0
        return prom_rule_model

    @staticmethod
    def save_prometheus_rule(prom_rule_model) -> None:
        """Insert the model; raise if the ORM helper reports failure."""
        add_prom_rule_model_res = prom_rule_model.objects.add()
        if not add_prom_rule_model_res:
            raise Exception("prom rule was failed deposited into the database.")

    @staticmethod
    def delete_prometheus_rule(prometheus_rule_id: int) -> dict:
        """Delete a rule row and return its attributes as they were before deletion.

        The attribute dict is copied *before* the delete is issued: the live
        ``__dict__`` of an ORM object may be emptied when the session expires
        the instance (NOTE(review): depends on the session configuration of the
        project ORM helper -- confirm), and callers still need
        ``["datasource"]`` afterwards.
        """
        rule_obj = PromRulerModel.objects_().get_by_id(prometheus_rule_id)
        prom_rule = dict(rule_obj.__dict__)
        PromRulerModel.objects_().delete(PromRulerModel.id == prometheus_rule_id)
        return prom_rule

    def save_prometheus_rule_to_db(self, prometheus_rule: dict) -> None:
        """Convert the payload to a model and insert it."""
        logger.info("Start save prometheus rule to db.")
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Success save prometheus rule to db.")

    @staticmethod
    def get_prometheus_rule_by_db() -> List[dict]:
        """Return every rule as a dict, with datasource/group ids resolved to names.

        The original id is kept under ``datasource_id`` / ``group_id``. When the
        referenced record can no longer be resolved, the raw id stays in place
        and an error is logged instead of failing the whole listing.
        """
        prom_rule = PromRulerModel.objects_().list() or []
        prom_rule = list_to_dictlist(prom_rule)
        for rule in prom_rule:
            if "datasource" in rule and rule["datasource"] != 'None':
                rule["datasource_id"] = int(rule["datasource"])
                try:
                    rule["datasource"] = DataSourceModel.objects_().get_by_id(rule["datasource_id"]).name
                except Exception:
                    logger.error("datasource deleted!")
            if "group" in rule:
                rule["group_id"] = int(rule["group"])
                try:
                    rule["group"] = GroupModel.objects_().get_by_id(rule["group_id"]).name
                except Exception:
                    logger.error("group deleted!")
        return prom_rule

    def update_prometheus_rule_by_db(self, prometheus_rule: dict) -> None:
        """Replace a rule: delete the row with ``prometheus_rule["id"]`` and insert a new one.

        NOTE(review): the model is built before the delete so a malformed
        payload fails without removing the old row; the replacement is a fresh
        insert, so the row presumably receives a new auto-increment id.
        """
        logger.info("Start update prometheus rule to db.")
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.delete_prometheus_rule(prometheus_rule_id=prometheus_rule["id"])
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Finish update prometheus rule to db.")


class PrometheusRuleView(MonitorMixin):
    """Rule-file side: render YAML, push it to the Prometheus hosts and reload."""

    # Seconds to wait for the Prometheus reload endpoint before giving up.
    RELOAD_TIMEOUT = 30

    @staticmethod
    def generate_prometheus_ips(prometheus_rule: dict) -> list:
        """Return the host list stored (comma-separated) in the rule's datasource ``url``."""
        datasource_id = int(prometheus_rule["datasource"])
        url = DataSourceModel.objects_().get_by_id(datasource_id).url
        # Tolerate "a, b," style values: strip blanks and drop empty entries.
        return [ip.strip() for ip in url.split(",") if ip.strip()]

    def read_prom_config(self, ip: str) -> list:
        """Read the current remote rule file so it can be restored on failure."""
        host_info = self._get_host_info(ip)
        prom_rule_old_yml = read_remote_file(host_info=host_info, remote_path=Config.PROMETHEUS_CONFIG_PATH)
        return prom_rule_old_yml

    def generate_prometheus_yml(self) -> None:
        """Render all rules from the database into the local temporary rule file.

        Every rule goes into the single group ``default_group``; the alert name
        is the name of the rule's group record. A rule that cannot be rendered
        is logged and skipped so one bad row does not block the others.
        """
        prometheus_yml = {"groups": [{"name": "default_group", "rules": []}]}
        prom_rule_model = PromRulerModel.objects_().list() or []
        prom_rule_list = list_to_dictlist(prom_rule_model)
        for prom_rule in prom_rule_list:
            try:
                prom_rule_dict = {
                    "alert": GroupModel.objects_().get_by_id(int(prom_rule["group"])).name,
                    "annotations": {
                        "summary": prom_rule["summary"],
                        "description": prom_rule["description"],
                    },
                    "expr": prom_rule["expr"],
                    "for": prom_rule["duration"],
                    "labels": {
                        "prom_id": prom_rule["id"],
                        "severity": prom_rule["severity"],
                    },
                }
                prometheus_yml["groups"][0]["rules"].append(prom_rule_dict)
            except Exception as e:
                logger.error(f"generate prom rule, error: {e}, {traceback.format_exc()}")
                logger.error(f"prom rule {prom_rule}")
        self.generate_conf_tmp_dir()
        with open(Config.PROMETHEUS_CONFIG_LOCAL_PATH, "w", encoding='utf-8') as f:
            yaml.dump(prometheus_yml, f, allow_unicode=True)

    @staticmethod
    def generate_old_prometheus_yml(prom_rule_old_yml) -> None:
        """Write the previously read remote rules to the local "old" file.

        NOTE(review): this assumes ``read_remote_file`` returned parsed YAML
        data; if it returns raw text, ``yaml.dump`` would quote it as a single
        string instead of reproducing the original file -- confirm.
        """
        with open(Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH, "w", encoding='utf-8') as f:
            yaml.dump(prom_rule_old_yml, f, allow_unicode=True)

    def remove_remote_prometheus_rules(self, ip: str) -> None:
        """Delete the rule file on the remote host over SSH."""
        remove_remote_prometheus_rule_cmd = f"rm -f {Config.PROMETHEUS_CONFIG_PATH}"
        host_info = self._get_host_info(ip)
        exec_ssh_one_cmd(host_info=host_info, command=remove_remote_prometheus_rule_cmd)

    def upload_prometheus_rules_file(self, ip: str, local_host: str) -> None:
        """Upload the local file at ``local_host`` (a local path) to the remote rule path."""
        host_info = self._get_host_info(ip)
        upload_file_to_linux(
            host_info=host_info,
            local_path=local_host,
            remote_path=Config.PROMETHEUS_CONFIG_PATH,
        )

    @staticmethod
    def reload_prometheus_rules(ip: str) -> bool:
        """POST to ``/-/reload`` on the host; return True only on HTTP 200.

        Network errors and timeouts count as a failed reload (False) rather
        than raising, so the caller can still roll back to the old rule file.
        """
        url = f"http://{ip}:9090/-/reload"
        try:
            resp = requests.post(url=url, timeout=PrometheusRuleView.RELOAD_TIMEOUT)
        except requests.RequestException as e:
            logger.error(f"Failed to reload prom rule {ip}, error: {e}")
            return False
        if resp.status_code == 200:
            logger.info(f"Success reload prom rule {ip}")
            return True
        logger.error(f"Failed to reload prom rule {ip}")
        return False

    def reload_prometheus_conf(self, prometheus_rule: dict) -> None:
        """Regenerate the rule file and roll it out to every host of the rule's datasource.

        On a failed reload the previously read rules are uploaded again and a
        second reload is attempted. The local temp directory is always removed,
        even when a step raises.
        """
        # 2. Generate the new rule file locally.
        self.generate_prometheus_yml()
        try:
            # 3. Resolve the target hosts.
            prometheus_ips = self.generate_prometheus_ips(prometheus_rule=prometheus_rule)
            for prometheus_ip in prometheus_ips:
                # 4. Keep the current remote rules as a fallback.
                prom_old_rule = self.read_prom_config(ip=prometheus_ip)
                # 5. Remove the remote rule file.
                self.remove_remote_prometheus_rules(ip=prometheus_ip)
                # 6. Upload the new rule file.
                self.upload_prometheus_rules_file(ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_LOCAL_PATH)
                # 7. Ask Prometheus to reload.
                reload_result = self.reload_prometheus_rules(ip=prometheus_ip)
                # 8. On failure, restore the old rules.
                if not reload_result:
                    # 9. Write the old rules to a local file.
                    self.generate_old_prometheus_yml(prom_old_rule)
                    # 10. Upload the old file and reload again.
                    self.upload_prometheus_rules_file(
                        ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH
                    )
                    self.reload_prometheus_rules(ip=prometheus_ip)
        finally:
            # Remove the temporary directory created for the generated files.
            self.clear_conf_tmp_dir()


class PrometheusView(PrometheusDbView, PrometheusRuleView):
    """REST resource for Prometheus rules (list / create / replace / delete)."""

    def get(self):
        """List rules.

        Without any query parameter the plain rule list is returned. Otherwise
        the result is ``{"prom_rules": [...], "total": n}``, optionally
        filtered by ``ruleClassifyId`` and sliced by ``page`` / ``pageSize``.
        When either paging parameter is missing, the full list is returned
        instead of failing on ``int(None)``.
        """
        rule_classify_id = request.values.get("ruleClassifyId")
        page = request.values.get("page")
        pagesize = request.values.get("pageSize")

        # Rule-classification page: fetch everything, no paging.
        if rule_classify_id is None and page is None and pagesize is None:
            prom_rules = self.get_prometheus_rule_by_db()
            return pretty_result(code=HttpCode.OK, data=prom_rules)

        if rule_classify_id == "0" or rule_classify_id is None:
            prom_rules = self.get_prometheus_rule_by_db()
        else:
            prom_rules = []
            prom_rule_classify = PromRuleClassifyModel.objects_().get_by_id(int(rule_classify_id))
            if prom_rule_classify is not None:
                rule_ids = prom_rule_classify.rule_ids.split(",")
                prom_rule_obj = PromRulerModel.objects_().list_by_ids(rule_ids) or []
                for prom_rule in prom_rule_obj:
                    rule = {
                        "datasource_id": prom_rule.datasource,
                        "datasource": DataSourceModel.objects_().get_by_id(prom_rule.datasource).name,
                        "description": prom_rule.description,
                        "duration": prom_rule.duration,
                        "expr": prom_rule.expr,
                        "group_id": prom_rule.group,
                        "group": GroupModel.objects_().get_by_id(prom_rule.group).name,
                        "id": prom_rule.id,
                        "severity": prom_rule.severity,
                        "summary": prom_rule.summary,
                        "heal": prom_rule.heal,
                    }
                    if prom_rule.heal and int(prom_rule.heal) != 0:
                        rule["heal_title"] = AlarmHeal.objects_().get_by_id(int(prom_rule.heal)).title
                    else:
                        rule["heal_title"] = ""
                    prom_rules.append(rule)

        total = len(prom_rules)
        if page is not None and pagesize is not None:
            page_no, page_len = int(page), int(pagesize)
            prom_rules = prom_rules[(page_no - 1) * page_len: page_no * page_len]
        return pretty_result(code=HttpCode.OK, data={"prom_rules": prom_rules, "total": total})

    def post(self):
        """Create a rule, then regenerate and reload the Prometheus rule file."""
        prometheus_rule = request.get_json(force=True)
        # 1. Store the rule in the database.
        self.save_prometheus_rule_to_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def put(self):
        """Replace a rule, then regenerate and reload the Prometheus rule file."""
        prometheus_rule = request.get_json(force=True)
        self.update_prometheus_rule_by_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def delete(self):
        """Delete the rule whose id is the JSON body, then regenerate and reload."""
        prometheus_rule_id = request.get_json(force=True)
        prometheus_rule = self.delete_prometheus_rule(prometheus_rule_id=int(prometheus_rule_id))
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import shutil
import traceback

from flask_restful import Resource

from kevin.log import logger

from config import Config
from data_entity import HostInfo


class MonitorMixin(Resource):
    """Shared helpers for monitor resources: SSH host info and the local temp dir."""

    @staticmethod
    def _get_host_info(ip):
        """Build the SSH connection descriptor for a Prometheus host (port 22)."""
        host_info = HostInfo(
            hostname=ip,
            username=Config.MONITOR_USERNAME,  # user name on the Prometheus machine
            password=Config.MONITOR_PASSWORD,  # password on the Prometheus machine
            port=22,
        )
        return host_info

    @staticmethod
    def clear_conf_tmp_dir() -> None:
        """Remove ``<PROJECT_DIR>/tmp``; best effort, failures are only logged."""
        try:
            tmp_path = os.path.join(Config.PROJECT_DIR, "tmp")
            shutil.rmtree(tmp_path)
            logger.info(f"Success clear conf tmp dir: {tmp_path}")
        except Exception as e:
            logger.error(f"Failed to clear conf tmp dir, error: {e}, {traceback.format_exc()}")

    @staticmethod
    def generate_conf_tmp_dir() -> None:
        """Create ``<PROJECT_DIR>/tmp`` if it does not exist yet."""
        yaml_dir = os.path.join(Config.PROJECT_DIR, "tmp")
        # exist_ok avoids the check-then-create race between concurrent requests
        # and also creates missing parent directories.
        os.makedirs(yaml_dir, exist_ok=True)
3.接口实现
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Registers the versioned REST resources on a flask_restful Api instance.

# The star import is kept as in the original: the names exported by ``.v1``
# (including PrometheusView) are not visible from this file.
from .v1 import *
from flask_restful import Api

# All routes registered below are served under /api/v1.
api = Api(prefix="/api/v1")
api.add_resource(PrometheusView, "/prometheusconfigs")