帆软+python+mysql实现服务器监控大屏并自动告警
帆软+python+mysql实现服务器监控大屏并自动告警
背景
某业务需要实时处理大量图像,请求经一台 nginx 负载均衡分发到三台 Linux 图像处理服务器。由于该业务的社会关注度较高,必须重点保障系统的可用性,因此需要在业务峰值期间实时监控服务器及应用的运行状态。
设计
1.新建监控数据表
2.利用python脚本获取服务器性能状态,存入监控表,并监控重要指标,达预警线时调用第三方语音API,拨打电话至手机
3.帆软从mysql取数展示监控数据和应用日志数据
实施步骤
1.在数据库中新建监控表
-- Monitoring table: get_sys_info.py inserts one row per sample (memory, load
-- average, disk, GPU and connection counts) together with the host IP.
-- Every column, including the numeric metrics and create_time, is declared
-- varchar(255); no primary key or index is defined.
-- NOTE(review): the cpu_lavg_* column comments describe "utilization", but the
-- script fills them from /proc/loadavg (load averages) - confirm the wording.
CREATE TABLE `sys_info` (`ip` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,`mem_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存占用百分比',`mem_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存已用',`mem_total` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总内存',`mem_buffers` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,`cpu_lavg_1` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '1分钟利用率',`cpu_lavg_5` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '5分钟利用率',`cpu_lavg_15` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '15分钟利用率',`cpu_nr` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT 'cpu数',`cpu_running_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '正在运行的进程数',`cpu_total_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总进程数',`cpu_last_pid` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '活跃进程id',`disk_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用',`disk_capacity` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,`disk_available` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘可用',`disk_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用占比',`gpu_0` varchar(255) DEFAULT NULL,`gpu_1` varchar(255) DEFAULT NULL,`gpu_2` varchar(255) DEFAULT NULL,`gpu_3` varchar(255) DEFAULT NULL,`server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '8888图像处理主服务',`image_server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci 
DEFAULT NULL COMMENT '8898图像处理换底服务',`create_time` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3;
2.编写python脚本获取服务器数据,并监控告警,拨打电话
# fileName: get_sys_info.py
# For convenience this script reuses parameters and helper instances from the
# business framework (Flask). Without Flask it takes a little more work:
# install pymysql and talk to the database directly.
# Core primitive: os.popen("shell command") runs the command and exposes
# whatever the command printed.
import os
import time

from flask import Flask

import config
from utils.mysqlclient import SqlUtil

# Flask application object handed to the SQL helper's init_app().
app = Flask(__name__)
# Project SQL helper used by job() to insert the collected samples.
sqlUtil = SqlUtil()
# Bind the SQL helper to the Flask app using the DB settings of the active
# configuration.
sqlUtil.init_app(app, config.config.get(config.config_name).DB_CONFIG)


def job():
    """Collect one sample of system metrics, alert if needed, and store it.

    Reads memory, CPU load, disk, GPU and connection statistics from the
    sibling ``*_stat`` helpers, places a voice call through ``call_phone``
    when any monitored value exceeds its threshold, and finally inserts the
    sample into the ``sys_info`` table via ``sqlUtil.exec``.
    """
    # Gather the current resource status.
    mem = memory_stat()
    cpu = cpu_stat()
    disk = disk_stat()
    gpu = gpu_stat()
    process = process_stat()

    # Compare the key metrics with their thresholds; when one is exceeded,
    # call the third-party voice API to notify by phone.
    phone = '181xxxx6589'  # Phone number that receives the alert call.
    dangerous = {  # Alert thresholds.
        'mem_percent': 90,
        'cpu_lavg_1': 98,
        'disk_percent': 95,
    }
    # NOTE(review): lavg_1 is read from /proc/loadavg, i.e. it is a load
    # average rather than a percentage - confirm that 98 is the intended limit.
    over_limit = (
        mem['percent'] > dangerous['mem_percent']
        # cpu_stat() hands the load average back as text, so convert it
        # before the numeric comparison.
        or float(cpu['lavg_1']) > dangerous['cpu_lavg_1']
        or disk['percent'] > dangerous['disk_percent']
    )
    if over_limit:
        call_phone(phone)

    sql = (
        "insert into sys_info (ip, mem_percent, mem_used, mem_total, mem_buffers, cpu_lavg_1, cpu_lavg_5, "
        "cpu_lavg_15, cpu_nr, cpu_running_process, cpu_total_process, cpu_last_pid, disk_used, disk_capacity, "
        "disk_available, disk_percent, gpu_0, gpu_1, gpu_2, gpu_3, server_process, image_server_process, "
        "create_time) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
        "%s, %s, %s, now())"
    )
    # One parameter per %s placeholder, in column order.
    params = [
        config.config.get(config.config_name).LOCAL_IP,
        mem['percent'], mem['used'], mem['MemTotal'], mem['Buffers'],
        cpu['lavg_1'], cpu['lavg_5'], cpu['lavg_15'], cpu['nr'],
        cpu['running_process'], cpu['total_process'], cpu['last_pid'],
        disk['used'], disk['capacity'], disk['available'], disk['percent'],
        gpu['gpu_0'], gpu['gpu_1'], gpu['gpu_2'], gpu['gpu_3'],
        process['server_process'], process['image_server_process'],
    ]
    sqlUtil.exec(sql, params)
# Memory monitoring
def memory_stat(meminfo_path='/proc/meminfo'):
    """Summarize memory usage from a meminfo-style file.

    Args:
        meminfo_path: File to parse; defaults to ``/proc/meminfo``. Each
            useful line has the form ``Name: <number> [unit]``.

    Returns:
        A dict with four keys:
        ``percent`` - used memory as an integer percentage of MemTotal,
        where used = MemTotal - MemFree - Buffers - Cached;
        ``used``, ``MemTotal``, ``Buffers`` - the respective values divided
        by 1024 * 1024 and rounded to two decimals (GiB when the file
        reports kB, as /proc/meminfo does).

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if MemTotal, MemFree, Buffers or Cached is missing.
        ValueError: if a value is not numeric.
    """
    mem = {}
    # The context manager closes the file even if parsing fails.
    with open(meminfo_path, 'r') as f:
        for line in f:
            parts = line.split(':')
            if len(parts) < 2:
                continue  # Blank or malformed line without a "Name:" prefix.
            fields = parts[1].split()
            if not fields:
                continue  # "Name:" with no value.
            mem[parts[0]] = float(fields[0])

    mem['MemUsed'] = mem['MemTotal'] - mem['MemFree'] - mem['Buffers'] - mem['Cached']
    scale = 1024 * 1024
    # Report usage percentage, used amount, total memory and buffer size.
    return {
        'percent': int(round(mem['MemUsed'] / mem['MemTotal'] * 100)),
        'used': round(mem['MemUsed'] / scale, 2),
        'MemTotal': round(mem['MemTotal'] / scale, 2),
        'Buffers': round(mem['Buffers'] / scale, 2),
    }
# CPU load monitoring
def cpu_stat(loadavg_path='/proc/loadavg'):
    """Return the load-average fields parsed from a loadavg-style file.

    Args:
        loadavg_path: File to parse; defaults to ``/proc/loadavg``. The
            content is expected to hold at least five whitespace-separated
            fields, the fourth being ``<running>/<total>``.

    Returns:
        A dict of strings (values are not converted to numbers):
        ``lavg_1``, ``lavg_5``, ``lavg_15`` - fields 1 to 3;
        ``nr`` - the raw fourth field;
        ``running_process``, ``total_process`` - the two halves of ``nr``;
        ``last_pid`` - field 5.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if the file has fewer fields than expected.
    """
    # The context manager closes the file even if reading fails.
    with open(loadavg_path) as f:
        fields = f.read().split()

    counts = fields[3].split('/')
    return {
        'lavg_1': fields[0],
        'lavg_5': fields[1],
        'lavg_15': fields[2],
        'nr': fields[3],
        'running_process': counts[0],
        'total_process': counts[1],
        'last_pid': fields[4],
    }
# Disk space monitoring
def disk_stat(path='/'):
    """Return disk usage of the filesystem containing *path*, in GiB.

    Args:
        path: Any path on the filesystem to inspect; defaults to ``/``.

    Returns:
        A dict with
        ``used`` - (f_blocks - f_bfree) * f_frsize, in GiB, two decimals;
        ``capacity`` - f_blocks * f_frsize, in GiB, two decimals;
        ``available`` - capacity minus used, rounded to two decimals;
        ``percent`` - used as an integer percentage of capacity (0 when the
        filesystem reports zero capacity).

    Raises:
        OSError: if ``os.statvfs`` fails for *path*.
    """
    gib = 1024 * 1024 * 1024
    st = os.statvfs(path)
    # f_blocks and f_bfree are counted in f_frsize units, so use f_frsize for
    # both figures instead of mixing it with f_bsize.
    used = round((st.f_blocks - st.f_bfree) * st.f_frsize / gib, 2)
    capacity = round(st.f_blocks * st.f_frsize / gib, 2)
    # Guard against filesystems that report no blocks at all.
    percent = int(round(used / capacity * 100)) if capacity else 0
    return {
        'used': used,
        'capacity': capacity,
        # Rounded so float subtraction noise does not reach the database.
        'available': round(capacity - used, 2),
        'percent': percent,
    }
# GPU monitoring
def gpu_stat(command='nvidia-smi'):
    """Collect the per-GPU status lines printed by *command*.

    Args:
        command: Shell command whose output is scanned; defaults to
            ``nvidia-smi``. It is executed through the shell by
            ``os.popen``, so only pass trusted text.

    Returns:
        A dict that always contains ``gpu_0`` .. ``gpu_3`` (``None`` when
        there is no such line). Every output line containing both ``MiB``
        and ``%`` is stored, unmodified, under ``gpu_<n>`` where n is its
        position among the matching lines; a fifth or later match adds
        further ``gpu_<n>`` keys.
    """
    gpu = {  # Up to four GPUs are tracked by default; add slots if needed.
        'gpu_0': None,
        'gpu_1': None,
        'gpu_2': None,
        'gpu_3': None,
    }
    # Close the pipe deterministically once the output has been read.
    with os.popen(command) as pipe:
        lines = pipe.readlines()

    usage_lines = [line for line in lines if 'MiB' in line and '%' in line]
    # enumerate() keeps identical lines (e.g. two idle GPUs) in separate
    # slots; list.index() would map both to the first one.
    for position, line in enumerate(usage_lines):
        gpu['gpu_' + str(position)] = line
    return gpu
# Connection count monitoring
def _count_netstat_matches(port):
    """Return the ``netstat -anp |grep <port> |wc -l`` count as a stripped string."""
    with os.popen('netstat -anp |grep {} |wc -l'.format(port)) as pipe:
        lines = pipe.readlines()
    # wc prints a single line; fall back to '0' if nothing came back at all.
    return lines[-1].strip() if lines else '0'


def process_stat():
    """Count netstat lines that mention the two service ports.

    Returns:
        A dict with ``server_process`` (lines matching ``8888``) and
        ``image_server_process`` (lines matching ``8898``). Both values are
        decimal strings without surrounding whitespace.

    NOTE(review): ``grep`` matches the digits anywhere in a netstat line
    (PIDs, remote ports, ...), so the counts can include unrelated entries.
    """
    return {
        'server_process': _count_netstat_matches(8888),
        'image_server_process': _count_netstat_matches(8898),
    }
# Phone call
def call_phone(phone):
    """Place a voice notification call through the Dingxin voice API.

    Free call quota can be obtained at
    https://market.aliyun.com/products/56928004/cmapi026600.html?spm=5176.2020520132.101.2.51547218rkAXxy

    Args:
        phone: Phone number that should receive the call.

    Returns:
        True when the API answers HTTP 200 with ``return_code == '00000'``;
        False on any HTTP error, network failure, or unexpected response
        (the failure details are printed).
    """
    # Imported here so the function works with the standard library alone.
    import json
    import urllib.error
    import urllib.parse
    import urllib.request

    appcode = ''  # Fill in the AppCode obtained from the URL above.
    api_url = 'http://yuyin2.market.alicloudapi.com/dx/voice_notice'
    data = {
        'tpl_id': 'TP1801174',
        'phone': phone,
        'param': 'name:{name},msg:{msg}'.format(name='管理员', msg='服务器状态达警戒线,请注意'),
    }
    req = urllib.request.Request(
        api_url,
        # Form-encoded body, as sent for a dict payload.
        data=urllib.parse.urlencode(data).encode('utf-8'),
        headers={'Authorization': 'APPCODE {}'.format(appcode)},
        method='POST',
    )
    try:
        # The timeout keeps a hung API call from stalling the monitor loop.
        with urllib.request.urlopen(req, timeout=10) as response:
            status = response.status
            body = response.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as err:
        # 4xx/5xx answers (e.g. 400, 401, 403) arrive here.
        print(err.code, err.read().decode('utf-8', errors='replace'))
        return False
    except OSError as err:
        # Connection problems and timeouts.
        print(err)
        return False

    try:
        result = json.loads(body)
    except ValueError:
        print(body)
        return False

    if status == 200 and isinstance(result, dict) and result.get('return_code') == '00000':
        print('发送语音成功')
        return True
    print(result)
    return False


if __name__ == '__main__':
    while True:
        time.sleep(5)  # Take one sample every 5 seconds.
        job()
运行脚本
由于脚本依赖 flask 项目中的配置(config)和工具类(utils.mysqlclient),需要将 get_sys_info.py 放入 flask 项目根目录后执行:
nohup python get_sys_info.py & # 后台运行
ps -ef |grep python # 查看脚本进程状态
3.帆软大屏展示数据

本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
