Script de shell para consultar con un solo comando los recursos de un sistema Linux, monitorearlos y enviar alarmas por correo

Durante la operación de un servidor, a menudo es necesario monitorear sus recursos —por ejemplo, la carga de la CPU, el uso de memoria y el uso de disco— para recibir una alerta a tiempo cuando el sistema se comporta de forma anormal. Este artículo implementa, con un script de shell que cada uno puede adaptar a sus necesidades, el monitoreo de los recursos del sistema con alarmas por correo y su consulta con un solo comando.

Requisitos:
1. Supervisar la carga de la CPU y enviar una alarma por correo electrónico
2. Supervisar el uso de la memoria y enviar una alarma por correo electrónico
3. Supervisar el uso de espacio en disco y enviar una alarma por correo electrónico
4. Supervisar la memoria ocupada por los datos del servicio Redis y enviar una alarma por correo electrónico

1. Alarma por correo electrónico: primero debe configurar un servidor de correo. Cómo montar un servidor de correo sencillo se explicó en la entrada anterior del blog; este es el enlace:
https://blog.csdn.net/weixin_44901564/article/details/111914395

2. Escriba el script de shell.
Nota: los umbrales de alarma definidos en el script solo sirven para probar el envío del correo de alarma; ajústelos antes de usar el script en un servidor real.

#!/bin/bash
#===========================================================================================
#
#	       Use: Monitor system resources
#       Author: xxx
#  Create Time: 2020/12/28
#===========================================================================================

function fun_lock
{
    # Acquire an exclusive lock by creating the lock file with noclobber, so
    # the creation fails if the file already exists.
    #
    # Arguments:
    #   $1 - path of the lock file; the current PID ($$) is written into it
    #   $2 - optional command run as "<command> <lock file>" when the script
    #        exits or receives INT/TERM while holding the lock
    # Returns:
    #   0 when the lock was acquired, 1 when it is already held or the file
    #   cannot be created.
    # Side effects:
    #   On success installs an INT/TERM/EXIT trap that runs the optional
    #   command, removes the lock file and exits with the status the script
    #   was exiting with. The caller's nounset/noclobber options are left
    #   exactly as they were.
    local lock_path="${1:-}"
    local hook="${2:-}"
    local lock=1
    local had_noclobber=0
    [[ -o noclobber ]] && had_noclobber=1

    set -o noclobber
    # The group redirection hides the "cannot overwrite existing file"
    # message that the shell prints when the lock is already held.
    if { echo "$$" > "${lock_path}"; } 2>/dev/null; then
        lock=0
        local quoted_path cleanup
        printf -v quoted_path '%q' "${lock_path}"
        # Single quotes keep $? and $ret unexpanded until the trap fires, so
        # the real exit status is captured (before the hook can change it).
        # The traps are cleared first so the cleanup runs once even when a
        # signal handler ends in "exit", which would re-trigger EXIT.
        cleanup='ret=$?; trap - INT TERM EXIT; '
        [[ -n "${hook}" ]] && cleanup+="${hook} ${quoted_path}; "
        cleanup+="rm -f -- ${quoted_path}; exit \"\$ret\""
        trap "${cleanup}" INT TERM EXIT
    fi
    (( had_noclobber )) || set +o noclobber
    return ${lock}
}

function fun_unlock
{
    # Release the lock: drop the INT/TERM/EXIT handlers and delete the lock
    # file.
    #
    # Arguments:
    #   $1 - path of the lock file to remove
    #
    # "trap -" restores the default dispositions. The previous "trap ''"
    # told the shell to ignore INT and TERM, which left the rest of the
    # script (and any children it starts) deaf to Ctrl-C and kill.
    trap - INT TERM EXIT
    rm -f -- "${1:-}"
}

function fun_print
{
    # Print "<label> <value>" on one line with the value in bold green.
    # Backslash escapes inside either argument are interpreted.
    #
    # Arguments:
    #   $1 - label text
    #   $2 - value text
    local label="$1"
    local value="$2"
    local -r green='\033[1;32m'
    local -r reset='\033[0m'

    printf '%b\n' "${label} ${green}${value}${reset}"
}

get_cpu_info(){
    # Print the CPU topology and the current CPU utilisation, and send an
    # e-mail alarm when the idle percentage drops below the threshold.
    #
    # Globals:
    #   time             (read) timestamp shown in the banner and mail subject
    #   MONITOR_MAIL_TO  (read, optional) alarm recipient; defaults to "root".
    #                    The recipient in the original listing had been
    #                    replaced by an e-mail-protection placeholder that
    #                    expanded to two invalid addresses.
    # Calls:   fun_print
    # Returns: 0, or 1 when the idle percentage cannot be read from top.
    #
    # The load averages the earlier version parsed out of `uptime` were never
    # displayed or checked, so they are no longer collected.

    # Alarm fires when idle CPU is below this value. 100 means "almost
    # always": the thresholds in this script are set for testing the mail.
    local -r idle_alarm_below=100
    local -r number_re='^[0-9]+([.][0-9]+)?$'
    local mail_to="${MONITOR_MAIL_TO:-root}"

    # /proc/cpuinfo is world-readable, so no sudo is needed. "sort -u" is
    # required because the "physical id" lines of different sockets may be
    # interleaved, which plain "uniq" would count more than once.
    local cpu_num cpu_core cpu_logic_num
    cpu_num=$(grep 'physical id' /proc/cpuinfo | sort -u | wc -l)          # physical CPUs (sockets)
    cpu_core=$(awk -F': *' '/^cpu cores/ {print $2; exit}' /proc/cpuinfo)  # cores per physical CPU
    cpu_logic_num=$(grep -c '^processor' /proc/cpuinfo)                    # logical CPUs

    # One top sample, parsed by label instead of by field position: top
    # prints "ni,100.0 id" without a space when a value reaches 100, and
    # older versions print "98.7%id", both of which shift positional fields.
    local cpu_user cpu_system cpu_idle cpu_iowait
    IFS='|' read -r cpu_user cpu_system cpu_idle cpu_iowait < <(
        LC_ALL=C top -b -n 1 | awk '
            /Cpu\(s\)/ {
                sub(/^[^:]*:/, "")
                n = split($0, chunk, ",")
                for (i = 1; i <= n; i++) {
                    gsub(/%/, " ", chunk[i])
                    if (split(chunk[i], kv, " ") >= 2) val[kv[2]] = kv[1]
                }
                printf "%s|%s|%s|%s\n", val["us"], val["sy"], val["id"], val["wa"]
                exit
            }'
    )

    printf '\033[1;35m%s\033[0m\n' "#####################监控Cpu资源############### ${time:-}"
    fun_print "物理Cpu个数:" "$cpu_num"
    fun_print "系统Cpu核数:" "$cpu_core"
    fun_print "逻辑Cpu个数:" "$cpu_logic_num"

    fun_print "用户空间占用Cpu百分比:" "$cpu_user"
    fun_print "内核空间占用Cpu百分比:" "$cpu_system"
    fun_print "空闲CPU百分比:" "$cpu_idle"
    fun_print "获取等待输入输出占用Cpu百分比:" "$cpu_iowait"

    echo ""

    if ! [[ "${cpu_idle}" =~ ${number_re} ]]; then
        echo "get_cpu_info: could not read the idle CPU percentage from top" >&2
        return 1
    fi

    if awk -v idle="${cpu_idle}" -v limit="${idle_alarm_below}" \
        'BEGIN { exit !(idle + 0 < limit + 0) }'; then
        local cpu_usage
        cpu_usage=$(awk -v idle="${cpu_idle}" 'BEGIN { printf "%.2f", 100 - idle }')

        local mail_body="cpu利用率过高,当前cpu利用率${cpu_usage}%"
        local console_msg="${mail_body}"
        local -a mailer=(mail)
        if (( EUID != 0 )); then
            # Non-root users send through sudo; their console line has
            # always been spelled with a capital "Cpu", kept as-is.
            mailer=(sudo mail)
            console_msg="cpu利用率过高,当前Cpu利用率${cpu_usage}%"
        fi

        printf '\033[1;31m%s\033[0m\n' "${console_msg}"
        printf '%s\n' "${mail_body}" | "${mailer[@]}" -s "cpu告警触发${time:-}" "${mail_to}"
    fi
}

get_mem_info(){
    # Print RAM and swap totals and send an e-mail alarm when the share of
    # used RAM exceeds the threshold.
    #
    # Globals:
    #   time             (read) timestamp shown in the banner and mail subject
    #   MONITOR_MAIL_TO  (read, optional) alarm recipient; defaults to "root".
    #                    The recipient in the original listing had been
    #                    replaced by an e-mail-protection placeholder that
    #                    expanded to two invalid addresses.
    # Calls:   fun_print
    # Returns: 0, or 1 when the totals cannot be read from `free -m`.

    # Alarm fires above this percentage. The value is deliberately low: the
    # thresholds in this script are set for testing the mail.
    local -r usage_alarm_above=10
    local -r integer_re='^[0-9]+$'
    local mail_to="${MONITOR_MAIL_TO:-root}"

    # Run free once per unit instead of once per field. LC_ALL=C pins the
    # "Mem:"/"Swap:" row labels the parsing relies on.
    local human_report mib_report
    human_report=$(LC_ALL=C free -h)
    mib_report=$(LC_ALL=C free -m)

    local mem_total mem_used mem_free
    local mem_swap mem_swap_used mem_swap_free
    read -r mem_total mem_used mem_free \
        < <(awk '/^Mem/ {print $2, $3, $4; exit}' <<< "${human_report}")
    read -r mem_swap mem_swap_used mem_swap_free \
        < <(awk '/^Swap/ {print $2, $3, $4; exit}' <<< "${human_report}")

    printf '\033[1;35m%s\033[0m\n' "#####################监控内存资源############## ${time:-}"
    fun_print "内存总量:" "$mem_total"
    fun_print "已经使用的内存总量:" "$mem_used"
    fun_print "空闲的内存总量:" "$mem_free"

    fun_print "交换分区大小:" "$mem_swap"
    fun_print "已经使用的交换分区大小:" "$mem_swap_used"
    fun_print "空闲的交换分区大小:" "$mem_swap_free"

    # `free -m` reports mebibytes.
    local mem_total_mib mem_used_mib
    read -r mem_total_mib mem_used_mib \
        < <(awk '/^Mem/ {print $2, $3; exit}' <<< "${mib_report}")

    echo ""

    if ! [[ "${mem_total_mib}" =~ ${integer_re} && "${mem_used_mib}" =~ ${integer_re} ]] \
        || (( 10#${mem_total_mib} == 0 )); then
        echo "get_mem_info: could not read the memory totals from 'free -m'" >&2
        return 1
    fi

    # Multiply before dividing: used/total evaluated first at two decimals
    # truncated the ratio and always produced a whole-number percentage.
    local mem_percentage
    mem_percentage=$(awk -v used="${mem_used_mib}" -v total="${mem_total_mib}" \
        'BEGIN { printf "%.2f", used * 100 / total }')

    if awk -v pct="${mem_percentage}" -v limit="${usage_alarm_above}" \
        'BEGIN { exit !(pct + 0 > limit + 0) }'; then
        local alarm_msg="内存利用率过高,当前内存利用率${mem_percentage}%"
        local -a mailer=(mail)
        # Non-root users send the mail through sudo.
        (( EUID != 0 )) && mailer=(sudo mail)

        printf '\033[1;31m%s\033[0m\n' "${alarm_msg}"
        printf '%s\n' "${alarm_msg}" | "${mailer[@]}" -s "内存告警触发${time:-}" "${mail_to}"
    fi
}

get_disk_info(){
    # Print size, used space, free space and utilisation of one filesystem
    # and send an e-mail alarm when utilisation exceeds the threshold.
    #
    # Arguments:
    #   $1 - optional mount point or path to inspect; defaults to "/"
    # Globals:
    #   time             (read) timestamp shown in the banner and mail subject
    #   MONITOR_MAIL_TO  (read, optional) alarm recipient; defaults to "root".
    #                    The recipient in the original listing had been
    #                    replaced by an e-mail-protection placeholder that
    #                    expanded to two invalid addresses.
    # Calls:   fun_print
    # Returns: 0, or 1 when the utilisation cannot be read from df.
    local mount_point="${1:-/}"

    # Alarm fires above this percentage. The value is deliberately low: the
    # thresholds in this script are set for testing the mail.
    local -r usage_alarm_above=10
    local mail_to="${MONITOR_MAIL_TO:-root}"

    # Ask df about the wanted filesystem explicitly: the second line of a
    # bare `df -h` is simply whichever filesystem happens to be listed first.
    # -P keeps every filesystem on a single line even with long device names.
    local disk_total disk_used disk_avail disk_percentage
    read -r disk_total disk_used disk_avail disk_percentage \
        < <(LC_ALL=C df -hP -- "${mount_point}" | awk 'NR == 2 {print $2, $3, $4, $5}')

    printf '\033[1;35m%s\033[0m\n' "#####################监控磁盘空间############## ${time:-}"
    fun_print "磁盘空间大小:" "$disk_total"
    fun_print "已使用磁盘空间大小:" "$disk_used"
    fun_print "可用磁盘空间大小:" "$disk_avail"
    fun_print "磁盘利用率:" "$disk_percentage"

    echo ""

    # Strip the trailing "%" and make sure an integer is left to compare.
    local percent_value="${disk_percentage%\%}"
    if ! [[ "${percent_value}" =~ ^[0-9]+$ ]]; then
        echo "get_disk_info: could not read the utilisation of ${mount_point} from df" >&2
        return 1
    fi

    if (( 10#${percent_value} > usage_alarm_above )); then
        local alarm_msg="磁盘利用率过高,当前磁盘利用率${disk_percentage}"
        local -a mailer=(mail)
        # Non-root users send the mail through sudo.
        (( EUID != 0 )) && mailer=(sudo mail)

        printf '\033[1;31m%s\033[0m\n' "${alarm_msg}"
        printf '%s\n' "${alarm_msg}" | "${mailer[@]}" -s "磁盘告警触发${time:-}" "${mail_to}"
    fi
}

get_redis_mem(){
    # Print the memory figures of a Redis server and send an e-mail alarm
    # when the memory used by its data exceeds the threshold.
    #
    # Arguments:
    #   $1 - optional Redis port; defaults to 6378
    # Globals:
    #   time             (read) timestamp shown in the banner and mail subject
    #   MONITOR_MAIL_TO  (read, optional) alarm recipient; defaults to "root".
    #                    The recipient in the original listing had been
    #                    replaced by an e-mail-protection placeholder that
    #                    expanded to two invalid addresses.
    # Returns: 0, or 1 when redis-cli gives no usable INFO output.
    local port="${1:-6378}"

    # Alarm fires above this many bytes. The value is deliberately low: the
    # thresholds in this script are set for testing the mail.
    local -r used_alarm_above=1000
    local mail_to="${MONITOR_MAIL_TO:-root}"

    # Query the server once. INFO lines end in CR LF; the carriage returns
    # are dropped here so they reach neither the screen nor the comparison.
    local info
    info=$(redis-cli -p "${port}" info | tr -d '\r')

    printf '\033[1;35m%s\033[0m\n' "#####################监控redis内存############# ${time:-}"

    if [[ -z "${info}" ]]; then
        echo "get_redis_mem: no INFO output from redis-cli on port ${port}" >&2
        return 1
    fi

    # Show the selected "key:value" lines in a fixed order (an empty line is
    # printed for a key the server does not report):
    #   used_memory              bytes used by data
    #   used_memory_human        same, human readable
    #   used_memory_rss          memory taken by the redis process
    #   used_memory_peak         peak usage in bytes
    #   used_memory_peak_human   same, human readable
    #   used_memory_lua          bytes used by the Lua engine
    #   mem_fragmentation_ratio  memory fragmentation ratio
    #   mem_allocator            allocator chosen at build time (libc, jemalloc, tcmalloc)
    awk -F: -v wanted='used_memory used_memory_human used_memory_rss used_memory_peak used_memory_peak_human used_memory_lua mem_fragmentation_ratio mem_allocator' '
        BEGIN { count = split(wanted, order, " ") }
        !($1 in line) { line[$1] = $0 }
        END { for (i = 1; i <= count; i++) print line[order[i]] }
    ' <<< "${info}"

    echo ""

    # The earlier version cut the last character off the human-readable
    # value, presumably to get rid of the trailing carriage return; with the
    # CRs already stripped the value is shown in full.
    local used_bytes used_internal_memory
    used_bytes=$(awk -F: '$1 == "used_memory" {print $2; exit}' <<< "${info}")
    used_internal_memory=$(awk -F: '$1 == "used_memory_human" {print $2; exit}' <<< "${info}")

    if ! [[ "${used_bytes}" =~ ^[0-9]+$ ]]; then
        echo "get_redis_mem: used_memory is missing from the INFO output" >&2
        return 1
    fi

    # Numeric comparison on the byte count. The former `[ "..." > 1000 ]`
    # was a redirection: it created a file named "1000" and was always true.
    if (( 10#${used_bytes} > used_alarm_above )); then
        local -a mailer=(mail)
        # Non-root users send the mail through sudo.
        (( EUID != 0 )) && mailer=(sudo mail)

        printf '\033[1;31m%s\033[0m\n' "redis数据占用内存过高,当前redis数据占用内存${used_internal_memory}"
        printf '%s\n' "数据占用内存过高,当前数据占用内存${used_internal_memory}" \
            | "${mailer[@]}" -s "redis告警触发${time:-}" "${mail_to}"
    fi
}

###############################################################################################################################
# Entry point: take the lock, run every check once, release the lock.
#cd $(dirname `/usr/sbin/lsof -p $$ | gawk '$4 =="255r"{print $NF}'`) # change to the directory containing this script (needs lsof)
lock_file=/tmp/monitor_server.lock     # path of the lock file
time=$(date "+%Y-%m-%d %H:%M:%S")      # timestamp read by the checks for their banners and alarm subjects

# Take the lock; when it is still held, report it and stop.
if ! fun_lock "$lock_file" 2>/dev/null; then
    echo -e "\033[1;31m注意: 操作锁未解除,不能执行脚本!!!\033[0m" && exit 1
fi

# Run the checks in order.
for check in get_cpu_info get_mem_info get_disk_info get_redis_mem; do
    "$check"
done

# Release the lock.
fun_unlock "$lock_file"

Sugerencia: por último, programe el script como tarea periódica (por ejemplo, con cron) para que el monitoreo sea continuo.

Supongo que te gusta

Origin blog.csdn.net/weixin_44901564/article/details/111994274
Recomendado
Clasificación