A shell script for one-command querying of Linux system resources, with monitoring and email alerts

While a server is running, its resources usually need to be monitored (CPU load, memory utilization, disk utilization, and so on) so that an alert is raised promptly and the system administrator is notified when something goes wrong. This article implements that monitoring and alerting, plus a one-command resource query, as a shell script written to suit my own needs.

Requirements:
1. Monitor CPU load and email alarm
2. Monitor memory usage and email alarm
3. Monitor disk space usage and email alarm
4. Monitor the memory occupied by the Redis service's data and email alarm

1. Email alarms require a mail server to be set up first. How to build a simple mail server was covered in a previous blog post; here is the link:
https://blog.csdn.net/weixin_44901564/article/details/111914395

2. Write the shell script.
Note: the alarm thresholds set in the script are deliberately low; they are only meant for testing the email alarms.

#!/bin/bash
#===========================================================================================
#
#	       Use: Monitor system resources
#       Author: xxx
#  Create Time: 2020/12/28
#===========================================================================================

#######################################
# Acquire an exclusive lock by creating a lock file that holds this shell's
# PID, and install a handler that removes the file when the shell exits or
# receives INT/TERM.
# Arguments:
#   $1 - path of the lock file
#   $2 - optional command; the handler runs it as "$2 <lock file>" just
#        before it removes the lock file
# Returns:
#   0 when the lock was acquired, 1 when the lock file could not be created
#   (normally because it already exists)
#######################################
function fun_lock
{
    local lock_path=$1
    local hook=${2:-}
    local quoted_path=''
    local handler=''

    # With noclobber, ">" refuses to overwrite an existing file, so creating
    # the file is the lock test. The option is switched on inside the subshell
    # only, which leaves the caller's shell options exactly as they were.
    # "$$" still expands to the PID of the main shell inside the subshell.
    if ! (set -o noclobber; echo "$$" > "${lock_path}") 2>/dev/null; then
        return 1
    fi

    # The path is expanded now (it is not available when the handler runs) and
    # shell-quoted; "$?" and "${ret}" are kept literal so that they are
    # evaluated when the handler fires, not when it is installed.
    printf -v quoted_path '%q' "${lock_path}"
    # Clearing the traps first stops the handler from running a second time
    # through the EXIT trap when it was entered via INT or TERM.
    handler='ret=$?; trap - INT TERM EXIT;'
    if [[ -n "${hook}" ]]; then
        handler+=" ${hook} ${quoted_path};"
    fi
    handler+=" rm -f -- ${quoted_path}; exit \"\${ret}\""
    trap "${handler}" INT TERM EXIT
    return 0
}

#######################################
# Release a lock: remove the lock file and drop the INT/TERM/EXIT handlers.
# Arguments:
#   $1 - path of the lock file
#######################################
function fun_unlock
{
    # Ignore the signals while the file is being removed so that the removal
    # cannot be interrupted half-way ...
    trap '' INT TERM EXIT
    rm -f -- "$1"
    # ... then restore the default dispositions; otherwise the shell would
    # keep ignoring INT and TERM for the rest of its life.
    trap - INT TERM EXIT
}

#######################################
# Print a label followed by its value, with the value in bold green.
# Backslash escapes inside either argument are interpreted (echo -e).
# Arguments:
#   $1 - label
#   $2 - value
# Outputs:
#   one line on stdout
#######################################
fun_print() {
    local label=$1
    local value=$2
    local green_on='\033[1;32m'
    local colour_off='\033[0m'

    echo -e "${label} ${green_on}${value}${colour_off}"
}

#######################################
# Extract one percentage from the CPU summary line printed by top.
# Arguments:
#   $1 - the summary line, e.g. "%Cpu(s):  1.6 us,  0.8 sy, ..., 97.6 id, ..."
#   $2 - state label to look up (us, sy, id, wa)
# Outputs:
#   the number written in front of the label; nothing when it is not found
#######################################
_cpu_state_pct() {
    # Matching by label instead of by field position keeps working when top
    # prints no space in front of a three-digit value ("ni,100.0 id") and
    # also accepts the older "99.4%id" layout.
    local pattern="([0-9]+(\.[0-9]+)?)[[:space:]%]*$2"
    if [[ $1 =~ ${pattern} ]]; then
        printf '%s' "${BASH_REMATCH[1]}"
    fi
}

#######################################
# Report CPU topology and utilisation, and mail an alarm when the idle
# percentage is below the alarm limit.
# Globals:
#   time                  (read) timestamp shown in the heading and subject
#   MONITOR_MAIL_TO       (read) alarm recipient, default "root"
#   CPU_IDLE_ALARM_BELOW  (read) alarm when idle% is below this, default 100
#                         (a demo value: any CPU activity raises the alarm)
# Outputs:
#   the report, and the alarm text when raised, on stdout
# Returns:
#   1 when the idle percentage cannot be read from top
#######################################
get_cpu_info(){
    # NOTE(review): the recipient in the published script was unreadable
    # (e-mail obfuscation); set MONITOR_MAIL_TO to the real address.
    local mail_to="${MONITOR_MAIL_TO:-root}"
    local idle_limit="${CPU_IDLE_ALARM_BELOW:-100}"
    local stamp="${time:-}"
    local cpu_num='' cpu_core='' cpu_logic_num=''
    local cpu_line='' cpu_user='' cpu_system='' cpu_idle='' cpu_iowait=''
    local cpu_usage='' message=''
    local -a mailer=(mail)

    # /proc/cpuinfo is world-readable, so no sudo is needed. Errors are
    # silenced on purpose: without the file the counters are simply empty/0.
    # "sort -u" is required because ids of different sockets can interleave.
    cpu_num=$(grep 'physical id' /proc/cpuinfo 2>/dev/null | sort -u | wc -l)  # physical CPUs
    cpu_core=$(awk -F': *' '/^cpu cores/ { print $2; exit }' /proc/cpuinfo 2>/dev/null)  # cores per CPU
    cpu_logic_num=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)           # logical CPUs

    # One sample only, so that all four percentages describe the same moment.
    # LC_ALL=C keeps "." as the decimal separator.
    cpu_line=$(LC_ALL=C top -b -n 1 | grep -m 1 'Cpu')
    cpu_user=$(_cpu_state_pct "${cpu_line}" us)     # user space
    cpu_system=$(_cpu_state_pct "${cpu_line}" sy)   # kernel space
    cpu_idle=$(_cpu_state_pct "${cpu_line}" id)     # idle
    cpu_iowait=$(_cpu_state_pct "${cpu_line}" wa)   # waiting for I/O

    echo -e "\033[1;35m#####################监控Cpu资源############### ${stamp}\033[0m"
    fun_print "物理Cpu个数:" "${cpu_num}"
    fun_print "系统Cpu核数:" "${cpu_core}"
    fun_print "逻辑Cpu个数:" "${cpu_logic_num}"

    fun_print "用户空间占用Cpu百分比:" "${cpu_user}"
    fun_print "内核空间占用Cpu百分比:" "${cpu_system}"
    fun_print "空闲CPU百分比:" "${cpu_idle}"
    fun_print "获取等待输入输出占用Cpu百分比:" "${cpu_iowait}"

    echo ""

    if [[ ! "${cpu_idle}" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        echo "get_cpu_info: could not read the idle percentage from top" >&2
        return 1
    fi

    # awk does the floating-point comparison; exit status 0 means "alarm".
    if LC_ALL=C awk -v idle="${cpu_idle}" -v limit="${idle_limit}" \
        'BEGIN { exit !(idle < limit) }'; then
        cpu_usage=$(LC_ALL=C awk -v idle="${cpu_idle}" 'BEGIN { printf "%.2f", 100 - idle }')
        message="cpu利用率过高,当前cpu利用率${cpu_usage}%"
        # Non-root users send the mail through sudo.
        if (( EUID != 0 )); then
            mailer=(sudo mail)
        fi
        echo -e "\033[1;31m${message}\033[0m"
        printf '%s\n' "${message}" | "${mailer[@]}" -s "cpu告警触发${stamp}" "${mail_to}"
    fi
}

#######################################
# Report memory and swap usage, and mail an alarm when the share of used
# memory exceeds the alarm limit.
# Globals:
#   time               (read) timestamp shown in the heading and subject
#   MONITOR_MAIL_TO    (read) alarm recipient, default "root"
#   MEM_ALARM_PERCENT  (read) alarm above this percentage, default 10
#                      (a demo value chosen to trigger the mail easily)
# Outputs:
#   the report, and the alarm text when raised, on stdout
# Returns:
#   1 when the memory figures cannot be read from free
#######################################
get_mem_info(){
    # NOTE(review): the recipient in the published script was unreadable
    # (e-mail obfuscation); set MONITOR_MAIL_TO to the real address.
    local mail_to="${MONITOR_MAIL_TO:-root}"
    local limit="${MEM_ALARM_PERCENT:-10}"
    local stamp="${time:-}"
    local human=''
    local mem_total='' mem_used='' mem_free=''
    local mem_swap='' mem_swap_used='' mem_swap_free=''
    local mem_percentage='' message=''
    local -a mailer=(mail)

    # LC_ALL=C pins the row labels to "Mem"/"Swap" whatever the system locale.
    human=$(LC_ALL=C free -h)
    # Columns 2-4 of each row: total, used, free.
    read -r mem_total mem_used mem_free \
        <<<"$(awk '/^Mem/ { print $2, $3, $4; exit }' <<<"${human}")"
    read -r mem_swap mem_swap_used mem_swap_free \
        <<<"$(awk '/^Swap/ { print $2, $3, $4; exit }' <<<"${human}")"

    echo -e "\033[1;35m#####################监控内存资源############## ${stamp}\033[0m"
    fun_print "内存总量:" "${mem_total}"
    fun_print "已经使用的内存总量:" "${mem_used}"
    fun_print "空闲的内存总量:" "${mem_free}"

    fun_print "交换分区大小:" "${mem_swap}"
    fun_print "已经使用的交换分区大小:" "${mem_swap_used}"
    fun_print "空闲的交换分区大小:" "${mem_swap_free}"

    # used / total from the MiB figures, computed in floating point so the
    # two decimals are real (no truncation of the ratio before scaling).
    mem_percentage=$(LC_ALL=C free -m \
        | LC_ALL=C awk '/^Mem/ && $2 > 0 { printf "%.2f", $3 / $2 * 100; exit }')

    echo ""

    if [[ -z "${mem_percentage}" ]]; then
        echo "get_mem_info: could not read memory usage from free" >&2
        return 1
    fi

    # awk does the floating-point comparison; exit status 0 means "alarm".
    if LC_ALL=C awk -v used="${mem_percentage}" -v limit="${limit}" \
        'BEGIN { exit !(used > limit) }'; then
        message="内存利用率过高,当前内存利用率${mem_percentage}%"
        # Non-root users send the mail through sudo.
        if (( EUID != 0 )); then
            mailer=(sudo mail)
        fi
        echo -e "\033[1;31m${message}\033[0m"
        printf '%s\n' "${message}" | "${mailer[@]}" -s "内存告警触发${stamp}" "${mail_to}"
    fi
}

#######################################
# Report the disk usage of one filesystem, and mail an alarm when its
# utilisation exceeds the alarm limit.
# Globals:
#   time                (read) timestamp shown in the heading and subject
#   MONITOR_MAIL_TO     (read) alarm recipient, default "root"
#   DISK_ALARM_PERCENT  (read) alarm above this whole-number percentage,
#                       default 10 (a demo value chosen to trigger the mail)
# Arguments:
#   $1 - optional mount point or path to inspect, default "/"
# Outputs:
#   the report, and the alarm text when raised, on stdout
# Returns:
#   1 when the utilisation cannot be read from df
#######################################
get_disk_info(){
    local mount_point="${1:-/}"
    # NOTE(review): the recipient in the published script was unreadable
    # (e-mail obfuscation); set MONITOR_MAIL_TO to the real address.
    local mail_to="${MONITOR_MAIL_TO:-root}"
    local limit="${DISK_ALARM_PERCENT:-10}"
    local stamp="${time:-}"
    local disk_total='' disk_used='' disk_avail='' disk_percentage=''
    local percent_number='' message=''
    local -a mailer=(mail)

    # Asking df for one explicit path reports that filesystem instead of
    # whichever one happens to be listed first; -P keeps every filesystem on
    # a single line even when the device name is long.
    # Columns 2-5: size, used, available, use%.
    read -r disk_total disk_used disk_avail disk_percentage \
        <<<"$(LC_ALL=C df -hP -- "${mount_point}" | awk 'NR == 2 { print $2, $3, $4, $5 }')"

    echo -e "\033[1;35m#####################监控磁盘空间############## ${stamp}\033[0m"
    fun_print "磁盘空间大小:" "${disk_total}"
    fun_print "已使用磁盘空间大小:" "${disk_used}"
    fun_print "可用磁盘空间大小:" "${disk_avail}"
    fun_print "磁盘利用率:" "${disk_percentage}"

    echo ""

    # Drop the trailing "%" to get a number that can be compared.
    percent_number=${disk_percentage%\%}
    if [[ ! "${percent_number}" =~ ^[0-9]+$ ]]; then
        echo "get_disk_info: could not read disk usage of ${mount_point} from df" >&2
        return 1
    fi

    if (( percent_number > limit )); then
        message="磁盘利用率过高,当前磁盘利用率${disk_percentage}"
        # Non-root users send the mail through sudo.
        if (( EUID != 0 )); then
            mailer=(sudo mail)
        fi
        echo -e "\033[1;31m${message}\033[0m"
        printf '%s\n' "${message}" | "${mailer[@]}" -s "磁盘告警触发${stamp}" "${mail_to}"
    fi
}

#######################################
# Report the memory statistics of a redis server, and mail an alarm when
# the memory used by its data exceeds the alarm limit.
# Globals:
#   time                   (read) timestamp shown in the heading and subject
#   MONITOR_MAIL_TO        (read) alarm recipient, default "root"
#   REDIS_MEM_ALARM_BYTES  (read) alarm above this many bytes, default 1000
#                          (a demo value chosen to trigger the mail easily)
# Arguments:
#   $1 - optional redis port, default 6378
# Outputs:
#   the selected "key:value" lines of INFO, and the alarm text when raised,
#   on stdout
# Returns:
#   1 when used_memory cannot be read from the server
#######################################
get_redis_mem(){
    local port="${1:-6378}"
    # NOTE(review): the recipient in the published script was unreadable
    # (e-mail obfuscation); set MONITOR_MAIL_TO to the real address.
    local mail_to="${MONITOR_MAIL_TO:-root}"
    local limit="${REDIS_MEM_ALARM_BYTES:-1000}"
    local stamp="${time:-}"
    local info='' key='' line=''
    local used_bytes='' used_human=''
    local -a mailer=(mail)

    # One INFO call, so that every figure comes from the same snapshot.
    info=$(redis-cli -p "${port}" info)
    # redis ends each line with CR LF; drop the carriage returns once here.
    info=${info//$'\r'/}

    echo -e "\033[1;35m#####################监控redis内存############# ${stamp}\033[0m"

    # used_memory            - bytes used by data
    # used_memory_human      - the same, human readable
    # used_memory_rss        - memory held by the redis process
    # used_memory_peak       - peak usage in bytes
    # used_memory_peak_human - the same, human readable
    # used_memory_lua        - bytes used by the Lua engine
    # mem_fragmentation_ratio- memory fragmentation ratio
    # mem_allocator          - allocator chosen at build time
    for key in used_memory used_memory_human used_memory_rss \
        used_memory_peak used_memory_peak_human used_memory_lua \
        mem_fragmentation_ratio mem_allocator; do
        # Anchoring on "key:" keeps used_memory from matching used_memory_*.
        line=$(grep -m 1 "^${key}:" <<<"${info}")
        printf '%s\n' "${line}"
        case "${key}" in
            used_memory) used_bytes=${line#*:} ;;
            used_memory_human) used_human=${line#*:} ;;
        esac
    done

    echo ""

    if [[ ! "${used_bytes}" =~ ^[0-9]+$ ]]; then
        echo "get_redis_mem: could not read used_memory from redis on port ${port}" >&2
        return 1
    fi

    # Numeric comparison of the byte count (inside "[ ]" a bare ">" would be
    # an output redirection, not a comparison).
    if (( used_bytes > limit )); then
        # Non-root users send the mail through sudo.
        if (( EUID != 0 )); then
            mailer=(sudo mail)
        fi
        echo -e "\033[1;31mredis数据占用内存过高,当前redis数据占用内存${used_human}\033[0m"
        printf '%s\n' "数据占用内存过高,当前数据占用内存${used_human}" \
            | "${mailer[@]}" -s "redis告警触发${stamp}" "${mail_to}"
    fi
}

###############################################################################################################################
#cd $(dirname `/usr/sbin/lsof -p $$ | gawk '$4 =="255r"{print $NF}'`) #进入当前脚本所在目录,需要系统装有lsof命令
# Path of the lock file that keeps two runs from overlapping.
lock_file=/tmp/monitor_server.lock
# Timestamp of this run; the check functions read it as "$time".
time=$(date "+%Y-%m-%d %H:%M:%S")

# Take the lock; stop right away when it cannot be acquired.
if ! fun_lock "${lock_file}" 2>/dev/null; then
    echo -e "\033[1;31m注意: 操作锁未解除,不能执行脚本!!!\033[0m"
    exit 1
fi

# Run the checks: CPU, memory, disk, redis.
get_cpu_info
get_mem_info
get_disk_info
get_redis_mem

# Release the lock.
fun_unlock "${lock_file}"

Tip: finally, schedule the script as a timed task (for example a cron job) so that the monitoring actually runs periodically.

Guess you like

Origin blog.csdn.net/weixin_44901564/article/details/111994274