保障streaming job 持续运行

之前写过一篇保障单个 job 持续运行的文章,现在升级一下,对多个 job 进行保障。在 crontab 中设置每隔 10 分钟执行一次即可,具体脚本如下:

#!/bin/bash -
#===============================================================================
#         USAGE: ./monitor_analytics.sh
#   DESCRIPTION: monitor online on test
#        AUTHOR: Amos Yan
#       CREATED: 01/24/2018 11:14
#===============================================================================
# Script used to deliver the alert mail; it is invoked later as
#   sh <send_mail> "<message>"
send_mail="/home/hadoop/da/utils/aws-mail-analytics.sh"
# NOTE(review): cnt is not referenced anywhere in the visible script; kept
# unchanged in case something outside this view relies on it.
cnt=0
# Number of scan rounds still allowed; the monitoring loop decrements it and
# stops when it reaches 0.
counter=5

# Maps a job name (the pattern looked up in the process list) to the script
# that (re)starts that job.
# The original third entry interpolated $env, which this script never sets, so
# under cron it expanded to "..._analytics_-yarn-cluster.sh" and the job could
# never be restarted. Default to "test", like the sibling entries, while still
# honouring an exported env for callers that set one.
declare -A job_map=(
  ["online-app-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_app_analytics_test-yarn-cluster.sh"
  ["online-web-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_web_analytics_test-yarn-cluster.sh"
  ["online-app_hm-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_app_hm_analytics_${env:-test}-yarn-cluster.sh"
)

#######################################
# Tell whether a process whose full command line matches a pattern exists.
# pgrep never reports itself, so no "grep -v grep" filtering is needed.
# Arguments: $1 - pattern (extended regex) matched against full command lines
# Returns:   0 if at least one process matches, non-zero otherwise
#######################################
job_is_running() {
  pgrep -f -- "$1" >/dev/null 2>&1
}

# Scan every job in job_map, restarting the ones that are not running.
# Repeat until one full scan finds every job running, or until the scan
# budget held in counter is used up. After the loop, normal is 0 when the
# last scan found all jobs running and 1 otherwise.
scan_no=0
while (( counter != 0 )); do
  normal=0
  # Count scans explicitly so the message stays correct for any initial
  # value of counter (the original hard-coded "6 - counter").
  scan_no=$(( scan_no + 1 ))
  echo -e "\nThe ${scan_no} time scanning..........."

  for s in "${!job_map[@]}"; do
    if job_is_running "$s"; then
      echo -e "\033[1;32mjob $s is running. \033[0m"
      continue
    fi

    # At least one job was down during this scan, so scan again.
    normal=1
    restart_script=${job_map[$s]}
    if [[ -e "$restart_script" ]]; then
      # Run the restart script directly instead of through echo `...`, so
      # its output is not re-split and its exit status is not discarded.
      if ! sh "$restart_script"; then
        echo -e "\033[0;41mrestart script for $s exited with an error.\033[0m"
      fi
      echo -e "\033[0;41mrestart $s job.\033[0m"
      # Pause before checking the next job (presumably to give the restarted
      # job time to come up).
      sleep 10s
    else
      echo -e "\033[0;41m$s is not running and restart script ${restart_script} does not exist.\033[0m"
    fi
  done

  counter=$(( counter - 1 ))
  # Every job was running in this scan: nothing left to do.
  if (( normal == 0 )); then
    counter=0
  fi
done

# Final verdict: normal is non-zero when the last scan still found a job that
# was not running. In that case send an alert mail and exit 1; otherwise
# report success and exit 0.
run_date=$(date '+%F %T %:::z')
if (( normal == 0 )); then
    echo "all streaming job is running!"
    exit 0
fi

echo "any streaming job restart failed!!!"
message="Error!!! streaming job restart failed!!! date:$run_date"
sh "${send_mail}" "${message}"
exit 1

猜你喜欢

转载自blog.csdn.net/zhouyan8603/article/details/81513789
job