之前写过一篇保障单个 job 持续运行的文章,现在升级一下,对多个 job 进行保障:在 crontab 中设置每隔 10 分钟执行一次即可,具体脚本如下:
#!/bin/bash -
#===============================================================================
# USAGE: ./monitor_analytics.sh
# DESCRIPTION: monitor online on test
# AUTHOR: Amos Yan
# CREATED: 01/24/2018 11:14
#===============================================================================
# Path of the helper script used to send the alert e-mail
# (run later as: sh <script> "<message>").
send_mail="/home/hadoop/da/utils/aws-mail-analytics.sh"

# Number of scan passes still allowed; the scan loop counts this down to 0.
counter=5

# job_map: key   = text searched for in the process list to decide whether the
#                  job is alive,
#          value = launcher script executed when no matching process is found.
#
# The app_hm launcher path is built from $env. That variable is not set
# anywhere in this script, so without a fallback the path degenerated to
# ".../run_online_app_hm_analytics_-yarn-cluster.sh". It now defaults to
# "test", matching the two sibling entries; a value of $env supplied by the
# caller's environment is still honoured.
declare -A job_map=(
  ["online-app-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_app_analytics_test-yarn-cluster.sh"
  ["online-web-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_web_analytics_test-yarn-cluster.sh"
  ["online-app_hm-analytics-streaming-test"]="/home/hadoop/analytics/code/analytics-spark/shell/run_online_app_hm_analytics_${env:-test}-yarn-cluster.sh"
)
# Scan loop: check every job in job_map and try to relaunch the missing ones.
#
# Reads:  counter  - number of passes allowed (counted down to 0 here)
#         job_map  - process-search text -> launcher script
# Writes: normal   - 0 if every job was found running on the last pass,
#                    1 if at least one job was missing or could not be checked.
#                    The final report after the loop reads this flag.
#
# A pass that had to restart something is followed by another pass to verify
# the result; the loop stops early as soon as one pass finds everything up.
normal=0                      # defined even if the loop body never runs
total_scans=${counter:-0}     # remember the budget so passes can be numbered
while (( counter > 0 )); do
  normal=0
  printf '\nThe %d time scanning...........\n' "$(( total_scans - counter + 1 ))"

  for s in "${!job_map[@]}"; do
    launcher=${job_map[$s]}

    # pgrep -f matches against the full command line and never reports itself,
    # so no "grep -v grep" filter is needed.
    pgrep -f -- "$s" >/dev/null
    case $? in
      0)
        printf '\033[1;32mjob %s is running. \033[0m\n' "$s"
        continue
        ;;
      1)
        # No matching process: fall through to the restart logic below.
        ;;
      *)
        # pgrep itself failed (bad pattern, missing binary, ...). Do not
        # launch anything: starting a duplicate of a live job is worse than
        # reporting a failure.
        normal=1
        printf 'unable to check job %s: pgrep failed\n' "$s" >&2
        continue
        ;;
    esac

    normal=1
    if [[ ! -e "$launcher" ]]; then
      printf '\033[0;41mlauncher %s for job %s does not exist.\033[0m\n' "$launcher" "$s"
      continue
    fi

    # Run the launcher, relay its output, and only claim a restart when the
    # launcher reported success.
    if launch_output=$(sh "$launcher"); then
      printf '%s\n' "$launch_output"
      printf '\033[0;41mrestart %s job.\033[0m\n' "$s"
    else
      launch_status=$?
      printf '%s\n' "$launch_output"
      printf '\033[0;41mrestart %s job failed (launcher exit status %d).\033[0m\n' "$s" "$launch_status"
    fi
    # Pause after each launch attempt before checking the next job.
    sleep 10s
  done

  counter=$(( counter - 1 ))
  # Everything was already running on this pass: no further passes needed.
  if (( normal == 0 )); then
    counter=0
  fi
done
# Final report. `normal` is the flag left behind by the scan loop:
#   non-zero -> the last pass still found a job missing; send an alert mail
#               through the $send_mail script and exit with status 1
#   zero     -> every job was running on the last pass; exit with status 0
if (( normal != 0 )); then
  # %:::z (GNU date) prints the numeric UTC offset with only as much
  # precision as needed, e.g. "+08" or "+05:30".
  run_date=$(date '+%F %T %:::z')
  echo "any streaming job restart failed!!!"
  message="Error!!! streaming job restart failed!!! date:$run_date"
  # Do not let a broken mail helper go unnoticed: the alert is the only
  # out-of-band signal this script produces.
  if ! sh "${send_mail}" "${message}"; then
    printf 'warning: alert mail could not be sent via %s\n' "${send_mail}" >&2
  fi
  exit 1
fi

echo "all streaming job is running!"
exit 0