#!/bin/bash -
#===============================================================================
#        USAGE: ./monitor_analytics.sh
#  DESCRIPTION: monitor online on test
#               Watchdog for the online analytics-spark-streaming-prod job:
#               if no matching process is found, run the restart script and
#               check again, up to ${counter} times. Exits 0 as soon as the
#               job is seen running; otherwise sends an alert mail and exits 1.
#       AUTHOR: Amos Yan
#      CREATED: 01/24/2018 11:14
#===============================================================================

# Only POSIX sh syntax is used below so the script also works when it is
# started as `sh <script>` on systems where sh is not bash.
set -u

send_mail="/utils/aws-mail-userprofile.sh"
restart_script="/shell/run_app_on_yarn-cluster.sh"
job_name="analytics-spark-streaming-prod"
counter=5

#######################################
# Print the PIDs (space separated, no trailing newline) of every process
# whose `ps -ef` line contains both ${job_name} and "online".
# Globals:   job_name (read)
# Outputs:   PID list on stdout; empty when nothing matches
# Returns:   non-zero if `ps` itself fails
#######################################
find_job_pids() {
  # Take the snapshot before filtering so the filter processes are not in it.
  ps_snapshot=$(ps -ef) || return 1

  # Skip this script ($$), its children ($3 == self) and its parent ($PPID):
  # a cron wrapper shell whose command line mentions both this script's name
  # and an "online" path would otherwise be mistaken for the job itself.
  printf '%s\n' "${ps_snapshot}" |
    awk -v self="$$" -v parent="${PPID}" -v name="${job_name}" '
      index($0, name) && /online/ && $2 != self && $2 != parent && $3 != self {
        printf "%s%s", (found++ ? " " : ""), $2
      }'
}

while [ "${counter}" -ne 0 ]; do
  job=$(find_job_pids)
  echo "${job}"

  if [ -n "${job}" ]; then
    echo "spark job analytics-spark-streaming-prod is running!"
    exit 0
  fi

  echo "spark job analytics-spark-streaming-prod will restart!!!"
  # Run the restart script directly so its output keeps its line breaks and
  # a failing exit status is at least reported.
  sh "${restart_script}" \
    || echo "restart command exited with status $?" >&2

  # 2 minutes for the restart plus 1 minute before the next check.
  sleep 180
  counter=$(( counter - 1 ))
done

# The last restart attempt has not been verified yet: check once more before
# declaring failure.
job=$(find_job_pids)
if [ -n "${job}" ]; then
  echo "${job}"
  echo "spark job analytics-spark-streaming-prod is running!"
  exit 0
fi

run_date=$(date '+%Y-%m-%d %H:%M:%S')
echo "online analytics-spark-streaming-prod restart failed!!!"
message="Error!online analytics-spark-streaming-prod restart failed!!! date:${run_date}"
sh "${send_mail}" "${message}"
exit 1
Linux crontab 设置为每10分钟检测一次(可根据自己需要改为每1分钟):
*/10 * * * * cd /home/hadoop/code/online; sh shell/monitor_analytics-spark-streaming-prod.sh > /home/hadoop/code/online/log/monitor_analytics_on_prod-yarn-cluster.log 2>&1
以上仅供demo参考