背景
在運行 Spark 任務時,可能因為某種異常問題導致任務沒有正常退出,一直處於假死的異常狀態。這就需要我們通過一種機制來發現這樣的情況。
#!/bin/bash
#
# Watchdog for long-running YARN applications.
#
# Usage: <script> HOURS
#   HOURS  non-negative integer. An application is treated as timed out when
#          its start time plus HOURS hours is earlier than the current time.
#
# What it does:
#   1. Lists applications in the RUNNING state and drops every listing line
#      that contains the text "stream".
#   2. Reads Application-Name and Start-Time (epoch milliseconds) from
#      `yarn application -status` for each remaining application.
#   3. For a timed-out application named "test_parquet": kills it and sends
#      an alarm. For any other timed-out application: only sends an alarm.
#
# Exit status: 2 on a usage error, 1 when the application list cannot be
# fetched, 0 otherwise.

set -uo pipefail

readonly ALARM_URL="http://xxxx:8080/alarm/sendSms.do"
readonly ALARM_MOBILE="1515813****"
# Only applications with exactly this name are killed on timeout.
readonly KILLABLE_APP_NAME="test_parquet"

# Print a warning on stderr.
warn() {
  printf 'warn: %s\n' "$*" >&2
}

# Print the value of the first "KEY : value" line of the report on stdin.
# Everything after the first colon is kept, so values that themselves contain
# a colon are not truncated. Surrounding blanks are stripped.
#   $1 - exact key name, e.g. "Start-Time"
report_field() {
  awk -v key="$1" '
    {
      sep = index($0, ":")
      if (sep == 0) next
      k = substr($0, 1, sep - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", k)
      if (k != key) next
      v = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", v)
      print v
      exit
    }'
}

# Send one alarm message through the SMS endpoint.
# The query string is URL-encoded by curl so that non-ASCII text, spaces, "&"
# and similar characters in the message cannot corrupt the request.
#   $1 - message body
send_alarm() {
  curl -sS -G "$ALARM_URL" \
    --data-urlencode "mobile=${ALARM_MOBILE}" \
    --data-urlencode "type=0" \
    --data-urlencode "producer=CDH" \
    --data-urlencode "body=$1" </dev/null \
    || warn "failed to send alarm: $1"
}

# Check a single application and act on it if it has timed out.
#   $1 - application id
#   $2 - current time, epoch milliseconds
#   $3 - allowed run time, milliseconds
#   $4 - allowed run time, hours (used in the alarm text only)
check_application() {
  local app_id=$1 now_ms=$2 limit_ms=$3 hours=$4
  local report name start_ms deadline_ms

  if ! report=$(yarn application -status "$app_id" </dev/null); then
    warn "cannot read status of ${app_id}, skipping"
    return 0
  fi

  name=$(report_field "Application-Name" <<<"$report")
  start_ms=$(report_field "Start-Time" <<<"$report")
  if [[ ! "$start_ms" =~ ^[0-9]+$ ]]; then
    warn "no usable Start-Time for ${app_id}, skipping"
    return 0
  fi

  # 10# forces base 10 so a value with leading zeros is not read as octal.
  deadline_ms=$(( 10#$start_ms + limit_ms ))
  printf '%s--%s--%s--%s--%s\n' \
    "$app_id" "$name" "$start_ms" "$now_ms" "$limit_ms"
  printf 'Time_numdate:%s\n' "$deadline_ms"
  printf 'today:%s\n' "$now_ms"

  if (( deadline_ms >= now_ms )); then
    printf '%s\n' "${deadline_ms} 大于 ${now_ms} 未逾時"
    return 0
  fi

  local alarm="任務運作逾時異常,任務id:${app_id},任務名稱:${name},任務運作已超過${hours}小時"
  if [[ "$name" == "$KILLABLE_APP_NAME" ]]; then
    if yarn application -kill "$app_id" </dev/null; then
      alarm="${alarm},該任務為Zeppelin任務,已被kill"
    else
      # Do not claim the application was killed when the kill failed.
      warn "failed to kill ${app_id}"
    fi
  fi
  send_alarm "$alarm"
}

main() {
  local hours=${1:-}
  if [[ ! "$hours" =~ ^[0-9]+$ ]]; then
    printf 'Usage: %s HOURS (non-negative integer)\n' "${0##*/}" >&2
    exit 2
  fi
  hours=$(( 10#$hours ))

  local now_ms limit_ms listing app_id
  now_ms=$(( $(date +%s) * 1000 ))
  limit_ms=$(( hours * 60 * 60 * 1000 ))

  if ! listing=$(yarn application -list -appStates running </dev/null); then
    warn "yarn application -list failed"
    exit 1
  fi

  # The id list is read on fd 3 so that commands inside the loop cannot
  # consume it from stdin.
  while IFS= read -r app_id <&3; do
    [[ -n "$app_id" ]] || continue
    check_application "$app_id" "$now_ms" "$limit_ms" "$hours"
  done 3< <(awk '/application_/ && !/stream/ { print $1 }' <<<"$listing")
}

main "$@"