天天看點

yarn 離線任務運作逾時監控告警

背景

在運作 Spark 任務時,可能因為某種異常問題導致任務沒有正常退出,而一直處於假死的異常狀態。這就需要我們通過一種機制來發現這樣的情況。

#!/bin/bash
#
# Report (and, for one designated application, kill) YARN applications that
# have been in the RUNNING state for longer than a given number of hours.
#
# Usage: <script> HOURS
#   HOURS  non-negative integer; an application is overdue when
#          Start-Time + HOURS hours is strictly earlier than "now".
#
# Applications whose `yarn application -list` line contains "stream" are
# skipped. An overdue application named "test_parquet" is killed and
# reported; any other overdue application is only reported through the SMS
# HTTP endpoint.
#
# Exit status: 2 on bad usage, otherwise 0 (per-application failures are
# logged to stderr and do not abort the scan).

# NOTE(review): host "xxxx" and the masked mobile number look like redacted
# placeholders in the original - replace with real values before deploying.
readonly ALARM_URL='http://xxxx:8080/alarm/sendSms.do?mobile=1515813****&type=0&producer=CDH'
readonly KILLABLE_APP_NAME='test_parquet'
readonly MS_PER_HOUR=3600000

#######################################
# Print the value of one "Key : value" line of a status report.
# The line is split at its FIRST colon only, so values that themselves
# contain ':' (e.g. application names) are returned intact.
# Arguments: $1 - exact key, e.g. Application-Name
# Inputs:    report text on stdin
# Outputs:   trimmed value of the first matching line; nothing if absent
#######################################
status_field() {
  awk -v field="$1" '
    {
      pos = index($0, ":")
      if (pos == 0) next
      key = substr($0, 1, pos - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", key)
      if (key != field) next
      val = substr($0, pos + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", val)
      print val
      exit
    }'
}

#######################################
# Succeed when START_MS + HOURS hours is strictly before NOW_MS.
# Arguments: $1 - start time (epoch ms), $2 - hours, $3 - now (epoch ms)
#######################################
is_overdue() {
  local start_ms=$1 hours=$2 now_ms=$3
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  (( 10#$start_ms + 10#$hours * MS_PER_HOUR < 10#$now_ms ))
}

#######################################
# Print the ids of running applications, excluding any list line that
# contains "stream".
#######################################
list_batch_app_ids() {
  yarn application -list -appStates running </dev/null \
    | awk '$1 ~ /^application_/ && !/stream/ { print $1 }'
}

#######################################
# Send one SMS alarm. The message is URL-encoded by curl so that non-ASCII
# text, spaces, '&' or '#' in an application name cannot corrupt the request.
# Arguments: $1 - message body
#######################################
send_alarm() {
  curl -sS -G --data-urlencode "body=$1" "$ALARM_URL" \
    || printf 'warn: failed to send alarm: %s\n' "$1" >&2
}

main() {
  local hours_arg=${1:-}
  if [[ ! $hours_arg =~ ^[0-9]+$ ]]; then
    printf 'Usage: %s HOURS\n' "${0##*/}" >&2
    return 2
  fi

  local now_ms
  now_ms=$(( $(date +%s) * 1000 ))
  local limit_ms=$(( 10#$hours_arg * MS_PER_HOUR ))

  local app_id report name start_ms deadline_ms msg
  while IFS= read -r app_id; do
    # Keep the report in memory; a fixed /tmp file would be shared between
    # concurrent runs and is open to symlink attacks.
    if ! report=$(yarn application -status "$app_id" </dev/null); then
      printf 'warn: cannot get status of %s\n' "$app_id" >&2
      continue
    fi

    name=$(status_field 'Application-Name' <<<"$report")
    start_ms=$(status_field 'Start-Time' <<<"$report")
    if [[ ! $start_ms =~ ^[0-9]+$ ]]; then
      printf 'warn: no usable Start-Time for %s\n' "$app_id" >&2
      continue
    fi

    deadline_ms=$(( 10#$start_ms + limit_ms ))
    printf '%s\n' "${app_id}--${name}--${start_ms}--${now_ms}--${limit_ms}"
    printf '%s\n' "Time_numdate:${deadline_ms}"
    printf '%s\n' "today:${now_ms}"

    if ! is_overdue "$start_ms" "$hours_arg" "$now_ms"; then
      printf '%s\n' "${deadline_ms} 大于 ${now_ms} 未逾時"
      continue
    fi

    msg="任務運作逾時異常,任務id:${app_id},任務名稱:${name},任務運作已超過${hours_arg}小時"
    if [[ $name == "$KILLABLE_APP_NAME" ]]; then
      # Only claim the application was killed when the kill succeeded.
      if yarn application -kill "$app_id" </dev/null; then
        msg+=",該任務為Zeppelin任務,已被kill"
      else
        printf 'warn: failed to kill %s\n' "$app_id" >&2
      fi
    fi
    send_alarm "$msg"
  done < <(list_batch_app_ids)
}

# Last command on purpose: the script's exit status is main's return value.
main "$@"
           

繼續閱讀