天天看点

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html><head><meta http-equiv="Cont

下面是我们以前使用过的脚本,用于对 HBase 和 HDFS 进行健康检查,并在 HDFS 剩余容量不足时告警,简单易用。

1.针对hadoop2的脚本:

#!/bin/bash
#
# Health check for HBase and HDFS (Hadoop 2 variant): settings section.
# Defines the exit codes, the thresholds and the hosts used by the checks
# that follow.

# Absolute path of the directory containing this script. Quoted so that
# paths containing spaces survive. Not referenced in the visible part of
# the script.
bin=$(cd -- "$(dirname -- "$0")" && pwd)

# Exit codes returned by the script.
# NOTE(review): these look like the Nagios plugin return codes; confirm
# against the monitoring system that consumes them.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# NOTE(review): presumably sourced so that hbase/hadoop are on PATH when the
# check is launched by a monitoring agent; confirm.
source /etc/profile

# Thresholds (integer percent of DFS capacity remaining).
DFS_REMAINING_WARNING=15
DFS_REMAINING_CRITICAL=5

# Extended regex; any line of the hbck/fsck report matching it marks the
# cluster as abnormal.
ABNORMAL_QUERY="INCONSISTENT|CORRUPT|FAILED|Exception"

# Hosts serving the web interfaces queried by the checks.
HADOOP_WEB_INTERFACE=h001.hadoop
HBASE_WEB_INTERFACE=h008.hadoop

# hbck and fsck report
#
# Runs `hbase hbck` and `hadoop fsck /apps/hbase`, appends their output to
# the log file, and looks for anomaly markers (ABNORMAL_QUERY).
#
# Only the output of THIS run is scanned. The log file is append-only, so
# scanning the whole file would keep reporting an anomaly from any earlier
# run forever.
# NOTE(review): only stdout of the two commands is captured, exactly as
# before; confirm whether their stderr should be scanned as well.
output=/var/log/cluster-status

report=$(
  hbase hbck
  hadoop fsck /apps/hbase
)
printf '%s\n' "$report" >> "$output"

# check report
# grep -c prints 0 (and returns 1) when nothing matches; only the printed
# count is used.
count=$(grep -Ec -- "$ABNORMAL_QUERY" <<< "$report")

# String comparison on purpose: an empty or garbled count must not be
# mistaken for "0 anomalies".
if [[ "$count" == "0" ]]; then
  echo "[OK] Cluster is healthy." >> "$output"
else
  echo "[ABNORMAL] Cluster is abnormal!" >> "$output"

  # Get the last matching entry in the report
  last_entry=$(grep -E -- "$ABNORMAL_QUERY" <<< "$report" | tail -n 1)

  # Status line for the caller: number of matches and the latest one.
  echo "($count) $last_entry"
  exit "$STATE_CRITICAL"
fi

# HDFS usage
#
# Reads PercentRemaining from the NameNode JMX endpoint, logs it, and maps
# it to an exit status:
#   not a number                     -> STATE_UNKNOWN
#   below DFS_REMAINING_CRITICAL     -> STATE_CRITICAL
#   below DFS_REMAINING_WARNING      -> STATE_WARNING
#   otherwise                        -> STATE_OK
#
# The URL is quoted because it contains '?', which the shell would otherwise
# treat as a glob character. Only the first number after "PercentRemaining"
# is taken, and an integer value without a decimal point is accepted too.
dfs_remaining=$(
  curl -s "http://${HADOOP_WEB_INTERFACE}:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo" \
    | grep -Eo "PercentRemaining.*" \
    | grep -Eo "[0-9]+(\.[0-9]+)?" \
    | head -n 1
)

dfs_remaining_word="DFS Remaining%: ${dfs_remaining}%"
echo "$dfs_remaining_word" >> "$output"

# check HDFS usage
# Keep only the integer part; the thresholds are whole percents.
dfs_remaining=${dfs_remaining%%.*}

if [[ ! "$dfs_remaining" =~ ^[0-9]+$ ]]; then
  # curl failed or the response had no usable number: do not report OK.
  echo "DFS remaining unknown. $dfs_remaining_word"
  exit_status=$STATE_UNKNOWN
elif [ "$dfs_remaining" -lt "$DFS_REMAINING_CRITICAL" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_CRITICAL
elif [ "$dfs_remaining" -lt "$DFS_REMAINING_WARNING" ]; then
  echo "Low DFS space. $dfs_remaining_word"
  exit_status=$STATE_WARNING
else
  echo "HBase check OK - DFS and HBase healthy. $dfs_remaining_word"
  exit_status=$STATE_OK
fi

exit "$exit_status"

2.针对hadoop1的脚本:

# Hadoop 1 variant: settings and report commands.
#
# The value below is a placeholder (it reads "the externally reachable IP of
# the Hadoop NameNode"); replace it with the real address before use.
HADOOP_WEB_INTERFACE=hadoop的Namenode对外接口ip

# Log file that receives the hbck/fsck output.
output=/data/logs/cluster-status

# Paths are quoted so that installation directories containing spaces work.
"$HBASE_HOME/bin/hbase" hbck >> "$output"
"$HADOOP_HOME/bin/hadoop" fsck /hbase >> "$output"

# Check RegionServer Status
#
# Scrapes the master-status page: takes the section between the
# "Dead Region Servers" and "Regions in Transition" headings and extracts
# the text of every link in it (one server per line).
# NOTE(review): the master-status page is fetched from HADOOP_WEB_INTERFACE;
# confirm that the HBase master really runs on that host.
dead_region_servers=$(
  curl -s "http://${HADOOP_WEB_INTERFACE}:60010/master-status" \
    | grep -A 500 "Dead Region Servers" \
    | grep -B 500 "Regions in Transition" \
    | grep -Eo 'target="_blank">.*</a>' \
    | awk -F">" '{print $2}' \
    | awk -F"<" '{print $1}'
)

# Quoted test: with several dead servers the unquoted form expands to
# multiple words and `[` fails with a syntax error.
if [[ -z "$dead_region_servers" ]]; then
  echo "[OK] All RegionServers is healthy."
  echo "[OK] All RegionServers is healthy." >> "$output"
else
  echo "[ABNORMAL] the dead regionserver list:" >> "$output"
  # Intentionally unquoted: joins the server names onto a single log line.
  # shellcheck disable=SC2086
  echo $dead_region_servers >> "$output"
fi

# HDFS usage (Hadoop 1): extract the number from the "DFS Remaining%" entry
# of the NameNode's dfshealth.jsp page.
dfs_remaining=$(
  curl -s "http://${HADOOP_WEB_INTERFACE}:50070/dfshealth.jsp" \
    | grep -Eo "DFS Remaining%.*%" \
    | grep -Eo "[0-9]*\.[0-9]*"
)

继续阅读