#!/usr/local/python
#-*- coding: utf-8 -*-
import io
import os
import re
import sys
import time

import ip_location

# Original author's note: define a time class that selects the time window to
# analyse; if no window is specified, the whole log is analysed.
class TimeParser(object):
    """Decide whether a log line's timestamp lies inside an analysis window.

    Args:
        re_time: Regular-expression pattern matching the timestamp text
            inside a log line.
        str_time: ``time.strptime`` format used to parse both the matched
            text and the two ``period`` bounds.
        period: ``(start, end)`` pair of timestamp strings in ``str_time``
            format.  When it is ``None`` or empty, every line that carries a
            parsable timestamp counts as inside the window.

    Raises:
        ValueError: If a ``period`` bound does not match ``str_time``.
    """

    def __init__(self, re_time, str_time, period):
        self.__re_time = re.compile(re_time)
        self.__str_time = str_time
        self.__period = period
        # The bounds never change, so convert them once here instead of
        # re-parsing both strings for every log line.
        if period:
            self.__start = self.__to_epoch(period[0])
            self.__end = self.__to_epoch(period[1])
        else:
            self.__start = None
            self.__end = None

    def __to_epoch(self, text):
        """Convert a timestamp string in ``str_time`` format to epoch seconds."""
        return time.mktime(time.strptime(text, self.__str_time))

    def __get(self, line):
        """Return the timestamp found in *line* as epoch seconds.

        Raises:
            AttributeError: If *line* contains no text matching ``re_time``.
            ValueError: If the matched text does not parse with ``str_time``.
        """
        stamp = self.__re_time.search(line).group(0)
        return self.__to_epoch(stamp)

    def inPeriod(self, line):
        """Return True when the timestamp of *line* is strictly inside the window.

        Both bounds are exclusive.  Raises ``AttributeError`` when the line
        has no timestamp; callers use that to skip such lines.
        """
        moment = self.__get(line)
        if self.__start is None:
            return True
        return self.__start < moment < self.__end


def _to_text(value):
    """Return byte strings decoded as UTF-8; pass anything else through."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def _write_report(path, header, rule, rows):
    """Write header, rule, the rows and a closing rule to *path* as UTF-8."""
    # ``with`` guarantees the file is closed even if a write fails.
    with io.open(path, 'w', encoding='utf-8') as out:
        out.write(header)
        out.write(rule)
        out.writelines(rows)
        out.write(rule)


class ParseLog(object):
    """Aggregate per-IP and per-URL traffic from an access log.

    Args:
        file: Path of the log file.  A base name of the form
            ``<domain>_<time>_<suffix>`` fills ``domain``, ``parsetime`` and
            ``suffix``; otherwise ``domain`` is the base name up to the first
            dot and ``parsetime`` is ``"unknown time"``.
        re_time: Timestamp regex handed to ``TimeParser``.
        str_time: ``strptime`` format handed to ``TimeParser``.
        period: ``(start, end)`` analysis window handed to ``TimeParser``.

    Attributes filled by ``Count``:
        ip_dict: ``{ip: {'number': hits, 'traffic': bytes}}``.
        url_dict: ``{url: bytes}``.
        total_traffic: List with the byte count of every counted line.
    """

    # Three digits (the status code), a space, then the size.  ``[1-9]\d*``
    # keeps zero sizes out like the original pattern did, but also accepts
    # single-digit sizes and guarantees that only digits reach ``int()``.
    _TRAFFIC_RE = re.compile(r'\d{3} ([1-9]\d*)')
    # "GET " up to the last space on the line; the second whitespace-separated
    # token of the match is the requested URL.  The trailing escaped space is
    # restored: in the mangled source the literal ended in a bare backslash.
    _URL_RE = re.compile(r'GET\ .*\.*\ ')

    _IP_RULE = u"-------------------------------------------------------------------------------------------------\n"
    _URL_RULE = u"-----------------------------------------------------------------------------------------\n"

    def __init__(self, file, re_time, str_time, period):
        # Keep the constructor arguments: Count() must work from them rather
        # than from module-level globals that happen to share their names.
        self.file = file
        self.re_time = re_time
        self.str_time = str_time
        self.period = period
        self.ip_dict = {}
        self.url_dict = {}
        self.total_traffic = []
        self._counted = False
        # Parse the base name only, so a directory prefix such as "./" or
        # one containing "_" cannot corrupt the domain.
        base = os.path.basename(file)
        pieces = base.split("_")
        if len(pieces) == 3:
            self.domain, self.parsetime, self.suffix = pieces
        else:
            self.domain = base.split(".")[0]
            self.parsetime = "unknown time"
            self.suffix = ""

    def _parse_traffic(self, line):
        """Return the byte count found in *line*, or 0 when there is none."""
        found = self._TRAFFIC_RE.search(line)
        return int(found.group(1)) if found else 0

    def _parse_url(self, line):
        """Return the URL of a GET request in *line*, or ``"unknown"``."""
        found = self._URL_RE.search(line)
        if found:
            tokens = found.group(0).split()
            if len(tokens) > 1:
                return tokens[1]
        return u"unknown"

    def Count(self):
        """Scan the log, fill the per-IP / per-URL tallies and print the total.

        Lines without a parsable timestamp, and lines outside the analysis
        window, are skipped.  The tallies are rebuilt from scratch on every
        call, so calling this more than once does not double-count.

        Returns:
            The total number of bytes counted.
        """
        window = TimeParser(self.re_time, self.str_time, self.period)
        self.ip_dict = {}
        self.url_dict = {}
        self.total_traffic = []
        # Decode leniently: access logs may contain arbitrary bytes.
        with io.open(self.file, 'r', encoding='utf-8', errors='replace') as handle:
            for index, line in enumerate(handle):
                try:
                    if not window.inPeriod(line):
                        continue
                except (AttributeError, ValueError):
                    # No timestamp on this line, or one that matches the
                    # regex but does not parse (e.g. a bogus month name).
                    continue
                ip = line.split()[0]
                traffic = self._parse_traffic(line)
                url = self._parse_url(line)
                entry = self.ip_dict.setdefault(ip, {'number': 0, 'traffic': 0})
                entry['number'] += 1
                entry['traffic'] += traffic
                self.url_dict[url] = self.url_dict.get(url, 0) + traffic
                if not index % 1000000:
                    print("have processed" + str(index) + "lines !")
                # Collect per-line sizes for the grand total.
                self.total_traffic.append(traffic)
        total = sum(self.total_traffic)
        # Print the overall traffic figure; ``//`` keeps whole megabytes.
        print("******************************************************************")
        print(self.domain + "all the traffic in" + self.parsetime + "is below:")
        print("total_traffic: %s" % str(total // 1024 // 1024) + "MB")
        self._counted = True
        return total

    def _ip_rows(self, pairs):
        """Build report rows ``ip, value, location`` for ``(ip, value)`` pairs.

        An IP whose location lookup fails is left out of the report and a
        note is written to stderr; the row is never written half-finished.
        """
        rows = []
        for ip, value in pairs:
            try:
                place = _to_text(ip_location.ip_location(ip))
                row = ip + u"\t\t" + _to_text(str(value)) + u"\t\t\t" + place + u"\n"
            except Exception as err:
                sys.stderr.write("skipping %s: location lookup failed (%s)\n" % (ip, err))
                continue
            rows.append(row)
        return rows

    def TopIp(self, number):
        """Count the log, then write the top *number* IPs to two reports.

        ``TopIpNo.txt`` ranks IPs by request count and ``iptraffic.txt`` by
        traffic in whole megabytes; each row also carries the result of
        ``ip_location.ip_location`` for that IP.
        """
        self.Count()
        limit = max(number, 0)
        # Rank by value, largest first.
        by_hits = sorted(self.ip_dict.items(),
                         key=lambda kv: kv[1]['number'], reverse=True)
        by_bytes = sorted(self.ip_dict.items(),
                          key=lambda kv: kv[1]['traffic'], reverse=True)

        hit_rows = self._ip_rows(
            (ip, stats['number']) for ip, stats in by_hits[:limit])
        _write_report(
            'TopIpNo.txt',
            u"ip位址\t\t\t拜訪數次\t\t家國/區域/都會\t\t\t營運商\n",
            self._IP_RULE,
            hit_rows)

        byte_rows = self._ip_rows(
            (ip, stats['traffic'] // 1024 // 1024) for ip, stats in by_bytes[:limit])
        _write_report(
            'iptraffic.txt',
            u"ip位址\t\t\t總量流(MB)\t\t家國/區域/都會\t\t\t營運商\n",
            self._IP_RULE,
            byte_rows)

    def TopUrl(self, number):
        """Write the top *number* URLs by traffic (whole MB) to ``urltraffic.txt``.

        Runs ``Count`` first when the log has not been counted yet, so the
        report does not depend on ``TopIp`` having been called before.
        """
        if not self._counted:
            self.Count()
        ranked = sorted(self.url_dict.items(), key=lambda kv: kv[1], reverse=True)
        rows = [
            _to_text(url).ljust(80) + _to_text(str(size // 1024 // 1024)) + u"\n"
            for url, size in ranked[:max(number, 0)]
        ]
        _write_report(
            'urltraffic.txt',
            u"Filename".ljust(75) + u"TotalTraffic(MB)" + u"\n",
            self._URL_RULE,
            rows)


# Timestamp regex and strptime format; these normally need no changes.
re_time = r'\d{2}\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2}'  # raw string: these are regex escapes, not string escapes
str_time = '%d/%b/%Y:%H:%M:%S'  # strptime layout of the text matched by re_time

# Time window to analyse, as (start, end) strings in the str_time layout.
period = ("16/Nov/2000:16:00:00", "16/Nov/2015:17:00:00")

# How many top entries each report lists.
number = 100
# Script entry point: analyse the log file named on the command line and
# write the top-IP and top-URL reports.
if __name__ == '__main__':
    # Single-argument print(...) calls are valid on both Python 2 and 3.
    if len(sys.argv) < 2:
        print('no logfile specified!')
        print("Usage: python logParser.py filename")
        time.sleep(2)
        # A missing argument is a usage error, so report failure to the shell.
        sys.exit(1)
    else:
        # Kept as a module-level name called ``file``: other code in this
        # module may look it up by that name.
        file = sys.argv[1]
        lp = ParseLog(file, re_time, str_time, period)
        print("")
        print("Start to parse the" + file + "struggling! please wait patiently!")
        print("******************************************************************")
        time.sleep(2)
        lp.TopIp(number)
        lp.TopUrl(number)