天天看點

python分析nginx日志_資訊、分析-統計nginx日志的python實作 -by小雨

#!/usr/local/python
# -*- coding: utf-8 -*-

import io
import os
import re
import sys
import time

import ip_location

# Defines a time class that selects the period to analyse; when no period is
# given, the whole log is analysed.

class TimeParser(object):
    """Tell whether the timestamp found in a log line lies inside a window.

    Args:
        re_time: Regular expression that locates the timestamp text in a line.
        str_time: ``time.strptime`` format of that timestamp text.
        period: ``(start, end)`` pair of timestamp strings written in
            ``str_time`` format.  A false value (``None`` or ``()``) disables
            the window, so every line with a parseable timestamp is accepted.

    Raises:
        ValueError: If a bound in ``period`` does not match ``str_time``.
    """

    def __init__(self, re_time, str_time, period):
        self.__re_time = re.compile(re_time)
        self.__str_time = str_time
        self.__period = period
        # The bounds never change, so convert them once here instead of
        # re-parsing both strings for every line of the log.
        if period:
            self.__start = self.__to_epoch(period[0])
            self.__end = self.__to_epoch(period[1])
        else:
            self.__start = self.__end = None

    def __to_epoch(self, text):
        """Convert timestamp text in ``str_time`` format to epoch seconds."""
        return time.mktime(time.strptime(text, self.__str_time))

    def __get(self, line):
        """Return the epoch seconds of the first timestamp in ``line``.

        Returns ``None`` when the line holds no timestamp or when the matched
        text cannot be converted with ``str_time``.
        """
        found = self.__re_time.search(line)
        if found is None:
            return None
        try:
            return self.__to_epoch(found.group(0))
        except (ValueError, OverflowError):
            # The regex is looser than strptime (e.g. any three word
            # characters pass as a month), so a match may still be invalid.
            return None

    def inPeriod(self, line):
        """Return True when the line's timestamp is strictly inside the window.

        Both bounds are exclusive.  Lines without a usable timestamp yield
        False instead of raising, so callers can simply skip them.
        """
        stamp = self.__get(line)
        if stamp is None:
            return False
        if self.__start is None:
            return True
        return self.__start < stamp < self.__end


class ParseLog(object):
    """Collect per-IP and per-URL statistics from an access log.

    Args:
        file: Path of the log file.  A base name shaped like
            ``domain_time_suffix`` is split into ``domain``, ``parsetime``
            and ``suffix``; any other name falls back to the text before the
            first dot as ``domain``, ``"unknown time"`` and an empty suffix.
        re_time: Regular expression locating the timestamp in a line.
        str_time: ``time.strptime`` format of that timestamp.
        period: ``(start, end)`` window handed to ``TimeParser``.

    Attributes:
        ip_dict: Maps client IP to ``{'number': requests, 'traffic': bytes}``.
        url_dict: Maps requested URL to bytes served.
        total_traffic: Byte count of every line accepted by the last Count().
    """

    # A three-digit status code followed by a non-zero byte count.
    _TRAFFIC_RE = re.compile(r'\d{3} ([1-9]\d*)')
    # The token following "GET" is taken as the requested URL.
    _URL_RE = re.compile(r'GET\s+(\S+)')
    _BYTES_PER_MB = 1024 * 1024
    _PROGRESS_EVERY = 1000000
    _IP_RULE = u"-------------------------------------------------------------------------------------------------\n"
    _URL_RULE = u"-----------------------------------------------------------------------------------------\n"

    def __init__(self, file, re_time, str_time, period):
        self.ip_dict = {}
        self.url_dict = {}
        self.total_traffic = []
        # Keep the arguments so Count() does not depend on module globals.
        self._file = file
        self._re_time = re_time
        self._str_time = str_time
        self._period = period
        self._counted = False

        # Use the base name so directories in the path cannot leak into the
        # domain or confuse the underscore split.
        base_name = os.path.basename(file)
        parts = base_name.split("_")
        if len(parts) == 3:
            self.domain, self.parsetime, self.suffix = parts
        else:
            self.domain = base_name.split(".")[0]
            self.parsetime = "unknown time"
            self.suffix = ""

    @classmethod
    def _parse_traffic(cls, line):
        """Return the byte count found in ``line``, or 0 when there is none."""
        found = cls._TRAFFIC_RE.search(line)
        return int(found.group(1)) if found else 0

    @classmethod
    def _parse_url(cls, line):
        """Return the URL of a GET request in ``line``, or ``"unknown"``."""
        found = cls._URL_RE.search(line)
        return found.group(1) if found else u"unknown"

    @staticmethod
    def _top(pairs, number):
        """Return the ``number`` pairs with the largest second element."""
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        # A negative count must produce no rows rather than slice from the end.
        return ranked[:max(number, 0)]

    @staticmethod
    def _locate(ip):
        """Return the text reported by ``ip_location`` for ``ip``.

        The lookup is best-effort: any failure yields ``"unknown"`` so the
        statistics for that address are still written.
        """
        try:
            where = ip_location.ip_location(ip)
        except Exception:
            return u"unknown"
        if isinstance(where, bytes):
            # NOTE(review): assumes the helper returns UTF-8 when it returns
            # bytes -- confirm against the ip_location module.
            return where.decode("utf-8", "replace")
        return where

    def Count(self):
        """Scan the log, fill ``ip_dict`` / ``url_dict`` and print the total.

        Only lines whose timestamp lies inside the configured window are
        counted; lines without a usable timestamp are skipped.  Each call
        starts from empty statistics, so repeated calls do not double-count.
        """
        time_filter = TimeParser(self._re_time, self._str_time, self._period)
        self.ip_dict.clear()
        self.url_dict.clear()
        self.total_traffic = []

        # errors='replace' keeps a stray undecodable byte from aborting the run.
        with io.open(self._file, 'r', encoding='utf-8', errors='replace') as log:
            for line_no, line in enumerate(log, 1):
                if line_no % self._PROGRESS_EVERY == 0:
                    print("have processed " + str(line_no) + " lines !")
                if not time_filter.inPeriod(line):
                    continue

                ip = line.split()[0]
                traffic = self._parse_traffic(line)
                url = self._parse_url(line)

                stats = self.ip_dict.setdefault(ip, {'number': 0, 'traffic': 0})
                stats['number'] += 1
                stats['traffic'] += traffic
                self.url_dict[url] = self.url_dict.get(url, 0) + traffic
                self.total_traffic.append(traffic)

        self._counted = True
        total = sum(self.total_traffic)
        print("******************************************************************")
        print(self.domain + " all the traffic in " + self.parsetime + " is below:")
        print("total_traffic: %sMB" % (total // self._BYTES_PER_MB))

    def TopIp(self, number):
        """Write the top ``number`` IPs by requests and by traffic.

        Re-runs Count(), then writes ``TopIpNo.txt`` (ranked by request count)
        and ``iptraffic.txt`` (ranked by traffic, in whole MB) to the current
        directory.  Each row also carries the location text for the address.
        """
        self.Count()
        by_requests = self._top(
            ((ip, stats['number']) for ip, stats in self.ip_dict.items()), number)
        by_traffic = self._top(
            ((ip, stats['traffic']) for ip, stats in self.ip_dict.items()), number)

        with io.open('TopIpNo.txt', 'w', encoding='utf-8') as report:
            report.write(u"ip位址\t\t\t拜訪數次\t\t家國/區域/都會\t\t\t營運商\n")
            report.write(self._IP_RULE)
            for ip, requests in by_requests:
                report.write(u"%s\t\t%s\t\t\t%s\n" % (ip, requests, self._locate(ip)))
            report.write(self._IP_RULE)

        with io.open('iptraffic.txt', 'w', encoding='utf-8') as report:
            report.write(u"ip位址\t\t\t總量流(MB)\t\t家國/區域/都會\t\t\t營運商\n")
            report.write(self._IP_RULE)
            for ip, traffic in by_traffic:
                # Build the whole row before writing so a failed lookup can
                # never leave a half-written line behind.
                report.write(u"%s\t\t%s\t\t\t%s\n" % (
                    ip, traffic // self._BYTES_PER_MB, self._locate(ip)))
            report.write(self._IP_RULE)

    def TopUrl(self, number):
        """Write the top ``number`` URLs by traffic to ``urltraffic.txt``.

        Runs Count() first when the log has not been scanned yet, so the
        report is not silently empty when this is called before TopIp().
        """
        if not self._counted:
            self.Count()
        ranked = self._top(self.url_dict.items(), number)

        with io.open('urltraffic.txt', 'w', encoding='utf-8') as report:
            report.write(u"Filename".ljust(75) + u"TotalTraffic(MB)" + u"\n")
            report.write(self._URL_RULE)
            for url, traffic in ranked:
                report.write(url.ljust(80) + u"%s\n" % (traffic // self._BYTES_PER_MB))
            report.write(self._URL_RULE)

# Timestamp regex and format follow; they normally do not need to be changed.

# Regular expression that picks the timestamp out of a log line, and the
# time.strptime format used to parse the matched text.
re_time = r'\d{2}\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2}'
str_time = '%d/%b/%Y:%H:%M:%S'

# Time window to analyse, as (start, end) strings in str_time format.
period = ("16/Nov/2000:16:00:00", "16/Nov/2015:17:00:00")

# How many top entries each report lists.
number = 100


def main(argv):
    """Run the analysis for the log file named in ``argv[1]``.

    Args:
        argv: Command-line arguments, program name first.

    Returns:
        0 after both reports were written, 1 when no log file was given.
    """
    # Published at module level as well as passed explicitly, because code
    # elsewhere in this file may look the path up as a global.
    global file

    if len(argv) < 2:
        sys.stderr.write("no logfile specified!\n")
        sys.stderr.write("Usage: python logParser.py filename\n")
        return 1

    file = argv[1]
    lp = ParseLog(file, re_time, str_time, period)
    print("")
    print("Start to parse the " + file + " struggling! please wait patiently!")
    print("")
    print("******************************************************************")
    lp.TopIp(number)
    lp.TopUrl(number)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))