#coding:utf-8
# Module header: stdlib imports first, then make the parent directory
# importable (the project keeps shared libs one level up), then the
# project-local imports that rely on that path entry.
import sys
import re
import urllib2
import Queue

sys.path.append("..")

from lib.Http_Class import Http_Class
from BeautifulSoup import BeautifulSoup

####################################
#  Spider: crawler sub-module      #
####################################
classSpider_module:defsetW3AScan(self,w3ascan):
self.w3ascan=w3ascan
self.result_list={}
self.q_list=Queue.Queue()
self.tmp_list=Queue.Queue()defstart(self,aa):
url="http://lucifr.com/"
print "[*] 爬蟲目标:"+url
self.result_list.update({url:0})try:whileTrue:#判斷爬蟲是否有爬過
for url inself.result_list:if self.result_list[url]==0:
self.q_list.put(url)
self.result_list[url]=1
#判斷任務隊列是否為空,如果是則直接退出
#否則處理任務
ifself.q_list.empty():print "[*] 結束爬蟲任務."
break
else:for tmp inrange(self.q_list.qsize()):
spider_url=self.q_list.get()
obj=Http_Class()try:
html=obj._do("get",spider_url)except:
self.w3ascan.log_create("url: %s Field!" % spider_url,"Spider_module")print "url: %s Field!" %spider_urlcontinuesoup=BeautifulSoup(html)
links=soup.findAll('a')for link inlinks:
_url=link.get('href').encode('utf-8')if re.match('^(javascript|:;|#|mailto)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):continue
if re.match('^(http|https)',_url):if not re.match('^'+url,_url):continue
else:ifself.result_list.has_key(url):continue
else:
rst=_url.encode('utf-8')print "[*][!] 發現新連接配接:"+rst
self.result_list.update({rst:0})else:if self.result_list.has_key(url+_url):continue
else:
rst=url+_urlprint "[*][!] 發現新連接配接:"+rst.encode('utf-8')
self.result_list.update({rst.encode('utf-8'):0})exceptException,error:print "[*] 發生異常情況,捕獲并寫入日志。"self.w3ascan.log_create("Url: %s get Url Error! Source: %s" % (url,error),"Spider_module")defsave(self):print "[*]儲存爬蟲結果"
def getPluginClass():
    """Plugin-loader entry point: return the plugin class (not an instance)."""
    return Spider_module


if __name__ == "__main__":
    # Standalone smoke test.
    t = Spider_module()
    # BUG FIX: start() reads self.result_list / self.w3ascan, which only
    # setW3AScan() creates -- the original crashed with AttributeError
    # when this file was run directly.
    # NOTE(review): with ``None`` as the scanner, log_create() calls on
    # fetch errors will still fail; supply a real w3ascan object for
    # full functionality.
    t.setW3AScan(None)
    t.start("aaa")