天天看點

Python 爬蟲插件

# coding:utf-8
# NOTE(review): the coding cookie only takes effect on the first two lines of
# a file; it is kept here to match the original source.

import sys
import re
import Queue    # Python 2 stdlib; renamed `queue` in Python 3
import urllib2  # kept although unused in this chunk — may be used elsewhere

# Make the parent directory importable so the project-local helpers resolve.
sys.path.append("..")

from lib.Http_Class import Http_Class
from BeautifulSoup import BeautifulSoup

#####################################
# Spider 爬蟲子產品 (crawler plugin)  #
#####################################

classSpider_module:defsetW3AScan(self,w3ascan):

self.w3ascan=w3ascan

self.result_list={}

self.q_list=Queue.Queue()

self.tmp_list=Queue.Queue()defstart(self,aa):

url="http://lucifr.com/"

print "[*] 爬蟲目标:"+url

self.result_list.update({url:0})try:whileTrue:#判斷爬蟲是否有爬過

for url inself.result_list:if self.result_list[url]==0:

self.q_list.put(url)

self.result_list[url]=1

#判斷任務隊列是否為空,如果是則直接退出

#否則處理任務

ifself.q_list.empty():print "[*] 結束爬蟲任務."

break

else:for tmp inrange(self.q_list.qsize()):

spider_url=self.q_list.get()

obj=Http_Class()try:

html=obj._do("get",spider_url)except:

self.w3ascan.log_create("url: %s Field!" % spider_url,"Spider_module")print "url: %s Field!" %spider_urlcontinuesoup=BeautifulSoup(html)

links=soup.findAll('a')for link inlinks:

_url=link.get('href').encode('utf-8')if re.match('^(javascript|:;|#|mailto)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):continue

if re.match('^(http|https)',_url):if not re.match('^'+url,_url):continue

else:ifself.result_list.has_key(url):continue

else:

rst=_url.encode('utf-8')print "[*][!] 發現新連接配接:"+rst

self.result_list.update({rst:0})else:if self.result_list.has_key(url+_url):continue

else:

rst=url+_urlprint "[*][!] 發現新連接配接:"+rst.encode('utf-8')

self.result_list.update({rst.encode('utf-8'):0})exceptException,error:print "[*] 發生異常情況,捕獲并寫入日志。"self.w3ascan.log_create("Url: %s get Url Error! Source: %s" % (url,error),"Spider_module")defsave(self):print "[*]儲存爬蟲結果"

def getPluginClass():
    """Plugin entry point: return the plugin class for the host to instantiate."""
    return Spider_module


if __name__ == "__main__":
    # Ad-hoc manual run of the crawler.
    # NOTE(review): start() reads attributes that only setW3AScan() creates,
    # so this direct call raises AttributeError as written — confirm whether
    # the host always calls setW3AScan() first before relying on this demo.
    t = Spider_module()
    t.start("aaa")