A simple web crawler implemented in Python

The script below (Python 2) parses a start page with HTMLParser, writes every href it finds to sub_link.txt, then downloads each sub-link and scans it line by line for a user-supplied keyword, saving matching lines to result.txt.

# coding=utf-8

import HTMLParser
import urllib
import re


# Define the HTML parser
class parseLinks(HTMLParser.HTMLParser):
    # Called for each start tag, e.g. <div id="main">
    def handle_starttag(self, tag, attrs):
        def _attr(attrlist, attrname):
            for each in attrlist:
                if attrname == each[0]:
                    return each[1]
            return None

        if tag in ('a', 'li', 'link'):  # only <a>, <li> and <link> tags
            # name is the attribute name, e.g. href, name, id, onClick, ...
            for name, value in attrs:
                if name == 'href' and value:  # keep only non-empty href values
                    #print "name_value: ", value  # value of the href attribute
                    link_file.write(value)
                    link_file.write("\n")
                    #print "title: ", _attr(attrs, 'title')
                    #print "first tag:", self.get_starttag_text()  # the full start tag, e.g. <a href=...>
                    #print "\n"

def search_info(link, key):
    # Download the page, save it to text.txt, then scan it line by line
    # for the keyword and record matching lines in result.txt.
    text = urllib.urlopen(link).read()
    file_object = open("text.txt", "w")
    file_object.write(text)
    file_object.close()

    file_read = open("text.txt", "r")
    for line in file_read:
        if re.search(key, line):
            print line
            file_result.write(line)
            file_result.write("\n")
    file_read.close()


def deep_search(link, depth):
    # Feed the page into the link parser; note that the depth argument is
    # currently unused, so only the start page itself is crawled.
    lParser.feed(urllib.urlopen(link).read())

if __name__ == "__main__":
    # Handle user input
    website = raw_input("Please enter the website to search (e.g. http://www.baidu.com): ")
    key = raw_input("Please enter the keyword to search for: ")
    print "Website to crawl:", website
    print "Got it, master. You are looking for the keyword:", key
    # Create an instance of the HTML parser
    lParser = parseLinks()
    # Collect sub-links from the start page (only one level deep; see deep_search)
    link_file = open("sub_link.txt", "w")
    deep_search(website, 10)
    link_file.close()

    # Search the collected sub-links for the keyword
    sub_link = open("sub_link.txt", "r")
    file_result = open("result.txt", "w")
    for sublink in sub_link:
        #print sublink
        if re.search("http", sublink):
            # Strip the trailing newline before opening the URL
            search_info(sublink.strip(), key)
    file_result.close()
    sub_link.close()

    lParser.close()
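
The listing above is Python 2 only (HTMLParser, urllib.urlopen, raw_input). As a rough, minimal sketch of the same link-extraction step under Python 3, assuming nothing beyond the standard-library html.parser and urllib.request modules (the LinkParser class name and its attributes are illustrative, not from the original post), it could look like this:

# coding=utf-8
# Minimal Python 3 sketch of the link-extraction step above (standard library only).
from html.parser import HTMLParser
from urllib.request import urlopen


class LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []                      # collected href values

    def handle_starttag(self, tag, attrs):
        # Same tag filter as the original parseLinks class
        if tag in ("a", "li", "link"):
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)


if __name__ == "__main__":
    website = input("Please enter the website to crawl (e.g. http://www.baidu.com): ")
    parser = LinkParser()
    # The page encoding is unknown here, so decode permissively.
    parser.feed(urlopen(website).read().decode("utf-8", errors="ignore"))
    for link in parser.links:
        if link.startswith("http"):
            print(link)

Collecting the links in a list instead of writing them to sub_link.txt makes the follow-up keyword search easier to adapt; the file-based flow of the original script works the same way in Python 3 once the I/O and print calls are updated.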