天天看點

python 爬蟲爬取騰訊新聞科技類的企鵝智酷系列(1)

廢話不多說,直接貼代碼。主要採用 BeautifulSoup 編寫(代碼基於 Python 2)。

#coding:utf8

import io
import os
import urllib
import urllib2

from bs4 import BeautifulSoup

# Running count of index entries handled by gethref; i + 1 is the number
# prefixed to each title.
i = 0

# Index into list_a that gettext uses to pick the output file name.
j = 0

# Numbered article titles ("<n>、<title>"), appended by gethref, one per
# list item it processes.
list_a = []

def gettext(href):
    """Download one article and append its paragraphs to a per-article file.

    The output file is ``<title>.txt`` where the title is ``list_a[j]``.
    ``j`` is advanced exactly once per call so that consecutive calls pair
    up with consecutive titles in ``list_a``.  Each ``<p>`` inside the first
    ``<div class="content">`` is written as ``" " + text + " \\n"``,
    encoded as UTF-8.

    If the page has no ``<div class="content">`` nothing is written, but
    ``j`` is still advanced so later articles keep their own titles.

    Args:
        href: URL of the article page to fetch.

    Raises:
        IndexError: if ``list_a`` has no entry at index ``j``.
    """
    global j, list_a

    # Claim this article's title once per call.  Incrementing inside the
    # paragraph loop would scatter paragraphs over different files and run
    # past the end of list_a.
    title = list_a[j]
    j += 1
    if isinstance(title, bytes):
        # Byte-string titles are assumed to be UTF-8 (the source encoding).
        title = title.decode("utf-8")

    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")

    content_divs = soup.find_all("div", class_="content")
    if not content_divs:
        # Page layout differs from what we expect; skip this article rather
        # than abort the whole crawl.
        return

    paragraphs = content_divs[0].find_all("p")

    # Open once, as text with an explicit encoding: get_text() returns
    # unicode, which a byte-mode file cannot accept for non-ASCII content.
    # The context manager guarantees the handle is closed.
    with io.open(u"%s.txt" % title, "a", encoding="utf-8") as fp:
        for p in paragraphs:
            fp.write(u" ")
            fp.write(p.get_text())
            fp.write(u" \n")

def gethref(url):
    """Scrape the article index at *url* and save every listed article.

    For each ``<li>`` in the first ``<ul class="row1">`` this:

    * appends the numbered title (``"<n>、<title>"``) to ``list_a``;
    * writes the number, title, summary and link to ``AllTitle.txt``
      (UTF-8, truncated on each run);
    * calls ``gettext`` on the link to save the article body.

    The numbering continues from the module-level counter ``i``, which is
    advanced once per list item.

    Args:
        url: URL of the index page listing the articles.

    Raises:
        IndexError: if the page contains no ``<ul class="row1">``.
    """
    global i, list_a

    # Fetch and parse before touching AllTitle.txt so a failed download
    # does not leave a truncated index behind.
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    items = soup.find_all("ul", class_="row1")[0].find_all("li")

    # Text mode with an explicit encoding: the scraped strings are unicode
    # and contain non-ASCII characters.  The context manager closes the
    # file even if an article fails part-way through.
    with io.open("AllTitle.txt", "w+", encoding="utf-8") as fp:
        for lia in items:
            i += 1
            title = lia.h3.get_text()
            href = lia.a.get('href')

            # Unicode literals throughout: mixing a non-ASCII byte string
            # with unicode text triggers an implicit ASCII decode error.
            list_a.append(u"%s、" % i + title)

            # Write the title, summary and link in a fixed layout.
            fp.write(u"".join([
                u"%s、" % i,
                u"标題:",
                title,
                u"\n 簡介:",
                lia.p.get_text(),
                u"\n 連結:",
                href,
                u"\n",
            ]))

            gettext(href)

if __name__ == "__main__":
    # Script entry point: crawl the index page, then report completion.
    url = "http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    print("All Is OK!")