廢話不多說，直接貼代碼，主要採用 BeautifulSoup 寫的。
#coding:utf8
import io
import os
import urllib
import urllib2

from bs4 import BeautifulSoup
# Shared scraper state, mutated by gethref() and gettext():
#   i      - number of index entries gethref() has numbered so far
#   j      - position in list_a of the title the next gettext() call will use
#   list_a - numbered article titles, appended by gethref() in page order
i = j = 0
list_a = []
def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The page at ``href`` is parsed as GB18030.  Every ``<p>`` found inside
    the first ``<div class="content">`` is appended, one per line, to the
    file ``"<title>.txt"`` where ``<title>`` is ``list_a[j]``.  The
    module-level cursor ``j`` is advanced by one on every call so that
    the next call picks up the next title.

    If the page has no ``div.content`` or that div has no paragraphs,
    nothing is written and no file is created.

    Args:
        href: URL of the article page to fetch.

    Raises:
        IndexError: if ``list_a`` has no entry at position ``j``.
    """
    global j

    title = list_a[j]
    # Consume the title before doing any network or parsing work, so a page
    # that fails or has no body cannot shift later articles onto the wrong
    # title.
    j += 1
    if isinstance(title, bytes):
        # Tolerate byte-string titles; the source file is declared UTF-8.
        title = title.decode("utf-8")

    response = urllib.urlopen(href)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page, from_encoding="gb18030")
    containers = soup.find_all("div", class_="content")
    if not containers:
        return
    paragraphs = containers[0].find_all("p")
    if not paragraphs:
        return

    # Open the output once (not once per paragraph) and let the context
    # manager close it.  io.open with an explicit encoding accepts the
    # unicode text BeautifulSoup returns instead of pushing it through the
    # interpreter's default ASCII codec.
    with io.open(u"%s.txt" % title, "a", encoding="utf-8") as fp:
        for p in paragraphs:
            fp.write(u" " + p.get_text() + u" \n")
def gethref(url):  # collect every article link from the index page
    """Scrape the index page, record each article, and download its text.

    The page at ``url`` is parsed as GB18030 and the ``<li>`` children of
    the first ``<ul class="row1">`` are walked in order.  For each entry:

    * a numbered title (``"<n>、<h3 text>"``) is appended to ``list_a``;
    * the number, title, summary (``<p>`` text) and link (``<a href>``)
      are written to ``AllTitle.txt`` as UTF-8;
    * ``gettext`` is called with the link to save the article body.

    The module-level counter ``i`` holds the number of entries recorded.
    ``AllTitle.txt`` is truncated on every call.  Entries lacking an
    ``<h3>`` or a link are skipped; a missing ``<p>`` yields an empty
    summary.

    Args:
        url: URL of the index page listing the articles.
    """
    global i

    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page, from_encoding="gb18030")
    lists = soup.find_all("ul", class_="row1")
    items = lists[0].find_all("li") if lists else []

    # Unicode literals plus an explicit UTF-8 text file keep the Chinese
    # labels and the scraped text in one string type, so nothing is
    # implicitly coerced through the ASCII codec.  The context manager
    # guarantees the file is closed even if an article download fails.
    with io.open("AllTitle.txt", "w+", encoding="utf-8") as fp:
        for lia in items:
            heading = lia.h3
            anchor = lia.a
            href = anchor.get("href") if anchor is not None else None
            if heading is None or href is None:
                # No title to name the output file or no link to fetch:
                # skip before touching i/list_a so numbering stays aligned
                # with the titles gettext() consumes.
                continue

            title = heading.get_text()
            summary = lia.p.get_text() if lia.p is not None else u""

            i += 1
            prefix = u"%s、" % i
            list_a.append(prefix + title)

            # Write the title, summary and link in a fixed layout.
            fp.write(u"".join((
                prefix,
                u"标題:", title,
                u"\n 簡介:", summary,
                u"\n 連結:", href,
                u"\n",
            )))

            gettext(href)
if __name__ == "__main__":
    # Index page that lists the articles to download.
    url = "http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    # Parenthesised form prints the same text under the Python 2 print statement.
    print("All Is OK!")