這次帶來的是爬取一個網站多個頁面的小說,並將每本小說寫入一個 txt 文檔
- 擷取網站網址
- 爬取小說的連結
- 爬取目錄的連結
- 爬取各章小說的目錄和內容
1.網站網址
http://www.biquge.com.tw/
2.爬取小說的連結
爬取小說的連結可以擷取到每本小說,連結作為擷取目錄連結的入口
# Fetch the site's front page and collect every novel link found inside
# the element with id="main"; then de-duplicate while preserving order.
url1 = 'http://www.biquge.com.tw/'
html = requests.get(url1).content
soup = BeautifulSoup(html, 'html.parser')
article = soup.find(id="main")
# All anchors whose href points back into the site are novel links.
texts = [novel.get('href')
         for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/'))]
# Order-preserving de-duplication of the collected links.
new_text = []
for href in texts:
    if href not in new_text:
        new_text.append(href)
3.爬取目錄連結
目錄連結作為擷取每章内容的入口
url2 = 小說連結
# Download one novel's page and pull out its info block, its chapter
# catalogue, and the absolute URL of every chapter.
html = requests.get(url2).content
soup = BeautifulSoup(html, 'html.parser')
a = []
for catalogue in soup.find_all(id="list"):
    # Book metadata (title and description) lives in id="maininfo".
    timu = soup.find(id="maininfo")
    name1 = timu.find('h1').get_text()
    tm = timu.get_text()
    # Chapter catalogue as newline-separated text.
    e_cat = catalogue.get_text('\n')
    # Chapter hrefs are relative; prefix the site root to make them absolute.
    for link in catalogue.find_all(href=re.compile(".html")):
        a.append('http://www.biquge.com.tw/' + link.get('href'))
4.爬取各章小說的目錄和内容
目錄連結作為爬取各章目錄和内容的入口
finallyurl = 目錄連結
# Fetch a single chapter page and print its title and body text.
html = requests.get(finallyurl).content
soup = BeautifulSoup(html, 'html.parser')
# The chapter heading sits in <div class="bookname"><h1>...</h1>.
tit = soup.find('div', attrs={'class': 'bookname'})
title = tit.h1
# The chapter body is the element with id="content".
content = soup.find(id='content').get_text()
print title.get_text()
print content
5.完整代碼
# -*- coding:utf-8 -*-
"""Scrape novels from www.biquge.com.tw and append each book to its own .txt file.

Flow: front page -> novel pages -> chapter catalogue -> chapter content.

Fixes over the original script:
- the chapter title was written to the file as the raw Tag object
  (``<h1>...</h1>`` markup); it is now written as plain text;
- the ``reload(sys)`` / ``sys.setdefaultencoding`` hack is replaced by
  opening the output file with an explicit UTF-8 encoding;
- the retrying Session is actually used for every request (the original
  built it, fired one stray request at an unrelated NCBI URL, then used
  plain ``requests.get`` everywhere else);
- the single-element wrapper lists and manual while-index loops are gone.
"""
from bs4 import BeautifulSoup
import requests
import re

BASE_URL = 'http://www.biquge.com.tw/'
MAX_RETRIES = 20

# One session with automatic retries, shared by every request below.
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
session.mount('https://', adapter)
session.mount('http://', adapter)


def get_soup(url):
    """Download *url* through the retrying session and return parsed soup."""
    html = session.get(url).content
    return BeautifulSoup(html, 'html.parser')


def collect_novel_links():
    """Return the front page's novel links, de-duplicated, in first-seen order."""
    soup = get_soup(BASE_URL)
    article = soup.find(id="main")
    links = []
    for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/')):
        href = novel.get('href')
        if href not in links:
            links.append(href)
    return links


def scrape_chapter(chapter_url):
    """Return ``(title, body_text)`` for one chapter page."""
    soup = get_soup(chapter_url)
    # The chapter heading sits in <div class="bookname"><h1>...</h1>.
    title = soup.find('div', attrs={'class': 'bookname'}).h1.get_text()
    content = soup.find(id='content').get_text()
    return title, content


def scrape_novel(novel_url):
    """Write one novel's info, catalogue and every chapter to "<title>.txt"."""
    soup = get_soup(novel_url)
    for catalogue in soup.find_all(id="list"):
        # Book metadata (name + description) lives in id="maininfo".
        info = soup.find(id="maininfo")
        name = info.find('h1').get_text()
        summary = info.get_text()
        toc = catalogue.get_text('\n')
        # Progress output, as in the original script.
        print(name)
        print(summary)
        print(toc)
        # Open the file once per novel instead of once per chapter.
        with open(name + '.txt', 'a', encoding='utf-8') as fo:
            fo.write('%s\n%s\n\n' % (summary, toc))
            # Chapter hrefs are relative; prefix the site root.
            for link in catalogue.find_all(href=re.compile(".html")):
                chapter_url = 'http://www.biquge.com.tw/' + link.get('href')
                title, content = scrape_chapter(chapter_url)
                print(title)
                print(content)
                # Write the title *text*, not the Tag's HTML representation.
                fo.write('%s\n%s\n\n' % (title, content))


def main():
    """Scrape every novel linked from the front page."""
    for novel_url in collect_novel_links():
        scrape_novel(novel_url)


if __name__ == '__main__':
    main()
l+=1
結果展示(有點多,就截了一點兒)
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAzNfRHLGZkRGZkRfJ3bs92YsYTMfVmepNHL0klaOFTUE5keRpHW3BjMMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zRQpkL4AjM3QTMxMTMxETMxgTMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)