相信很多人都有書荒的時候,想要找到一本合適的書籍確實不容易,所以這次利用剛學習到的知識爬取豆瓣網的各類書籍,傳送門:https://book.douban.com/tag/?view=cloud。
首先是這個程式的結構:html_downloader是HTML下載器,html_outputer負責導出到Excel表,html_parser負責解析頁面,make_wordcloud負責製作詞雲,spided_main是程式入口,url_manager是URL管理器。有興趣的童鞋可以去慕課網看Python基礎爬蟲課程。
![](https://img.laitimes.com/img/_0nNw4CM6IyYiwiM6ICdiwiIn5GcuMzNxIzN4cjNy0CN4kDO1ADMwITOxQDM4EDMy0CM1AzMyATMvwFNwgTMwIzLcBTNwMjMwEzLcd2bsJ2Lc12bj5ycn9Gbi52YugTMwIzcldWYtl2Lc9CX6MHc0RHaiojIsJye.png)
主要實作思路是:先請求下載需要的HTML,解析得到目標URL并存儲到URL管理器中;再從URL管理器中擷取URL,發送請求,解析得到需要的資訊內容,導出到Excel表格;最後再從Excel表中擷取資料進行分析,得到詞雲。
html_downloader:
在這裡我使用的是urllib.request進行請求。之前試過用requests進行請求,但是爬取了幾百頁就被封了IP,所以棄用requests。
# -*- coding:utf8 -*-
import urllib.request
from urllib.parse import quote
import string
class HtmlDownloader(object):
    """Fetch raw page bytes over HTTP using ``urllib.request``."""

    def download(self, url):
        """Download ``url`` and return the response body.

        Args:
            url: Address to fetch. It may contain non-ASCII characters (for
                example Chinese tag names); those are percent-encoded before
                the request is sent.

        Returns:
            The raw response body as ``bytes``, or ``None`` when ``url`` is
            ``None`` or the response status code is not 200.

        Raises:
            urllib.error.URLError: Propagated from ``urlopen`` when the
                request itself fails.
        """
        if url is None:
            return None
        # Keep every printable ASCII character as-is and percent-encode the
        # rest, so URLs containing Chinese text are sent correctly.
        encoded_url = quote(url, safe=string.printable)
        # The context manager closes the response even if reading fails, so
        # long crawls do not leak connections.
        with urllib.request.urlopen(encoded_url) as response:
            if response.getcode() != 200:
                return None
            return response.read()
通過分析豆瓣網的結構,可以看到,我們首先傳進去的是總的圖書分類,但是我們需要的是每一個分類裡面的圖書資訊。所以我們需要得到每一個分類的url,即base_url,再通過這個base_url去擷取圖書url,即detail_url。
url_manager:
# -*- coding:utf8 -*-
class UrlManage(object):
    """Track pending and already-issued URLs for the two crawl stages.

    "Base" URLs are the category URLs; "detail" URLs are the content pages
    derived from them. Each kind has a pending set and a set of URLs that
    have already been handed out.
    """

    def __init__(self):
        self.base_urls = set()  # category URLs still to be processed
        self.detail_urls = set()  # detail-page URLs still to be processed
        self.old_base_urls = set()  # category URLs already handed out
        self.old_detail_urls = set()  # detail-page URLs already handed out

    @staticmethod
    def _enqueue(url, pending, issued):
        """Add ``url`` to ``pending`` unless it is ``None`` or already issued."""
        if url is None or url in issued:
            return
        pending.add(url)

    # --- add a single URL ---
    def add_base_url(self, url):
        """Queue one category URL; ``None`` and known URLs are ignored."""
        self._enqueue(url, self.base_urls, self.old_base_urls)

    def add_detail_url(self, url):
        """Queue one detail-page URL; ``None`` and known URLs are ignored."""
        self._enqueue(url, self.detail_urls, self.old_detail_urls)

    # --- add several URLs ---
    def add_new_detail_urls(self, urls):
        """Queue every URL in ``urls``.

        ``urls`` may be ``None`` (ignored) or any iterable, including
        generators, which have no ``len()``.
        """
        if urls is None:
            return
        for url in urls:
            self.add_detail_url(url)

    def add_new_base_urls(self, urls):
        """Queue every URL in ``urls``.

        ``urls`` may be ``None`` (ignored) or any iterable, including
        generators, which have no ``len()``.
        """
        if urls is None:
            return
        for url in urls:
            self.add_base_url(url)

    # --- check whether URLs remain ---
    def has_new_detail_url(self):
        """Return ``True`` while at least one detail-page URL is pending."""
        return bool(self.detail_urls)

    def has_new_base_url(self):
        """Return ``True`` while at least one category URL is pending."""
        return bool(self.base_urls)

    # --- take one URL ---
    def get_base_url(self):
        """Remove and return an arbitrary pending category URL.

        The URL is recorded as issued so it is never queued again.

        Raises:
            KeyError: If no category URL is pending; check
                ``has_new_base_url()`` first.
        """
        new_base_url = self.base_urls.pop()
        self.old_base_urls.add(new_base_url)
        return new_base_url

    def get_detail_url(self):
        """Remove and return an arbitrary pending detail-page URL.

        The URL is recorded as issued so it is never queued again.

        Raises:
            KeyError: If no detail-page URL is pending; check
                ``has_new_detail_url()`` first.
        """
        new_detail_url = self.detail_urls.pop()
        self.old_detail_urls.add(new_detail_url)
        return new_detail_url
解析器 html_parser:
# -*- coding:utf8 -*-
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
class HtmlParser(object):
    """Stateless helpers that turn Douban book pages into URLs and data.

    Every method is a ``staticmethod``, so it can be called on the class
    (``HtmlParser.parser(cont)``) or on an instance without a ``TypeError``.
    """

    @staticmethod
    def soup(cont):
        """Parse raw page content ``cont`` into a ``BeautifulSoup`` tree."""
        return BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')

    @staticmethod
    def get_new_data(soup):
        """Extract book data from one parsed listing page.

        Args:
            soup: Parsed page, as returned by ``soup()``.

        Returns:
            A dict mapping each item's title to the first run of digits found
            in its ``.pl`` text, kept as a string (presumably the number of
            ratings -- confirm against the live page). The dict is empty when
            the page has no ``.subject-list`` element or the list is empty.
            Items whose ``.pl`` text contains no digits are skipped, so one
            such item no longer discards the whole page.
        """
        books = {}
        containers = soup.select('.subject-list')
        if not containers or not containers[0].contents:
            return books
        for item in containers[0].select('.subject-item'):
            title = item.select('.info')[0].select('a')[0].attrs['title']
            count_text = item.select('.clearfix')[0].select('.pl')[0].text
            match = re.search(r'\d+', count_text)
            if match is None:
                continue
            books[title] = match.group()
        return books

    @staticmethod
    def get_detail_url(base_url):
        """Build the paginated URLs for one category.

        Returns:
            A set of 26 URLs: ``base_url`` itself plus
            ``base_url?start=K&type=T`` for K = 20, 40, ..., 500.
        """
        detail_urls = {base_url}
        detail_urls.update(
            base_url + '?start={}&type=T'.format(offset)
            for offset in range(20, 501, 20)
        )
        return detail_urls

    @staticmethod
    def get_all_base_urls(soup):
        """Collect the absolute category URLs linked from the tag index page.

        Only links inside the first ``.tagCol`` element are read.

        Raises:
            IndexError: If the page contains no ``.tagCol`` element.
        """
        links = soup.select('.tagCol')[0].select('a')
        return {
            'https://book.douban.com{}'.format(link.attrs['href'])
            for link in links
        }

    @staticmethod
    def parser(cont):
        """Parse raw index-page content and return its category URLs."""
        return HtmlParser.get_all_base_urls(HtmlParser.soup(cont))
spided_main:
# -*- coding:utf8 -*-
from douban_spider2 import url_manager, html_downloader, html_parser, html_outputer
class SpiderMain(object):
    """Coordinate URL management, downloading, parsing and Excel export."""

    def __init__(self):
        self.urls = url_manager.UrlManage()
        self.downloader = html_downloader.HtmlDownloader()
        # HtmlParser is used as a namespace of functions, so the class itself
        # (not an instance) is stored here.
        self.htmlparser = html_parser.HtmlParser
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url, max_pages=1000):
        """Crawl from ``root_url`` and export the collected data.

        NOTE(review): the published source lost its indentation; the two
        loops are reconstructed here as sequential stages -- confirm against
        the original project.

        Args:
            root_url: Index page that links to every category.
            max_pages: Stop after this many detail pages have been processed
                successfully. The default of 1000 keeps the crawl small
                enough to avoid the IP ban the author ran into earlier.
        """
        collected = {}
        root_html = self.downloader.download(root_url)
        self.urls.add_new_base_urls(self.htmlparser.parser(root_html))

        # Stage 1: expand every category URL into its detail-page URLs.
        while self.urls.has_new_base_url():
            try:
                base_url = self.urls.get_base_url()
                self.urls.add_new_detail_urls(
                    self.htmlparser.get_detail_url(base_url)
                )
            except Exception:
                # Best effort: a failing category must not abort the crawl,
                # but Ctrl-C (KeyboardInterrupt) is no longer swallowed.
                print('craw failed')

        # Stage 2: download and parse detail pages until the cap is reached.
        count = 1
        while self.urls.has_new_detail_url():
            try:
                detail_url = self.urls.get_detail_url()
                print('crow %d : %s' % (count, detail_url))
                html_cont = self.downloader.download(detail_url)
                soup = self.htmlparser.soup(html_cont)
                collected.update(self.htmlparser.get_new_data(soup))
            except Exception:
                # A failed page is reported and does not count towards the cap.
                print('craw failed')
                continue
            if count >= max_pages:
                break
            count += 1

        self.outputer.output_excel(collected)
# Program entry point
if __name__ == "__main__":
    # Index page of all book categories; the crawl starts from here.
    start_url = 'https://book.douban.com/tag/?view=cloud'
    SpiderMain().craw(start_url)
html_outputer:
# -*- coding:utf8 -*-
import xlwt #寫入Excel表的庫
class HtmlOutputer(object):
    """Write crawled key/value pairs to an Excel ``.xls`` workbook."""

    def __init__(self):
        self.datas = []

    def output_excel(self, dict, filename='wordCount.xls',
                     sheet_name='wordCount'):
        """Save ``dict`` as a two-column sheet, one entry per row.

        Args:
            dict: Mapping to export; each key goes to column 0 and its value
                to column 1. The parameter name shadows the builtin but is
                kept so existing keyword callers keep working.
            filename: Path of the workbook to write. The default matches the
                file that the word-cloud script reads.
            sheet_name: Name of the worksheet. The default matches the sheet
                that the word-cloud script reads.
        """
        wbk = xlwt.Workbook(encoding='utf-8')
        sheet = wbk.add_sheet(sheet_name)  # worksheet name, not a cell name
        for row, (key, value) in enumerate(dict.items()):
            sheet.write(row, 0, label=key)
            sheet.write(row, 1, label=value)
        wbk.save(filename)
導出的Excel表格格式為,一共導出15261條記錄
make_wordcloud:
# -*- coding:utf8 -*-
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import xlrd
from PIL import Image,ImageSequence
import numpy as np
# Load the rows exported by the crawler: column 0 is the word, column 1 its
# count. Names avoid shadowing the builtins ``list`` and ``file``.
workbook = xlrd.open_workbook('wordCount.xls')
sheet = workbook.sheet_by_name('wordCount')
frequencies = {}
for row_index in range(sheet.nrows):
    row = sheet.row_values(row_index)
    frequencies[row[0]] = int(row[1])
print(frequencies)

# The image is used as the mask passed to WordCloud; it is copied into an
# array inside the ``with`` block so the file handle is closed afterwards.
with Image.open('./08.png') as image:
    graph = np.array(image)

wc = WordCloud(
    font_path='./fonts/simhei.ttf',
    background_color='white',
    max_words=20000,
    max_font_size=50,
    min_font_size=1,
    mask=graph,
    random_state=100,
)
wc.generate_from_frequencies(frequencies)

# Display the generated image without axes.
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
爬過的坑:
當定義的類有構造函數時候,調用時一定要加上括號,如 f = html_downloader.HtmlDownloader().download(),而不是 f= html_downloader.HtmlDownloader.download(),不然就會一直報錯。反過來,如果方法定義時沒有self參數,卻通過執行個體來調用,則會出現類似於 TypeError: get_all_base_urls() takes 1 positional argument but 2 were given 的錯誤,因為執行個體會被自動當作第一個參數傳入。
生成詞雲的背景圖檔我選用的是
最後的做出由15261本書形成的詞雲
本次爬蟲隻是針對圖書類熱門評論而做出的詞雲,可以看到涵蓋所有分類的書籍裡最熱門評論的有解憂雜貨店,白夜行等,據此我們可以選取比較熱門的圖書進行閱讀,也可以根據此結果再做進一步的分析,擷取熱門書籍中的評論進行分析人們對于某本書的評價關鍵詞,進而進一步的了解這本圖書所描述的内容。
轉載于:https://www.cnblogs.com/veol/p/8886240.html