模仿 http://blog.csdn.net/u011489043/article/details/63255902 的豆瓣讀書爬蟲來爬取豆瓣電影,主要是根據電影排名頁面與詳情頁面進行更改。
源碼如下(python3):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://movie.douban.com/top250
from urllib.request import urlopen
import os
import csv
import re
import io
import sys
import time
import random
# Module-level counter, starting at 1; getTitle() declares it global and
# increments it while collecting titles.
topnum = 1
# 将url轉化成html
def getHtml(url, timeout=30):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        timeout: Seconds to wait on blocking network operations before the
            request is abandoned, so a stalled server cannot hang the script.

    Returns:
        The body as ``bytes`` on success. On any failure the error is printed
        and the empty string ``""`` is returned instead of raising; callers
        detect failure by checking for that empty value.
    """
    try:
        # The context manager closes the response even if read() raises.
        with urlopen(url, timeout=timeout) as page:
            return page.read()
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch and every failure is
        # reported to the caller the same way.
        print("failed to geturl:", e)
        return ""
# 爬電影清單
def getTitle(html):
    """Collect the primary movie titles from a ranking page.

    Every ``<span class="title">`` body is extracted; spans recognised as
    alias lines are skipped.

    Args:
        html: Decoded HTML text of a ranking page.

    Returns:
        List of kept title strings, in document order.

    Side effects:
        Increments the module-level ``topnum`` once per kept title.
    """
    global topnum
    # re.S lets '.' also match newlines, so a span may stretch over several lines.
    titles = re.findall(r'<span.*?class="title">(.*?)</span>', html, re.S)
    kept = []
    for title in titles:
        # The regex sees raw HTML, so entities are not decoded. Alias spans are
        # presumed to carry a non-breaking space, either as the "&nbsp;" entity
        # or as U+00A0 (assumption about the page markup -- confirm against a
        # live page). A plain ASCII space is not treated as an alias marker,
        # because ordinary titles may legitimately contain one.
        if "&nbsp;" in title or "\xa0" in title:
            continue
        kept.append(title)
        topnum += 1
    return kept
# 通過點選圖檔連結進入每部電影的詳情頁
def getDetail(html):
    """Extract detail-page URLs from a ranking page.

    Args:
        html: Decoded HTML text of a ranking page.

    Returns:
        List of ``https`` URLs containing ``"subject"``, taken from the
        even-positioned anchor matches only.
    """
    # The ` target="_blank" rel="external nofollow" ` attribute run is optional
    # here. It was mandatory before, which meant a plain
    # `<a href="..." class="">` anchor could never match. Anchors that do carry
    # the run still match exactly as they did.
    links = re.findall(
        r'<a href="(https.*?)"(?: target="_blank" rel="external nofollow" )?.*?class="">.*?</a>',
        html,
        re.S,
    )
    # Keep every other match: each movie is presumably linked twice (image and
    # title), so odd positions would be duplicates -- verify against the page.
    return [
        link
        for position, link in enumerate(links)
        if position % 2 == 0 and "subject" in link
    ]
"""詳情頁"""
# 擷取電影制片國家
def getRegion(html):
    """Extract the production country/region text from a detail page.

    Args:
        html: Decoded HTML text of a movie detail page.

    Returns:
        List with the text found between the label span and the next
        ``<br/>``, one entry per match (surrounding whitespace is kept).
    """
    # The label is accepted in both Traditional and Simplified characters; the
    # Traditional-only pattern cannot match a page written in Simplified script.
    return re.findall(
        r'<span class="pl">制片(?:國家/地區|国家/地区).*?</span>(.*?)<br/>',
        html,
        re.S,
    )
# 擷取電影語言
def getLanguage(html):
    """Extract the language text from a detail page.

    Args:
        html: Decoded HTML text of a movie detail page.

    Returns:
        List with the text found between the label span and the next
        ``<br/>``, one entry per match (surrounding whitespace is kept).
    """
    # The label is accepted in both Traditional and Simplified characters; the
    # Traditional-only pattern cannot match a page written in Simplified script.
    return re.findall(
        r'<span class="pl">(?:語言|语言).*?</span>(.*?)<br/>',
        html,
        re.S,
    )
# 擷取電影上映日期
def getPublishDate(html):
    """Extract release-date entries from a detail page.

    Args:
        html: Decoded HTML text of a movie detail page.

    Returns:
        List of 2-tuples, one per ``v:initialReleaseDate`` span. The first
        item is the span text; the second comes from the trailing lazy group
        at the end of the pattern and is therefore always ``''``.
    """
    release_pattern = re.compile(
        r'<span property="v:initialReleaseDate" content=.*?>(.*?)</span>(.*?)',
        re.S,
    )
    return release_pattern.findall(html)
# 擷取電影片長
def getMovieLength(html):
    """Extract the running-time text from a detail page.

    Args:
        html: Decoded HTML text of a movie detail page.

    Returns:
        List with the body text of every span that carries
        ``property="v:runtime"`` followed by a ``content`` attribute.
    """
    runtime_pattern = re.compile(
        r'<span.*?property="v:runtime".*?content=".*?">(.*?)</span>',
        re.S,
    )
    return runtime_pattern.findall(html)
"""詳情頁結束"""
# 爬圖檔連結
def getImg(html):
    """Collect poster image URLs from a ranking page.

    Args:
        html: Decoded HTML text of a ranking page.

    Returns:
        List of ``https`` ``src`` values that follow an ``alt=`` attribute,
        minus any URL containing one of the excluded substrings.
    """
    # URLs containing any of these substrings are discarded.
    excluded = ("js", "css", "dale", "icon", "png")
    candidates = re.findall(r'img.*?alt=.*?src="(https.*?)"', html, re.S)
    return [
        src
        for src in candidates
        if not any(fragment in src for fragment in excluded)
    ]
# 爬評分
def getScore(html):
    """Extract the rating values from a ranking page.

    Args:
        html: Decoded HTML text of a ranking page.

    Returns:
        List with the body text of every span that has
        ``class="rating_num"`` followed by ``property="v:average"``.
    """
    score_pattern = re.compile(
        r'<span.*?class="rating_num".*?property="v:average">(.*?)</span>',
        re.S,
    )
    return score_pattern.findall(html)
# 爬評價總數
def getComment(html):
    """Collect the rating-count texts from a ranking page.

    Args:
        html: Decoded HTML text of a ranking page.

    Returns:
        List of the bodies of attribute-less ``<span>`` elements that contain
        the "ratings" marker somewhere after their first character.
    """
    # The marker is accepted in both Traditional and Simplified characters; the
    # Traditional-only check cannot match a page written in Simplified script.
    markers = ("評價", "评价")
    spans = re.findall(r'<span>(.*?)</span>', html, re.S)
    # find(...) >= 1 keeps the original rule: the marker must be present and
    # must not sit at the very start of the span text.
    return [
        text
        for text in spans
        if any(text.find(marker) >= 1 for marker in markers)
    ]
# 将擷取的資訊進行儲存
# Default output location; pass ``path`` to saveInfo() to write elsewhere.
_DEFAULT_CSV_PATH = '/home/han/PycharmProjects/WebScrapingWithPython/python_web/movie_scraper.csv'


def saveInfo(infoList, path=_DEFAULT_CSV_PATH):
    """Write the scraped rows to a CSV file and print a completion message.

    Args:
        infoList: Iterable of rows; each row is a sequence of cell values
            written in the same order as the header columns.
        path: Destination file. Defaults to the location that was previously
            hard-coded, so existing callers are unaffected.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    header = ['影 名', '評 分', '評價人數', '圖檔連結', '制片國家/地區', '語言', ' 上映日期 ', '片長']
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires for files passed to csv.writer.
    with open(path, 'w', newline='', encoding='gb18030') as fp:
        writer = csv.writer(fp, delimiter=',')  # fields are comma-separated
        writer.writerow(header)
        writer.writerows(infoList)
    print('儲存完畢')
# 程式開始
# 初始化
# Column accumulators: one entry per movie, appended in ranking order.
namesUrl = []
imagesUrl = []
scoresUrl = []
commentsUrl = []
detailsUrl = []  # never filled below; kept so the module still defines the name
introductionsUrl = []  # detail-page URLs returned by getDetail()
publishDatesUrl = []
regions = []
languages = []
movieLengths = []
allInfo = []

# Walk the ranking pages; the start offset advances by 25 per page.
for page in range(0, 50, 25):
    url = "https://movie.douban.com/top250?start={}&filter=&type=".format(page)
    page_bytes = getHtml(url)
    if not page_bytes:
        # getHtml() returns "" on failure, and a str has no .decode(), so the
        # result is checked BEFORE decoding. One placeholder goes into every
        # list-page column so the columns stay aligned (append, not extend:
        # extend('none') would add four single characters).
        for column in (namesUrl, imagesUrl, scoresUrl, commentsUrl, introductionsUrl):
            column.append('none')
        continue
    html = page_bytes.decode("UTF-8")
    namesUrl.extend(getTitle(html))
    imagesUrl.extend(getImg(html))
    scoresUrl.extend(getScore(html))
    commentsUrl.extend(getComment(html))
    introductionsUrl.extend(getDetail(html))

print("len namesUrl:", len(namesUrl))
print("len imagesUrl:", len(imagesUrl))
print("len scoresUrl:", len(scoresUrl))
print("len commentsUrl:", len(commentsUrl))
print("len intro:", len(introductionsUrl))

# Visit every detail page once and pull the per-movie fields from it.
for item in introductionsUrl:
    print(item)
    # Fetch a single time and reuse the bytes; a second request would double
    # the traffic and could fail even though the first one succeeded.
    detail_bytes = getHtml(item)
    if not detail_bytes:  # page could not be retrieved
        regions.append("該連結不存在")
        languages.append("該連結不存在")
        publishDatesUrl.append("該連結不存在")
        movieLengths.append("該連結不存在")
    else:
        html_detail = detail_bytes.decode("UTF-8")
        regions.append(getRegion(html_detail))
        languages.append(getLanguage(html_detail))
        publishDatesUrl.append(getPublishDate(html_detail))
        movieLengths.append(getMovieLength(html_detail))
    # Random 1-2 second pause between detail-page requests.
    time.sleep(random.randint(1, 2))

# Assemble one row per movie, in the column order of the CSV header.
columns = (
    namesUrl,
    scoresUrl,
    commentsUrl,
    imagesUrl,
    regions,
    languages,
    publishDatesUrl,
    movieLengths,
)
if len({len(column) for column in columns}) > 1:
    # Indexing every column by len(namesUrl) would raise IndexError here and
    # lose everything scraped so far; zip() keeps the rows that are complete.
    print("warning: column lengths differ; rows are truncated to the shortest column")
allInfo.extend(list(row) for row in zip(*columns))

print(len(namesUrl))
print(len(scoresUrl))
print(len(commentsUrl))
print(len(imagesUrl))
print(len(regions))
print(len(languages))
print(len(publishDatesUrl))
print(len(movieLengths))
saveInfo(allInfo)
print("Exiting Main \n 普通爬取結束時時間")
print(time.ctime(time.time()))
具體流程是從排名頁面抓取電影名稱,圖檔連結,評分,評論數和詳情頁連結(點選圖檔或影片名均可至詳情頁).然後遍歷詳情頁連結,進入每一個連結詳情頁再抓取影片制作國家,語言,上映日期,影片長度
注意:saveInfo函數儲存路徑自行修改.編寫代碼運作時,要保證影片名,圖檔,評分,評論數及詳情頁連結數組個數相等,如果個數不相等,将每個影片資訊列印下來,尋找沒有顯示的影片,看看是什麼情況導緻的.是正規表達式寫的不對沒有完全比對或是其它情況,再進行修改即可.