天天看點

爬蟲豆瓣電影

模仿http://blog.csdn.net/u011489043/article/details/63255902豆瓣讀書來爬電影,主要是根據電影排名頁面與詳情頁面進行更改

源碼如下(python3):

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://movie.douban.com/top250
from urllib.request import urlopen
import os
import csv
import re
import io
import sys
import time
import random

# Module-level counter, starting at 1; getTitle() increments it once for
# every title it keeps.
topnum = 1


# Download the raw body of a URL
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The undecoded response ``bytes`` on success, or the empty ``str``
        ``""`` when opening or reading fails (the error is printed).
        Callers must check for ``""`` before calling ``.decode`` on the result.
    """
    try:
        # The context manager closes the response even if read() raises,
        # so a long crawl does not accumulate open connections.
        with urlopen(url) as page:
            html = page.read()
    except Exception as e:
        # Best-effort: report the failure and hand back the "" sentinel
        # that the callers in this file compare against.
        print("failed to geturl:", e)
        return ""
    else:
        return html


# Scrape the film titles from a ranking page
def getTitle(html):
    """Return the primary film titles found in a ranking-page *html* string.

    Every ``<span class="title">`` body is collected; entries that carry a
    non-breaking-space marker are treated as alias lines and skipped.

    Side effect: the module-level counter ``topnum`` is incremented once for
    every title that is kept.

    Args:
        html: Decoded page markup.

    Returns:
        List of kept title strings, in page order.
    """
    # re.S makes '.' match newlines too, so a span may wrap across lines
    nameList = re.findall(r'<span.*?class="title">(.*?)</span>', html, re.S)
    newNameList = []
    global topnum
    for item in nameList:
        # NOTE(review): assumes alias spans are rendered as
        # "&nbsp;/&nbsp;Alias" (entity or literal U+00A0) - confirm against
        # the live markup. Testing for a plain space instead would drop
        # primary titles that contain a space and keep aliases without one.
        if "&nbsp;" in item or "\xa0" in item:
            continue
        newNameList.append(item)
        topnum += 1
    return newNameList


# Collect the detail-page link of every film on a ranking page
def getDetail(html):
    """Return the detail-page URLs found in *html*, one per film, in page order.

    An anchor qualifies when its ``href`` starts with ``https`` and a
    ``class="">`` attribute followed by ``</a>`` appears after it. Only URLs
    containing ``subject`` are kept.

    Args:
        html: Decoded ranking-page markup.

    Returns:
        List of unique URLs, in order of first appearance.
    """
    # The href is cut at its closing quote; any attributes between it and
    # class="" (for example target/rel) are optional rather than required.
    detailList = re.findall(r'<a href="(https[^"]*)".*?class="">.*?</a>', html, re.S)
    # De-duplicate by value instead of by match-index parity, so the result
    # does not depend on how many unrelated anchors matched earlier.
    # dict.fromkeys keeps first-seen order.
    return list(dict.fromkeys(item for item in detailList if "subject" in item))


"""詳情頁"""


# Extract the production country/region from a detail page
def getRegion(html):
    """Return the text following the "production country/region" label.

    The label is accepted in both Traditional and Simplified Chinese, so the
    pattern works whichever script the page is served in.

    Args:
        html: Decoded detail-page markup.

    Returns:
        List of raw strings found between the label's ``</span>`` and the
        next ``<br/>`` (surrounding whitespace is not stripped).
    """
    regionList = re.findall(
        r'<span class="pl">制片(?:國家|国家)/(?:地區|地区).*?</span>(.*?)<br/>', html, re.S)
    return regionList


# Extract the film language from a detail page
def getLanguage(html):
    """Return the text following the "language" label.

    The label is accepted in both Traditional and Simplified Chinese, so the
    pattern works whichever script the page is served in.

    Args:
        html: Decoded detail-page markup.

    Returns:
        List of raw strings found between the label's ``</span>`` and the
        next ``<br/>`` (surrounding whitespace is not stripped).
    """
    languageList = re.findall(r'<span class="pl">(?:語言|语言).*?</span>(.*?)<br/>', html, re.S)
    return languageList


# Extract the release dates from a detail page
def getPublishDate(html):
    """Return the release-date entries found in *html*.

    Each ``<span property="v:initialReleaseDate" ...>`` yields one 2-tuple
    ``(span_text, trailing)``. The pattern ends in a lazy group, so
    ``trailing`` is always the empty string.
    """
    release_re = re.compile(
        r'<span property="v:initialReleaseDate" content=.*?>(.*?)</span>(.*?)', re.S)
    # groups() gives the same (text, '') tuples that findall would produce
    return [found.groups() for found in release_re.finditer(html)]


# Extract the running time from a detail page
def getMovieLength(html):
    """Return the text of every ``v:runtime`` span found in *html*.

    A span qualifies when ``property="v:runtime"`` is followed by a
    ``content="..."`` attribute; the span's inner text is returned.
    """
    runtime_re = re.compile(
        r'<span.*?property="v:runtime".*?content=".*?">(.*?)</span>', re.S)
    return runtime_re.findall(html)


"""詳情頁結束"""


# Scrape the image links from a ranking page
def getImg(html):
    """Return the https image URLs found in *html*.

    A URL is taken from a ``src="https..."`` attribute that follows the text
    ``img`` and an ``alt=`` attribute. URLs containing any of the substrings
    ``js``, ``css``, ``dale``, ``icon`` or ``png`` are discarded.
    """
    rejected = ("js", "css", "dale", "icon", "png")
    candidates = re.findall(r'img.*?alt=.*?src="(https.*?)"', html, re.S)
    return [link for link in candidates
            if not any(marker in link for marker in rejected)]


# Scrape the ratings from a ranking page
def getScore(html):
    """Return the inner text of every ``rating_num`` / ``v:average`` span in *html*."""
    score_re = re.compile(
        r'<span.*?class="rating_num".*?property="v:average">(.*?)</span>', re.S)
    return score_re.findall(html)


# Scrape the rating-count text from a ranking page
def getComment(html):
    """Return the bare ``<span>`` texts that contain the "ratings" marker.

    A span is kept when the marker occurs at index 1 or later, i.e. it is
    preceded by at least one character. The marker is accepted in both
    Traditional and Simplified Chinese.

    Args:
        html: Decoded ranking-page markup.

    Returns:
        List of matching span texts, in page order.
    """
    commentList = re.findall(r'<span>(.*?)</span>', html, re.S)
    markers = ("評價", "评价")
    return [item for item in commentList
            if any(item.find(marker) >= 1 for marker in markers)]


# Persist the collected rows as a CSV file
def saveInfo(infoList,
             path='/home/han/PycharmProjects/WebScrapingWithPython/python_web/movie_scraper.csv'):
    """Write a header row followed by *infoList* to a gb18030-encoded CSV file.

    Args:
        infoList: Iterable of rows; each row is a sequence of cell values in
            the same order as the header written below.
        path: Destination file. Defaults to the original hard-coded location;
            pass your own path instead of editing the function.

    Any existing file at *path* is overwritten. Prints a confirmation when done.
    """
    with open(path, 'w+', newline='', encoding='gb18030') as fp:
        a = csv.writer(fp, delimiter=',')  # cells of one record are separated by this delimiter
        a.writerow(['影  名', '評  分', '評價人數', '圖檔連結', '制片國家/地區', '語言', ' 上映日期 ', '片長'])
        a.writerows(infoList)
        print('儲存完畢')


# Program start
# Per-column accumulators: index i of every list refers to the same film.
namesUrl = []
imagesUrl = []
scoresUrl = []
commentsUrl = []
detailsUrl = []
introductionsUrl = []
publishDatesUrl = []
regions = []
languages = []
movieLengths = []
allInfo = []
# Paginate the ranking, 25 films per page (this range requests start=0 and start=25).
for page in range(0, 50, 25):
    url = "https://movie.douban.com/top250?start={}&filter=&type=".format(page)
    raw_page = getHtml(url)
    if raw_page == '':
        # getHtml returns the str "" on failure, which has no .decode().
        # Add one placeholder per expected film so the columns stay aligned;
        # the 'none' links then fail in the detail loop and get its placeholder.
        html = ''
        placeholders = ['none'] * 25
        namesUrl.extend(placeholders)
        imagesUrl.extend(placeholders)
        scoresUrl.extend(placeholders)
        commentsUrl.extend(placeholders)
        introductionsUrl.extend(placeholders)
    else:
        html = raw_page.decode("UTF-8")
        namesUrl.extend(getTitle(html))
        imagesUrl.extend(getImg(html))
        scoresUrl.extend(getScore(html))
        commentsUrl.extend(getComment(html))
        introductionsUrl.extend(getDetail(html))

print("len namesUrl:", len(namesUrl))
print("len imagesUrl:", len(imagesUrl))
print("len scoresUrl:", len(scoresUrl))
print("len commentsUrl:", len(commentsUrl))
print("len intro:", len(introductionsUrl))
for item in introductionsUrl:
    print(item)
    # Download each detail page exactly once and reuse the body below.
    raw_detail = getHtml(item)
    if raw_detail == '':  # the link could not be fetched
        regions.append("該連結不存在")
        languages.append("該連結不存在")
        publishDatesUrl.append("該連結不存在")
        movieLengths.append("該連結不存在")
    else:
        html_detail = raw_detail.decode("UTF-8")
        regions.append(getRegion(html_detail))
        languages.append(getLanguage(html_detail))
        publishDatesUrl.append(getPublishDate(html_detail))
        movieLengths.append(getMovieLength(html_detail))
        # random 1-2 second pause between successful detail requests
        time.sleep(random.randint(1, 2))

# Column order matches the header row written by saveInfo().
columns = (namesUrl, scoresUrl, commentsUrl, imagesUrl,
           regions, languages, publishDatesUrl, movieLengths)
if len({len(column) for column in columns}) > 1:
    # Indexing every list by len(namesUrl) would raise IndexError here and
    # lose the whole crawl; warn and keep the rows that are complete.
    print("warning: column lengths differ; rows are truncated to the shortest column")
allInfo.extend(list(row) for row in zip(*columns))

print(len(namesUrl))
print(len(scoresUrl))
print(len(commentsUrl))
print(len(imagesUrl))
print(len(regions))
print(len(languages))
print(len(publishDatesUrl))
print(len(movieLengths))

saveInfo(allInfo)
print("Exiting Main \n 普通爬取結束時時間")
print(time.ctime(time.time()))
           

具體流程是從排名頁面抓取電影名稱、圖檔連結、評分、評論數和詳情頁連結(點選圖檔或影片名均可至詳情頁)。然後遍歷詳情頁連結,進入每一個詳情頁再抓取影片制作國家、語言、上映日期、影片長度。

注意:saveInfo函數儲存路徑自行修改。編寫代碼運作時,要保證影片名、圖檔、評分、評論數及詳情頁連結數組個數相等;如果個數不相等,将每個影片資訊列印下來,尋找沒有顯示的影片,看看是什麼情況導致的:是正規表達式寫得不對沒有完全比對,或是其它情況,再進行修改即可。