Crawling Douban Movie Data for Analysis and Visualization

Learn web scraping by crawling Douban movie data and analyzing it. The overall workflow:

1. Crawl the Douban movie data

2. Load the movie data

3. Count the number of comments for each movie

4. Read all comments for a given movie

5. Extract a movie's keywords and generate a word cloud

6. Cross-analyze the keywords against the ratings and generate a heat map

Let's get started! Without further ado, straight to the code.

Crawling the Douban movie data

import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
import pandas as pd
# Request headers that mimic a regular browser so Douban serves the pages
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

def detail_handle(url):
    # Parse one movie's detail page; use a fresh dict per movie so results don't alias
    movie_info = OrderedDict()
    html = requests.get(url,headers = headers)
    soup = BeautifulSoup(html.text,'lxml')
    movie_info['movie_rank'] = soup.find_all('span',class_="top250-no")[0].string
    movie_info['movie_name'] = soup.find_all('span',property="v:itemreviewed")[0].string
    soup_div = soup.find(id="info")
    # Director/writer/starring are located by their position inside the #info block
    movie_info['movie_director'] = handle_mul_tags(soup_div.find_all('span')[0].find_all('a'))
    movie_info['movie_writer'] = handle_mul_tags(soup_div.find_all('span')[3].find_all('a'))
    movie_info['movie_starring'] = handle_mul_tags(soup_div.find_all('span')[6].find_all('a'))
    movie_info['movie_type'] = handle_mul_tags(soup_div.find_all('span',property="v:genre"))
    movie_info['movie_country'] = soup_div.find(text = '制片國家/地區:').next_element.strip()
    movie_info['movie_language'] = soup_div.find(text = '語言:').next_element.strip()
    movie_info['movie_release_date'] = handle_mul_tags(soup_div.find_all('span',property="v:initialReleaseDate"))
    movie_info['movie_run_time'] = handle_mul_tags(soup_div.find_all('span',property="v:runtime"))
    movie_second_name = ''
    try:
        movie_info['movie_second_name'] = soup_div.find(text = '又名:').next_element.strip()
    except AttributeError:
        # Not every movie has an alternate title
        print('{} has no alternate title'.format(movie_info['movie_name']))
        movie_info['movie_second_name'] = movie_second_name
        
    movie_info['movie_rating'] = soup.find_all('strong',property="v:average")[0].string
    movie_info['movie_comment_users'] = soup.find_all('span',property="v:votes")[0].string
    soup_div_for_ratings = soup.find('div',class_="ratings-on-weight")
    # The five star-ratio rows sit at even indices among the child divs
    movie_info['movie_five_star_ratio'] = soup_div_for_ratings.find_all('div')[0].find(class_="rating_per").string
    movie_info['movie_four_star_ratio'] = soup_div_for_ratings.find_all('div')[2].find(class_="rating_per").string
    movie_info['movie_three_star_ratio'] = soup_div_for_ratings.find_all('div')[4].find(class_="rating_per").string
    movie_info['movie_two_star_ratio'] = soup_div_for_ratings.find_all('div')[6].find(class_="rating_per").string
    movie_info['movie_one_star_ratio'] = soup_div_for_ratings.find_all('div')[8].find(class_="rating_per").string
    return movie_info
    
def handle_mul_tags(soup_span):
    # Merge the text of multiple tags into one string, separated by '/'
    return '/'.join(second_span.string for second_span in soup_span)

def crawl():
    # The Top 250 list is paginated, 25 movies per page
    htmls = ['https://movie.douban.com/top250?start={}&filter='.format(page) for page in range(0,250,25)]
    movies = []
    for html in htmls:
        html_url = requests.get(html,headers = headers)
        soup = BeautifulSoup(html_url.text,'lxml')
        movie_htmls = soup.select('.pic')
        for movie_html in movie_htmls:
            url = movie_html.select('a')[0]['href']
            # Collect every movie's details, not just the first one
            movies.append(detail_handle(url))
    return movies
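
The crawler returns a list of dictionaries, one per movie. Here is a minimal sketch for running it and persisting the results, assuming pandas is available; the file name matches what the analysis section below reads, but note that detail_handle() produces English keys while the analysis code looks up Chinese column names ('電影名', '評分', '連結'), so the columns would need renaming in practice:

# Hypothetical driver: crawl the Top 250 and save the results for analysis
if __name__ == '__main__':
    movie_df = pd.DataFrame(crawl())
    movie_df.to_excel('douban_movie_data.xlsx', index=False)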
           

Analyzing and visualizing the movie data

import sqlite3
import pandas as pd
import jieba
import math
import pyecharts.options as opts
from pyecharts.charts import WordCloud
import os
# Working directory that holds douban_comment_data.db and douban_movie_data.xlsx
os.chdir('C:\\Users\\Theo.chen\\Desktop\\資料分析項目\\')

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Comment data collected earlier (SQLite) and movie metadata (Excel)
conn = sqlite3.connect('douban_comment_data.db')
comment_data = pd.read_sql_query('select * from comment;', conn)
movie_data = pd.read_excel('douban_movie_data.xlsx')
# Stop words to exclude from the keyword counts
FILTER_WORDS = ['知道','影評','影片','小編','沒有','一個','\n','good','is','thing','這個','就是','什麼','真的','of',
                '我們','最後','一部','the','片子','這麼','那麼','不是','還是','時候','覺得','電影','但是','hope','Hope','best','因為',
                '隻是','故事','看過','豆瓣','maybe','這部']

def get_movie_idList(min_comment_count):
    movie_list = comment_data['MOVIEID'].value_counts()
    # Keep only movies with more than min_comment_count comments
    movie_list = movie_list[movie_list.values > min_comment_count]
    return movie_list.index
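
Step 3 of the workflow (counting each movie's comments) is exactly what value_counts() computes above; a quick way to inspect those counts before filtering:

# Peek at per-movie comment counts (step 3 of the workflow)
comment_counts = comment_data['MOVIEID'].value_counts()
print(comment_counts.head(10))  # the ten most-commented movie ids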

def get_comment_keywords(movie_id,count):
    # Gather all of this movie's comments into one string
    comment_list = comment_data[comment_data['MOVIEID'] == movie_id]['CONTENT']
    comment_str_all = '\n'.join(comment_list)
    # Segment with jieba, drop single characters and stop words, keep the top `count`
    seg_list = list(jieba.cut(comment_str_all))
    keywords_counts = pd.Series(seg_list)
    keywords_counts = keywords_counts[keywords_counts.str.len() > 1]
    keywords_counts = keywords_counts[~keywords_counts.str.contains('|'.join(FILTER_WORDS))]
    keywords_counts = keywords_counts.value_counts()[:count]
    return keywords_counts

def get_movie_name_and_score(movie_id):
    # The Excel sheet stores full subject URLs, so rebuild the URL to look up the movie
    movie_link = 'https://movie.douban.com/subject/{}/'.format(movie_id)
    search_result = movie_data[movie_data['連結'] == movie_link].iloc[0]
    movie_name = search_result['電影名']
    movie_score = search_result['評分']
    return (movie_name,movie_score)

def generate_wordcloud(word_list,path_name):
    # word_list is a Series: index = keyword, value = frequency
    wordcloud = WordCloud()
    wordcloud.add(
        "",
        tuple(zip(word_list.index,word_list)),word_size_range = [20,100])
    wordcloud.render(path_name)
    print(f"Generated word cloud file: {path_name}")

# One bucket per integer score (0-9): keyword lists and their counts
kw_list_by_score = [[] for i in range(10)]
kw_counts_by_score = [[] for i in range(10)]

movie_id_list = get_movie_idList(300)
for movie_id in movie_id_list:
    word_list = get_comment_keywords(movie_id,30)
    movie_name, movie_score = get_movie_name_and_score(movie_id)
    try:
        # Bucket each movie's keywords by its integer score
        kw_list_by_score[math.floor(movie_score)].extend(word_list.index)
        kw_counts_by_score[math.floor(movie_score)].extend(word_list.values)
    except (ValueError, IndexError):
        # Missing score (NaN) or an out-of-range value
        print('Could not bucket {} (score: {})'.format(movie_name, movie_score))

# For each score bucket, merge duplicate keywords, keep the top 30, and save to CSV
for i in range(10):
    if kw_list_by_score[i]:
        kw30_with_counts = pd.DataFrame({
            'kw':kw_list_by_score[i],
            'count':kw_counts_by_score[i]
            })
        kw30_with_counts = kw30_with_counts.groupby('kw').sum()
        kw30_with_counts = kw30_with_counts.sort_values(by = 'count', ascending = False)[:30]
        counts_sum = kw30_with_counts['count'].sum()
        kw30_with_counts['percentage'] = kw30_with_counts['count'] / counts_sum
        kw30_with_counts.to_csv('{}_movie_keywords.csv'.format(i))


from pyecharts.charts import HeatMap

# Reload the per-score keyword CSVs (only buckets 4-9 exist)
kw_counts_by_score = [[] for _ in range(10)]
for i in range(4, 10):
    kw_counts_by_score[i] = pd.read_csv('{}_movie_keywords.csv'.format(i))

# Use the top 10 keywords of the 9-point bucket as the rows of the heat map;
# build this once, after all CSVs are loaded
kw_percentage_df = pd.DataFrame([],
    columns=list(range(4, 10)),
    index=kw_counts_by_score[9]['kw'][:10])

# Fill in each score bucket's percentage for those keywords
for i in range(4, 10):
    kw = kw_counts_by_score[i]
    kw = kw[kw['kw'].isin(kw_percentage_df.index)]
    kw_percentage_df[i] = pd.Series(list(kw['percentage']), index=kw['kw'])

kw_percentage_df.fillna(0,inplace=True)

# Convert the DataFrame into [x, y, value] triples for pyecharts
data = []
for i, index in enumerate(kw_percentage_df.index):
    for j, column in enumerate(kw_percentage_df.columns):
        data.append([j, i, kw_percentage_df[column][index] * 100])

heatmap = HeatMap()
heatmap.add_xaxis(list(kw_percentage_df.columns))
heatmap.add_yaxis("電影評論關鍵詞熱力圖", list(kw_percentage_df.index), data)
heatmap.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(
        min_=0,
        max_=10,
        orient='horizontal'
    ),
)
heatmap.render(path="heatmap.html")