
Scraping Douban movies with Python

Many of us keep wondering when a film we've been looking forward to will finally reach cinemas. With a bit of Python, we can check automatically.
# @author: Edgar
# @date: unknown
# version: 1.0.1
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import datetime
import time
"""
TODO: 這個程式通路網站過快,有時候網頁就通路不了
但是如果sleep太久了時間就長了
"""


class DouBan:
    def __init__(self):
        self.baseUrl = 'https://movie.douban.com/cinema/nowplaying/shanghai/'
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
                                     "like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    
    def get_html(self, url):
        """
        獲得頁面的源代碼
        """
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=5)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            return response.read().decode("utf-8")
    
    def get_cities(self, url):
        """
        獲得所有的城市的名稱,以此來拼接網頁
        但是部分城市獲得的不是英文名稱,而是數字,但是可以打開網頁
        """
        html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        try:
            city_list = soup.find("div", {"id": "cities-list"}).find("div", {"class": "cities-list-bd"})
        except AttributeError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            hot_list = city_list.find("div").findAll("span")
            hot_cities_list_en = []  # 用以拼接字元串
            # 獲得熱門城市
            for hot in hot_list:
                # 以下的這種方式儲存的是中文格式,無法用于網頁的拼接,但是可用于提示:
                hot_cities_list_en.append(hot.a["uid"])
            all_city_list = city_list.findAll("span")[:-1]
            all_cities_en = []  # 用以拼接字元串
            # 獲得所有的城市
            for city in all_city_list:
                all_cities_en.append(city.a["uid"])
            return hot_cities_list_en, all_cities_en
        
    def spider_nowplaying(self, url):
        """
        用來爬取正在上映的電影
        """
        # 首先爬取正在上映的影片
        try:
            html = self.get_html(self.baseUrl)
            nowplaying_soup = BeautifulSoup(html, 'lxml').find(id="nowplaying")
            html_for_location = self.get_html(url)
        except TypeError:
            self.get_cities(self.baseUrl)
            nowplaying_soup = BeautifulSoup(html, 'lxml').find(id="nowplaying")
        except Exception as e:
            print(e)
        finally:
            location = "上映地區:" + BeautifulSoup(html_for_location, "lxml").find(id="hd").h1.get_text()[5:] + "\n"
            li_list = nowplaying_soup.find("ul", {"class": "lists"}).children
            li_num = 0
            data = '--'*20 + "\n"
            for li in li_list:
                """
                這些隻是從拼接後的網頁上觀看到的
                還可以獲得其中的連接配接,在其網頁中進行爬取
                """
                try:
                    # print("影片名: ", li["data-title"])
                    title = "| 影片名: {}\n".format(li["data-title"])
                    # print("導演: ", li["data-director"])
                    director = "| 導演: {}\n".format(li["data-director"])
                    # print("主角: ", li["data-actors"])
                    actors = "| 主角: {}\n".format(li["data-actors"])
                    # print("豆瓣評分: ", li["data-score"])
                    score = "| 豆瓣評分: {}\n".format(li["data-score"])
                    # print("評分人數: ", li["data-votecount"])
                    vote = "| 評分人數: {}\n".format(li["data-votecount"])
                    # print("--"*20)
                    data = data + title + director + actors + score + vote + "--" * 20 + "\n"
                    li_num += 1
                    # print(data)
                except KeyError:
                    pass
                except Exception:
                    pass
            data = location + data + "一共查找到 {} 部正在上映的電影\n\n".format(li_num)
            print(data)
            return data

    def spider_upcoming(self):
        """
        爬取即将上映的電影的資訊
        """
        url = 'https://movie.douban.com/coming'
        request = urllib.request.Request(url, headers=self.header)
        try:
            response = urllib.request.urlopen(request, timeout=2)
        except urllib.error.HTTPError as e:
            print(e)
        except urllib.error.URLError as e:
            print(e)
        except Exception as e:
            print(e)
        else:
            soup = BeautifulSoup(response.read().decode("utf-8"), 'lxml')
            coming = soup.find(id='content').find("table", {"class": "coming_list"})
            # 在這裡采用逐一獲得每一個電影的方式
            tbody_list = coming.find("tbody").findAll("tr")
            data = ''
            year = datetime.date.today().year

            for tbody in tbody_list:
                data_list = tbody.findAll('td')
                date = data_list[0].get_text().strip()
                name = data_list[1].get_text().strip()
                film_type = data_list[2].get_text().strip()
                maker_location = data_list[3].get_text().strip()
                wants = data_list[4].get_text().strip()
                if '20' not in date:
                    date = str(year) + "年" + date
                    date = "| 電影上映時間: {}\n".format(date)
                else:
                    date = "| 電影上映時間: {}\n".format(date)
                name = "| 影片名:{}\n".format(name)
                film_type = "| 電影類型: {}\n".format(film_type)
                maker_location = "| 制片國家/地區: {}\n".format(maker_location)
                wants = "| 想看: {}\n".format(wants)
                data = data + date + name + film_type + maker_location + wants + "--"*20 + "\n"
            data = data + "一共搜尋到 {} 部電影即将上映".format(len(tbody_list))
            print(data)
            return data
            
    def get_all_web(self):
        """
        拼接以獲得所有的網頁位址
        """
        hot_cities_list_en, all_cities_en = self.get_cities(self.baseUrl)
        hot_city_list = []
        for hot_city in hot_cities_list_en:
            hot_city = "https://movie.douban.com/cinema/nowplaying/{}/".format(hot_city)
            hot_city_list.append(hot_city)
        all_city_list = []
        for city in all_cities_en:
            city_web = "https://movie.douban.com/cinema/nowplaying/{}/".format(city)
            all_city_list.append(city_web)
        return hot_city_list, all_city_list

    @staticmethod
    def write_file(data1='', data2=''):
        with open("douban-data.txt", "w", encoding='utf-8') as file:
            file.write(data1)
            file.write("\n")
            file.write(data2)


if __name__ == '__main__':
    test = DouBan()
    hot_city_list, all_city_list = test.get_all_web()
    data1 = ''
    success = 0
    for city in all_city_list:
        try:
            data1 = data1 + test.spider_nowplaying(city)
        except Exception:
            pass
        else:
            success += 1
        # pause between regions so the site does not start refusing requests
        # (this is what the TODO at the top is about)
        time.sleep(1)
    data2 = test.spider_upcoming()
    data2 = '--' * 20 + '\n' + data2
    test.write_file(data1, data2)
    print("{} regions in total, {} fetched successfully".format(len(all_city_list), success))