天天看點

豆瓣top250電影抓取——存放到資料庫(2)

下面我們把資料存放到資料庫中。

如果我們想要在 pymysql 的傳回結果裡面使用 dict 類型,可以在 config 裡面寫上如下配置:

cursorclass = pymysql.cursors.DictCursor

import json
import urllib.request
import urllib.parse
import pymysql


class DoubanMovies(object):
    """Download, parse and store one page of a Douban movie list at a time.

    Typical use is ``download()`` -> ``parse()`` -> ``store()`` per page:
    ``download`` fetches the raw JSON text, ``parse`` keeps its
    ``"subjects"`` list on the instance, and ``store`` inserts one row per
    subject into the ``movies`` table.
    """

    # Class-level default kept for backward compatibility; every instance
    # created through __init__ gets its own list, and parse() replaces it.
    movies = {

    }
    # Keyword arguments passed verbatim to pymysql.connect().
    config = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "password": "*******",
        "db": "douban_movie",
        # Spelled "utf8", not "utf-8"; needed when the data contains Chinese.
        "charset": "utf8",
    }

    # Placeholders are filled in by the driver, which escapes every value.
    # Do NOT wrap them in quotes or build this statement with "%" formatting.
    _INSERT_SQL = (
        "insert into movies(title, original_title, genres, douban_link, "
        "image_large, image_medium, image_small) "
        "values(%s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self, base_url):
        """Remember the endpoint to query.

        Args:
            base_url: URL that ``download`` sends its requests to.
        """
        self.base_url = base_url
        # Per-instance storage so instances never share parsed results.
        self.movies = []

    # Download the JSON data.
    def download(self, start=0):
        """Fetch one page of results and return the response body as text.

        The ``start`` offset is sent url-encoded in the request body, so
        urllib issues a POST request.

        Args:
            start: Offset of the first entry to request.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the request fails.
        """
        post_data = urllib.parse.urlencode({"start": start}).encode("utf-8")
        print(post_data)
        request = urllib.request.Request(self.base_url, post_data)

        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")

    # Parse the JSON data.
    def parse(self, data):
        """Decode *data* and keep its ``"subjects"`` entry in ``self.movies``.

        Args:
            data: JSON text, as returned by ``download``.

        Raises:
            json.JSONDecodeError: If *data* is not valid JSON.
            KeyError: If the decoded object has no ``"subjects"`` key.
        """
        data = json.loads(data)
        self.movies = data["subjects"]

    @staticmethod
    def _movie_row(movie):
        """Return the parameter tuple for ``_INSERT_SQL`` built from *movie*.

        Values are returned untouched (no quoting or escaping); the genres
        list is flattened into one space-separated string.

        Raises:
            KeyError: If *movie* lacks one of the fields read here.
        """
        images = movie["images"]
        return (
            movie["title"],
            movie["original_title"],
            " ".join(movie["genres"]),
            movie["alt"],
            images["large"],
            images["medium"],
            images["small"],
        )

    # Store the parsed data.
    def store(self):
        """Insert every movie in ``self.movies`` into the ``movies`` table.

        All rows are written in a single transaction. Nothing is committed
        if any insert fails, and the connection is always closed. When there
        is nothing to store, no connection is opened.

        Raises:
            KeyError: If a movie lacks an expected field (raised before any
                database connection is opened).
        """
        # Build all rows first so malformed input fails before connecting.
        rows = [self._movie_row(movie) for movie in self.movies]
        if not rows:
            return

        connector = pymysql.connect(**self.config)
        try:
            with connector.cursor() as cursor:
                cursor.executemany(self._INSERT_SQL, rows)
            connector.commit()
        finally:
            # Closing without a commit discards a partially written batch.
            connector.close()

if __name__ == "__main__":
    # Request offsets 0, 20, 40, ..., 240 and store each page as it arrives.
    crawler = DoubanMovies("https://api.douban.com/v2/movie/top250")
    for offset in range(0, 250, 20):
        page = crawler.download(start=offset)
        crawler.parse(page)
        crawler.store()

    print("finished")



           

其中資料庫部分寫得比較粗糙,有些字段操作起來很費力,以後再慢慢完善。

暫時也不分表了,希望把更多的時間花在去電影天堂抓取 ftp 連結上。

-- Table that receives the crawler's rows.
-- The accompanying script's INSERT fills only title, original_title, genres,
-- douban_link and the three image_* columns; the remaining columns are not
-- written by it.
CREATE TABLE movies (
    id             INT AUTO_INCREMENT PRIMARY KEY,
    title          VARCHAR(50),
    original_title VARCHAR(100),
    directors      VARCHAR(30),
    casts          VARCHAR(60),
    rating         INT,
    genres         VARCHAR(100),
    douban_id      VARCHAR(20),
    image_large    VARCHAR(150),
    image_medium   VARCHAR(150),
    image_small    VARCHAR(150),
    movie_year     VARCHAR(10),
    stars          INT,
    subtype        VARCHAR(30),
    douban_link    VARCHAR(50)
);