天天看點

requests爬取豆瓣前250部高分電影

這兩天又寫了一個爬蟲,爬取豆瓣評分最高的前250部電影,並把電影名稱和海報圖片儲存到本地。

用的是requests和BeautifulSoup。

完整程式碼如下:
import requests
from bs4 import BeautifulSoup

def get_(url):
    """Collect movie titles from a paginated list and save each poster image.

    Walks ten consecutive pages starting at ``url``. For every
    ``<div class="item">`` on a page, the text of its first
    ``<span class="title">`` is recorded and the ``src`` of its first
    ``<img>`` is downloaded to ``img/<title>.jpg``.

    Args:
        url: Base URL of the list; later pages are requested as
            ``url + '?start=<offset>&filter='``.

    Returns:
        A list of the movie titles in the order they were encountered.
    """
    import os

    # NOTE(review): the numeric literals were missing from the original
    # source. The page count comes from the progress message ("共10頁"),
    # the page size from 250 movies / 10 pages, and 200 is HTTP OK. The
    # timeout value is an assumption -- confirm against the intended setting.
    total_pages = 10
    movies_per_page = 25
    request_timeout = 10
    http_ok = 200

    # Create the output folder up front so the first write cannot fail
    # just because the directory does not exist yet.
    os.makedirs('img', exist_ok=True)

    name_list = []
    page = url
    for turn in range(1, total_pages + 1):
        r = requests.get(page, timeout=request_timeout)
        soup = BeautifulSoup(r.text, 'lxml')

        for item in soup.find_all('div', {'class': 'item'}):
            # Movie title.
            movie_text = item.find('span', {'class': 'title'}).text
            name_list.append(movie_text)
            # Poster image URL.
            movie_img = item.find('img')['src']
            ir = requests.get(movie_img, timeout=request_timeout)
            # Only write the file when the download succeeded.
            if ir.status_code == http_ok:
                with open('img/' + movie_text + '.jpg', 'wb') as f:
                    f.write(ir.content)

        # Build the URL of the next page from the running item offset.
        yema = movies_per_page * turn
        page = url + '?start=' + str(yema) + '&filter='
        print('完成第{}頁的儲存,共10頁'.format(turn))

    return name_list


def main():
    """Scrape the Douban Top 250 list and write the titles to moviename.txt."""
    titles = get_('http://movie.douban.com/top250')
    # One title per line, UTF-8 encoded.
    with open('moviename.txt', 'w', encoding='utf-8') as out:
        out.writelines(title + '\n' for title in titles)


# Run the scraper only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()