爬取網易雲評論超過十萬的歌曲

2019-11-15 09:43:00

import json
from urllib.parse import urlencode

import requests
from lxml import etree
from requests import RequestException

from selenium import webdriver
import time
import csv





# Read artist names and artist ids from the artists CSV file.
def read_csv():
    """Yield ``(artist_name, artist_id)`` pairs from the artists CSV file.

    Reads ``files/music_163_artists.csv`` (UTF-8). Each data row holds an
    artist name followed by an artist id. The header row (second column
    equal to the literal text ``artist_id``) and blank lines are skipped.

    Yields:
        tuple: ``(artist_name, artist_id)``, both as strings.

    Raises:
        ValueError: if a non-empty row does not have exactly two columns.
    """
    with open("files/music_163_artists.csv", "r", encoding="utf-8", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader returns an empty list for blank lines; there
                # is nothing to unpack, so skip them.
                continue
            artist_name, artist_id = row
            # Compare by value. An identity test (`is`) against a string
            # literal does not match text read from a file, which let the
            # header row slip through.
            if artist_id == "artist_id":
                continue
            yield artist_name, artist_id
    # The file is closed automatically once the with block is left.

def get_toal(music_id, song_name, driver, min_comments=100000):
    """Return a song's comment count when it exceeds ``min_comments``.

    Loads the song page in ``driver``, switches into the ``g_iframe``
    frame, and takes the text of the first ``<span class="j-flag">``
    element as the comment count.

    Args:
        music_id: song id as a string; appended to the song page URL.
        song_name: song title, used only in the progress messages.
        driver: Selenium WebDriver instance used to load the page.
        min_comments: the count must be strictly greater than this value
            for the song to be kept. Defaults to 100000.

    Returns:
        The comment count exactly as scraped (a string) when it is greater
        than ``min_comments``; otherwise ``None``. ``None`` is also
        returned when no count is found or it is not an integer.
    """
    url = "https://music.163.com/#/song?id=" + music_id
    driver.get(url)
    # Switch into the frame before reading the page source. The
    # `switch_to.frame` form matches the call used in main();
    # `switch_to_frame` no longer exists in Selenium 4.
    driver.switch_to.frame("g_iframe")
    # Sleep 3 seconds to give the page time to finish loading.
    time.sleep(3)
    html = etree.HTML(driver.page_source)
    comments = html.xpath("//span[@class='j-flag']/text()")

    # The xpath may match nothing; indexing the empty list would raise.
    raw_count = comments[0] if comments else None
    try:
        count = int(raw_count)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric text: treat as "no usable count".
        count = None

    if count is not None and count > min_comments:
        print("擷取 %s 的評論 %s 存儲" % (song_name, raw_count))
        return raw_count

    print("擷取 %s 的評論 %s 廢棄" % (song_name, raw_count))
    return None

# Append an artist's hot songs that pass the comment filter to the CSV file.
def write_to_csv(song_name, song_url, artist_name, driver):
    """Append qualifying songs of one artist to ``./songs/hotsongs.csv``.

    ``song_name`` and ``song_url`` are walked in lockstep. For each pair
    the song id is taken from the text after the last ``=`` in the URL and
    passed to ``get_toal``. When ``get_toal`` returns a comment count, a
    row ``[name, full url, comment count, artist_name]`` is appended.

    Args:
        song_name: iterable of song titles.
        song_url: iterable of site-relative song URLs; each is prefixed
            with ``https://music.163.com/#`` before being written.
        artist_name: artist name written in the last column of each row.
        driver: Selenium WebDriver instance forwarded to ``get_toal``.

    Returns:
        None. No header row is written; the file is opened in append mode.
    """
    # Output location. The with block guarantees the handle is closed even
    # if the loop is interrupted.
    with open('./songs/hotsongs.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)

        for name, url in zip(song_name, song_url):
            # Check for missing values before touching `url`; calling
            # `.split` on None would raise outside the try block below.
            if name is None or url is None:
                continue

            music_id = url.split('=')[-1]
            full_url = "https://music.163.com/#" + url
            try:
                song_comments = get_toal(music_id, name, driver)
                if song_comments is not None:
                    writer.writerow([name, full_url, song_comments, artist_name])
            except Exception as msg:
                # Best effort: report the failure for this song and carry
                # on with the remaining songs.
                print(msg)
    # The file is closed automatically once the with block is left.


def main(driver):
    """Collect the hot songs of every artist listed by ``read_csv``.

    For each ``(artist_name, artist_id)`` pair, loads the artist page in
    ``driver``, switches into the ``g_iframe`` frame, extracts the song
    titles and song links from the page source, and passes them to
    ``write_to_csv``.

    Args:
        driver: Selenium WebDriver instance. It is created once by the
            caller and reused for every page so the browser is not
            reopened for each artist.
    """
    for artist_name, artist_id in read_csv():
        # Skip the CSV header row.
        if artist_id == 'artist_id':
            continue

        artist_url = "https://music.163.com/#/artist?id=" + str(artist_id)
        print("正在擷取{}的熱門歌曲...".format(artist_name))
        driver.get(artist_url)
        # Switch into the frame before reading the page source.
        driver.switch_to.frame("g_iframe")
        # Sleep 2 seconds to give the page time to finish loading.
        time.sleep(2)

        tree = etree.HTML(driver.page_source)
        titles = tree.xpath("//span[@class='txt']/a/b/@title")
        links = tree.xpath("//span[@class='txt']/a/@href")

        # Write the songs that pass the filter to the CSV file.
        write_to_csv(titles, links, artist_name, driver)
        print("{}的熱門歌曲寫入到本地成功!".format(artist_name))


if __name__ == "__main__":
    # One browser session is created here and shared by the whole run.
    driver = webdriver.Chrome(executable_path="/www/spider-music163/songs/chromedriver.exe")
    try:
        main(driver)
    finally:
        # Always shut the browser down, including when main() raises, so
        # no browser/driver process is left running.
        driver.quit()

爬取網易雲評論超過十萬的歌曲

繼續閱讀

ZOJ 1104 Leaps Tall Buildings

Compile Workrave under Windows – My experience 在Windows上編譯Workrave

HDU 2821 Pusher

UVA 1401 Remember the Word

ZOJ 2748 Free Kick

CSU 1567 Reverse Rot

JAVA 系列——>開發工具IntelliJ IDEA的安裝以及配置、快捷鍵IDEA 簡介

門戶通專訪草根站長九天狼：做站貴在堅持

UVA 519 Puzzle (II)

磁盤結構及在Linux中的命名

tabpanel 使用問題

為什麼把CSS放頭部，script放下面

linux下的完美網銀們（google chrome, ubuntu10.04）

CSS之折疊菜單

web開發之前後端渲染

403 Forbidden，You don't have permission to access / on this server.Forbidden