需要注意的一點是:JSONPath 的索引是從 0 開始數的,而 XPath 的索引是從 1 開始數的。
一般寫法(函數式)
import requests
from requests.exceptions import RequestException
import re
import json
import jsonpath
import csv
import time
# Request headers sent with every download; only a desktop-browser
# User-Agent is set.
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Output CSV, opened once when the script starts; rows are appended to it
# through the module-level `writer`.
fp = open(
    'D:/網易雲音樂Top200.csv',
    mode='wt',
    newline='',
    encoding='utf8',
)
writer = csv.writer(fp)

# Header row: song title, artist, cover image link, previous rank.
writer.writerow(['歌名', '歌手', '圖檔連結', '上次排名'])
def get_html(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    try:
        content = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('debug2')
        return None
    if content.status_code == requests.codes.OK:
        return content.text
    print('debug1')
    return None
def get_json_data(html):
    """Extract the chart entries embedded in ``html`` and write them as CSV rows.

    The page carries its song list as a JSON array inside a hidden
    ``<textarea id="song-list-pre-data">`` element. Every element of that
    array becomes one row (title, first artist name, cover URL, previous
    rank) written through the module-level ``writer``.

    Args:
        html: Page source as returned by ``get_html``; may be ``None`` when
            the download failed.

    Returns:
        None. Nothing is written when ``html`` is empty or the textarea
        cannot be found.
    """
    # get_html returns None on failure; the regex below would raise
    # TypeError on it.
    if not html:
        print('no html to parse')
        return

    match = re.search(
        '<textarea id="song-list-pre-data" style="display:none;">(.*?)</textarea>',
        html,
        re.S,
    )
    if match is None:
        print('song-list-pre-data textarea not found')
        return

    # The captured text is real JSON; after json.loads it is a Python list
    # of dicts (one dict per song), so it can be indexed like
    # result[0]["name"], result[0]["artists"][0]["name"],
    # result[0]["album"]["picUrl"] and result[0]["lastRank"].
    result = json.loads(match.group(1))

    for section in result:
        title = jsonpath.jsonpath(section, expr='$.name')[0]
        artist = jsonpath.jsonpath(section, expr='$.artists..name')[0]
        picture_link = jsonpath.jsonpath(section, expr='$.album.picUrl')[0]
        # A falsy result means this entry has no lastRank value; a
        # placeholder text is written instead.
        last_rank = jsonpath.jsonpath(section, expr='$.lastRank')
        last_rank = last_rank[0] if last_rank else '等于目前排名'
        writer.writerow((title, artist, picture_link, last_rank))
if __name__ == '__main__':
    # Note: the address of the original page is
    # 'https://music.163.com/#/discover/toplist?id=3778678'; the '/#' has to
    # be removed for the request to return the data. The page URL and the
    # URL used when parsing the page differ at exactly this place, which is
    # why this form was tried.
    url = 'https://music.163.com/discover/toplist?id=3778678'
    html = get_html(url)
    # get_html returns None when the download fails; skip parsing then.
    if html is not None:
        get_json_data(html)
嘗試了一下面向對象寫法,封裝一隻小蜘蛛
import requests
from requests.exceptions import RequestException
import re
import json
import jsonpath
import csv
import time
class CloudMusicSpider:
    """Small spider that saves the music.163.com toplist to a CSV file."""

    def __init__(self, timeout=10):
        """Set up the request headers, the toplist URL and the timeout.

        Args:
            timeout: Seconds to wait for the server on each request.
        """
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        self.url = 'https://music.163.com/discover/toplist?id=3778678'
        self.timeout = timeout

    def parse_url(self, url):
        """Download ``url`` and return its text, or ``None`` on failure.

        ``None`` is returned for a non-200 status and for any
        ``RequestException`` (connection error, timeout, ...).
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
        except RequestException:
            return None
        if response.status_code == requests.codes.OK:
            return response.text
        return None

    def get_json_data(self, html):
        """Parse the song list embedded in ``html`` and write it to the CSV.

        The songs are stored as a JSON array inside the hidden
        ``<textarea id="song-list-pre-data">`` element. One CSV row (title,
        first artist name, cover URL, previous rank) is written per song.

        Args:
            html: Page source from ``parse_url``; may be ``None``.

        Returns:
            None. The output file is not touched when ``html`` is empty or
            the textarea is missing.
        """
        # parse_url returns None on failure; bail out before any file I/O.
        if not html:
            return

        match = re.search(
            '<textarea id="song-list-pre-data" style="display:none;">(.*?)</textarea>',
            html,
            re.S,
        )
        if match is None:
            return
        result = json.loads(match.group(1))

        # Parse first, open second: a failed download no longer truncates an
        # existing file, and `with` guarantees the handle is flushed/closed.
        # NOTE(review): the file is written as BOM-less UTF-8; the
        # 'utf-8-sig' codec would add a BOM if spreadsheet tools show
        # garbled text - confirm before changing the output format.
        with open('D:/網易雲音樂Top200.csv', 'wt', newline='', encoding='utf8') as fp:
            writer = csv.writer(fp)
            writer.writerow(('歌名', '歌手', '圖檔連結', '上次排名'))
            for section in result:
                title = jsonpath.jsonpath(section, expr='$.name')[0]
                artist = jsonpath.jsonpath(section, expr='$.artists..name')[0]
                picture_link = jsonpath.jsonpath(section, expr='$.album.picUrl')[0]
                # A falsy result means the entry has no lastRank value.
                last_rank = jsonpath.jsonpath(section, expr='$.lastRank')
                last_rank = last_rank[0] if last_rank else '等于目前排名'
                writer.writerow((title, artist, picture_link, last_rank))

    def runspider(self):
        """Fetch the toplist page and export its songs to the CSV file."""
        html = self.parse_url(self.url)
        self.get_json_data(html)
if __name__ == '__main__':
    # Build the spider and run a single crawl of the toplist page.
    cloud_music_spider = CloudMusicSpider()
    cloud_music_spider.runspider()
直接打開剛剛爬取好的 CSV 檔案時,裡面的資料會出現亂碼:
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAzNfRHLGZkRGZkRfJ3bs92YsYTMfVmepNHL6dmeNh3a61keRpHW4Z0MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zROBlLzITOzMDN0QTMwMTMwkTMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
解決方法是:用記事本打開這個剛剛爬好的檔案,並另存為另外一個 csv 檔案,
然後再打開新的檔案就可以看到正確的結果了。(另一種做法是在程式中把 open() 的 encoding 改成 'utf-8-sig',讓輸出的檔案帶有 BOM,這樣就不需要再手動另存。)