前言
最近在學習go,學習一門語言最好的方式就是實踐,之前學習python也是從爬蟲入手,現在使用go語言寫一個網易雲音樂的爬蟲,下面會簡單介紹開發的過程,代碼是初學者的水準,歡迎吐槽。
本項目github位址
https://github.com/zhujiajunup/yunyinyue
開發工具
- go1.11.2 windows/amd64
- Google Chrome 71.0.3578.98
- Fiddler v5.0.20182.28034
擷取資料
不管用什麼語言寫爬蟲,步驟總是一致的,只是實作時使用不同的語言而已。第一步當然是確認你想要什麼:本次的目標是網易雲音樂,我想擷取使用者首頁的聽歌排行榜。

最先需要弄明白的是這些資料是怎麼擷取的,即雲音樂是如何向伺服器請求資料的,打開chrome的調試工具(F12),點到Network,搜尋“鐘無豔”
可以看到資料是
https://music.163.com/weapi/v1/play/record?csrf_token=
的POST請求來擷取的,再看看該請求發送了什麼資料
可以看到送出了一個表單,參數為
params
和
encSecKey
。發送的資料應該是加了密的,下一步就需要知道雲音樂是如何進行加密傳輸的。
從調試視窗可以看到,該請求是由
https://s3.music.126.net/web/s/core_86994123ce247287ad52aafce6acdf9b.js?86994123ce247287ad52aafce6acdf9b
發出的,加密邏輯應該就是在該js中處理的。將該js儲存到本地並格式化後,搜尋
encSecKey
在哪裡指派的
// Request wrapper quoted from the site's obfuscated core.js.
// When the condition below holds, it collects the request parameters into one
// object, encrypts them with window.asrsea and sends them as a POST form with
// the fields `params` and `encSecKey`; in every case it ends by calling cwC9t.
// Y9P: request URL (searched for "?" and tested against /api).
// e8e: request options; headers, noEnc, query, data and method are read or written here.
v9m.bl9c = function(Y9P, e8e) {
// i8a accumulates the plaintext parameters that will be encrypted.
var i8a = {},
// NEJ.X({}, e8e) presumably copies the options into a fresh object — TODO confirm NEJ.X semantics.
e8e = NEJ.X({},
e8e),
// Index of the query-string separator in the URL, or -1 when there is none.
mp3x = Y9P.indexOf("?");
// Encrypt only when window.GEnc is truthy, the URL matches /api, the header
// comparison does not opt out, and the caller did not set noEnc.
if (window.GEnc && /(^|\.com)\/api/.test(Y9P) && !(e8e.headers && e8e.headers[eq1x.Bx8p] == eq1x.Iy0x) && !e8e.noEnc) {
if (mp3x != -1) {
// Move the URL's query string into i8a (k8c.hc2x presumably parses it into an object) and strip it from the URL.
i8a = k8c.hc2x(Y9P.substring(mp3x + 1));
Y9P = Y9P.substring(0, mp3x)
}
if (e8e.query) {
// Merge e8e.query into i8a; it goes through k8c.hc2x first when k8c.fQ1x(...) is truthy (presumably a string check).
i8a = NEJ.X(i8a, k8c.fQ1x(e8e.query) ? k8c.hc2x(e8e.query) : e8e.query)
}
if (e8e.data) {
// Same merge for the request body data.
i8a = NEJ.X(i8a, k8c.fQ1x(e8e.data) ? k8c.hc2x(e8e.data) : e8e.data)
}
// csrf_token is taken from v9m.gO2x("__csrf") (presumably a cookie lookup — TODO confirm).
i8a["csrf_token"] = v9m.gO2x("__csrf");
// Redirect the call to the encrypted endpoint: the first "api" in the URL becomes "weapi".
Y9P = Y9P.replace("api", "weapi");
e8e.method = "post";
delete e8e.query;
// Four arguments: the JSON-serialised parameters plus three values derived through brA4E.
var bUK2x = window.asrsea(JSON.stringify(i8a), brA4E(["流淚", "強"]), brA4E(WU5Z.md), brA4E(["愛心", "女孩", "驚恐", "大笑"]));
// Replace the request body with the two encrypted fields.
e8e.data = k8c.cz9q({
params: bUK2x.encText,
encSecKey: bUK2x.encSecKey
})
}
// Hand the (possibly rewritten) URL and options to cwC9t.
cwC9t(Y9P, e8e)
};
可以看到是通過
window.asrsea
函數來擷取的,接下來看
window.asrsea
是如何定義的
// Immediately-invoked function quoted from core.js; it defines the encryption
// helpers and publishes two of them as window.asrsea and window.ecnonasr.
function() {
// a(n): builds a string of n characters, each picked with Math.random from the 62-character alphabet below.
function a(a) {
var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
c = "";
// Each iteration picks one random index into b and appends that character to c.
for (d = 0; a > d; d += 1) e = Math.random() * b.length,
e = Math.floor(e),
c += b.charAt(e);
return c
}
// b(text, key): AES-encrypts text with key in CBC mode using the fixed IV
// "0102030405060708" and returns the CryptoJS result's toString().
function b(a, b) {
var c = CryptoJS.enc.Utf8.parse(b),
d = CryptoJS.enc.Utf8.parse("0102030405060708"),
e = CryptoJS.enc.Utf8.parse(a),
f = CryptoJS.AES.encrypt(e, c, {
iv: d,
mode: CryptoJS.mode.CBC
});
return f.toString()
}
// c(text, b, c): builds an RSAKeyPair from (b, "", c) and returns encryptedString(pair, text).
// Presumably b is the public exponent and c the modulus — TODO confirm against the RSA library.
function c(a, b, c) {
var d, e;
return setMaxDigits(131),
d = new RSAKeyPair(b, "", c),
e = encryptedString(d, a)
}
// d(d, e, f, g): the function exposed as window.asrsea.
// encText is d AES-encrypted with g and then again with a fresh 16-character random key i;
// encSecKey is that random key i passed through c with e and f.
function d(d, e, f, g) {
var h = {},
i = a(16);
return h.encText = b(d, g),
h.encText = b(h.encText, i),
h.encSecKey = c(i, e, f),
h
}
// e(a, b, d, e): the function exposed as window.ecnonasr; passes a + e through c with b and d.
function e(a, b, d, e) {
var f = {};
return f.encText = c(a + e, b, d),
f
}
window.asrsea = d,
window.ecnonasr = e
} ();
加密算法就是這段代碼,函數接收四個參數,進行了aes和rsa加密,具體邏輯這裡不進行詳解。知道了處理邏輯,現在得擷取這四個參數分別是什麼,接下來使用Fiddler來將js替換為本地js,將參數列印出來即可。
Fiddler調試擷取參數
- 配置代理
- 配置Https
-
修改core.js
將
https://s3.music.126.net/web/s/core_86994123ce247287ad52aafce6acdf9b.js?86994123ce247287ad52aafce6acdf9b
儲存到本地,比如命名為core.js,用編輯器打開,在指定位置添加如下列印資訊
// Local copy of the site's request wrapper with console.info calls added so the
// arguments passed to window.asrsea can be read from the browser console.
// Y9P: request URL; e8e: request options object.
v9m.bl9c = function(Y9P, e8e) {
// i8a accumulates the plaintext parameters that will be encrypted.
var i8a = {},
e8e = NEJ.X({},
e8e),
// Index of the query-string separator in the URL, or -1 when there is none.
mp3x = Y9P.indexOf("?");
// Encrypt only when window.GEnc is truthy, the URL matches /api, the header
// comparison does not opt out, and the caller did not set noEnc.
if (window.GEnc && /(^|\.com)\/api/.test(Y9P) && !(e8e.headers && e8e.headers[eq1x.Bx8p] == eq1x.Iy0x) && !e8e.noEnc) {
if (mp3x != -1) {
// Move the URL's query string into i8a and strip it from the URL.
i8a = k8c.hc2x(Y9P.substring(mp3x + 1));
Y9P = Y9P.substring(0, mp3x)
}
if (e8e.query) {
i8a = NEJ.X(i8a, k8c.fQ1x(e8e.query) ? k8c.hc2x(e8e.query) : e8e.query)
}
if (e8e.data) {
i8a = NEJ.X(i8a, k8c.fQ1x(e8e.data) ? k8c.hc2x(e8e.data) : e8e.data)
}
i8a["csrf_token"] = v9m.gO2x("__csrf");
// The first "api" in the URL becomes "weapi", and the request is forced to POST.
Y9P = Y9P.replace("api", "weapi");
e8e.method = "post";
delete e8e.query;
var bUK2x = window.asrsea(JSON.stringify(i8a), brA4E(["流淚", "強"]), brA4E(WU5Z.md), brA4E(["愛心", "女孩", "驚恐", "大笑"]));
// --- Debug output added for this article ---
// The rewritten request URL.
window.console.info(Y9P);
// First asrsea argument: the plaintext request parameters as JSON.
window.console.info(JSON.stringify(i8a));
// Second, third and fourth asrsea arguments, in call order.
window.console.info(JSON.stringify( brA4E(["流淚", "強"])));
window.console.info(JSON.stringify(brA4E(WU5Z.md)));
window.console.info(JSON.stringify(brA4E(["愛心", "女孩", "驚恐", "大笑"])));
// Replace the request body with the two encrypted fields.
e8e.data = k8c.cz9q({
params: bUK2x.encText,
encSecKey: bUK2x.encSecKey
})
}
cwC9t(Y9P, e8e)
};
- 配置Fiddler Rule
go語言實作網易雲音樂爬蟲前言擷取資料Reference
一切準備就緒後,再打開浏覽器,輸入
https://music.163.com/#/user/songs/rank?id=62947535
,打開調試視窗的Console,就可以看到本地js中添加的輸出日誌了
不難發現除了第一個參數外,其他三個參數都是固定的,所以在後面的處理中,只需要處理第一個參數即可。
第一個參數就是加密前的請求參數
{
"uid": "62947535",
"type": "-1",
"limit": "1000",
"offset": "0",
"total": "true",
"csrf_token": ""
}
那麼已經弄清楚了資料擷取的邏輯,接下來就是按照這個邏輯用go語言實作一遍了,最關鍵的就是加密算法了
go實作
項目結構
加密算法
來自
https://studygolang.com/topics/5815
/*
Package encrypt provides encrypt algorithm such as rsa & aes
*/
package encrypt
import (
"bytes"
"crypto/aes"
"crypto/cipher"
"encoding/base64"
"fmt"
"math/big"
"math/rand"
"time"
)
// generate string for given size
func RandomStr(size int) (result []byte) {
s := "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
strBytes := []byte(s)
r := rand.New(rand.NewSource(time.Now().UnixNano()))
for i := 0; i < size; i++ {
result = append(result, strBytes[r.Intn(len(strBytes))])
}
return
}
// AesEncrypt encrypts sSrc with AES in CBC mode and returns the ciphertext
// encoded as standard base64.
//
// sKey is the raw AES key; aes.NewCipher rejects any length other than 16, 24
// or 32 bytes. aseKey is the initialisation vector and must be exactly one AES
// block long. The plaintext is padded up to a whole number of blocks, each
// padding byte holding the padding length (a full extra block is added when
// sSrc is already block-aligned).
//
// An error is returned for an invalid key or a wrong-sized IV. Without the IV
// check cipher.NewCBCEncrypter would panic instead.
func AesEncrypt(sSrc string, sKey string, aseKey string) (string, error) {
	block, err := aes.NewCipher([]byte(sKey))
	if err != nil {
		return "", err
	}
	blockSize := block.BlockSize()

	iv := []byte(aseKey)
	if len(iv) != blockSize {
		return "", fmt.Errorf("encrypt: IV length %d does not match AES block size %d", len(iv), blockSize)
	}

	plain := []byte(sSrc)
	padding := blockSize - len(plain)%blockSize
	plain = append(plain, bytes.Repeat([]byte{byte(padding)}, padding)...)

	cipherText := make([]byte, len(plain))
	cipher.NewCBCEncrypter(block, iv).CryptBlocks(cipherText, plain)
	return base64.StdEncoding.EncodeToString(cipherText), nil
}
// RsaEncrypt performs unpadded RSA on key: the bytes of key are reversed, read
// as one big-endian integer, raised to pubKey modulo modulus, and the result is
// returned as lowercase hex, left-padded with zeros to the width of modulus
// (ignoring the modulus' own leading zeros).
//
// pubKey and modulus are hexadecimal strings. The empty string is returned
// when key is empty, when either hex string cannot be parsed, when pubKey is
// negative, or when modulus is not positive; previously those inputs caused a
// nil-pointer panic.
//
// Every byte of key contributes exactly two hex digits to the message integer,
// so bytes below 0x10 no longer lose their leading zero.
func RsaEncrypt(key string, pubKey string, modulus string) string {
	if len(key) == 0 {
		return ""
	}

	// The message integer is built from the key's bytes in reverse order.
	reversed := make([]byte, len(key))
	for i := 0; i < len(key); i++ {
		reversed[len(key)-1-i] = key[i]
	}
	message := new(big.Int).SetBytes(reversed)

	exponent, ok := new(big.Int).SetString(pubKey, 16)
	if !ok || exponent.Sign() < 0 {
		return ""
	}
	mod, ok := new(big.Int).SetString(modulus, 16)
	if !ok || mod.Sign() <= 0 {
		return ""
	}

	encrypted := new(big.Int).Exp(message, exponent, mod)
	return addPadding(fmt.Sprintf("%x", encrypted), modulus)
}

// addPadding left-pads encText with '0' characters until it is as long as
// modulus with its leading '0' characters removed. encText is returned
// unchanged when it is already at least that long.
func addPadding(encText string, modulus string) string {
	width := len(modulus)
	for i := 0; i < len(modulus) && modulus[i] == '0'; i++ {
		width--
	}
	for len(encText) < width {
		encText = "0" + encText
	}
	return encText
}
- Music163Spider
// Music163Spider bundles the HTTP client and the default request headers used
// for every request the crawler sends.
type Music163Spider struct {
// client performs the HTTP requests.
client *http.Client
// headers are added to each outgoing request, keyed by header name.
headers map[string]string
}
// NewMusic164Spider returns a spider with a fresh http.Client and a
// browser-like set of default headers. The "164" in the name is a historical
// typo kept so existing callers continue to compile.
//
// The Accept value now starts with "text/html"; it used to read "ext/html",
// which is not a valid media type, and was assigned twice.
func NewMusic164Spider() (spider Music163Spider) {
	headers := map[string]string{
		"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
		// Deliberately empty, as in the original code.
		"Accept-Encoding": "",
		"Content-Type":    "application/x-www-form-urlencoded",
		"User-Agent":      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
		"Host":            constants.Music163Host,
		"Cache-Control":   "no-cache",
		"Connection":      "keep-alive",
		"Pragma":          "no-cache",
		"Origin":          fmt.Sprintf("%s%s", constants.HttpsPrefix, constants.Music163Host),
	}
	return Music163Spider{
		client:  &http.Client{},
		headers: headers,
	}
}
加密函數,實作window.asrsea的加密功能
// dataEncrypt reproduces the site's window.asrsea: dataBytes is AES-encrypted
// with constants.SrcretKey, the result is AES-encrypted again with a fresh
// 16-character random key, and that random key is RSA-encrypted with
// constants.PubKey and constants.Modulus.
//
// The returned map holds the two form fields "params" and "encSecKey". If
// either AES step fails the error is printed and an empty map is returned,
// rather than carrying on and encrypting a blank value. The former third error
// check was removed: RsaEncrypt returns no error, so it only re-tested a value
// already known to be nil.
func (spider Music163Spider) dataEncrypt(dataBytes []byte) (content map[string]string) {
	content = make(map[string]string)
	randomKey := string(encrypt.RandomStr(16))

	firstPass, err := encrypt.AesEncrypt(string(dataBytes), constants.SrcretKey, constants.AseKey)
	if err != nil {
		fmt.Println(err)
		return content
	}
	params, err := encrypt.AesEncrypt(firstPass, randomKey, constants.AseKey)
	if err != nil {
		fmt.Println(err)
		return content
	}

	content["params"] = params
	content["encSecKey"] = encrypt.RsaEncrypt(randomKey, constants.PubKey, constants.Modulus)
	return content
}
定義了發送post請求的方法
// httpPost JSON-encodes params, encrypts the JSON with dataEncrypt and POSTs
// the two resulting fields ("params", "encSecKey") to url as a URL-encoded
// form, adding every entry of headers to the request.
//
// It returns the raw response body. An error is returned when params cannot be
// marshalled, the request cannot be built, the request fails, or the body
// cannot be read. Each error is now checked before the value it guards is
// used: previously a failed NewRequest or client.Do led to a nil-pointer
// dereference on req.Header or resp.Body.
func (spider Music163Spider) httpPost(url string, headers map[string]string, params interface{}) (result []byte, err error) {
	jsonParams, err := json.Marshal(params)
	if err != nil {
		return nil, err
	}

	encrypted := spider.dataEncrypt(jsonParams)
	form := make(url2.Values)
	form.Set("params", encrypted["params"])
	form.Set("encSecKey", encrypted["encSecKey"])

	req, err := http.NewRequest("POST", url, strings.NewReader(form.Encode()))
	if err != nil {
		return nil, err
	}
	for key, value := range headers {
		req.Header.Add(key, value)
	}

	resp, err := spider.client.Do(req)
	if err != nil {
		return nil, err
	}
	// resp is non-nil here, so closing its body is safe.
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	return data, nil
}
發送參數
type BaseRequestBody struct {
Offset string `json:"offset"`
Total string `json:"totail"`
Limit string `json:"limit"`
CsrfToken string `json:"csrf_token"`
}
// PlayRecordRequestBody is the payload for the play-record request. It embeds
// BaseRequestBody, so the base fields are serialised at the same JSON level as
// "type" and "uid".
type PlayRecordRequestBody struct {
BaseRequestBody
// Type is sent as "type"; GetPlayRecord and the captured request use "-1".
Type string `json:"type"`
// Uid is sent as "uid" and carries the user id being queried.
Uid string `json:"uid"`
}
根據擷取排行榜請求傳回資料來建立相應的對象
// SongDetail wraps the song object of one play record; only the "song" key is
// decoded and every other key is ignored.
type SongDetail struct {
// Song is decoded from the "song" key.
Song common.Song `json:"song"`
// ignore other
}
// PlayRecord is one entry of a listening ranking.
type PlayRecord struct {
// PlayCount is decoded from "playCount".
PlayCount int `json:"playCount"`
// Score is decoded from "score".
Score int `json:"score"`
// SongDetail is decoded from "song".
// NOTE(review): SongDetail itself reads a nested "song" key, so this expects
// song.song in the payload — confirm against a real response.
SongDetail SongDetail `json:"song"`
}
// PlayRecordResp is the decoded response of the play-record request.
type PlayRecordResp struct {
// Code is decoded from "code" (presumably the API status code — TODO confirm).
Code int `json:"code"`
// AllData holds the records found under "allData".
AllData []PlayRecord `json:"allData"`
// WeekData holds the records found under "weekData".
WeekData []PlayRecord `json:"weekData"`
}
定義好對象後,就可以使用模拟發送請求了, spider的GetPlayRecord方法
// GetPlayRecord requests the listening ranking of userId and decodes the
// response into a PlayRecordResp.
//
// The request asks for type "-1", offset "0", limit "1000" and total "true"
// with an empty CSRF token, mirroring the captured browser request. An error
// is returned when the HTTP call fails or when the response body is not valid
// JSON for PlayRecordResp; the decoding error used to be discarded, which
// made a malformed response look like an empty but successful result.
func (spider Music163Spider) GetPlayRecord(userId string) (record response.PlayRecordResp, err error) {
	reqBody := request.PlayRecordRequestBody{
		Uid:  userId,
		Type: "-1",
		BaseRequestBody: request.BaseRequestBody{
			Offset:    "0",
			Total:     "true",
			Limit:     "1000",
			CsrfToken: "",
		},
	}
	playRecordUrl := fmt.Sprintf("%s%s%s?csrf_token=", constants.HttpsPrefix, constants.Music163Host, constants.PlayRecord)

	result, err := spider.httpPost(playRecordUrl, spider.headers, reqBody)
	if err != nil {
		return response.PlayRecordResp{}, err
	}

	decoded := response.PlayRecordResp{}
	if err = json.Unmarshal(result, &decoded); err != nil {
		return response.PlayRecordResp{}, err
	}
	return decoded, nil
}
測試一下
// main fetches the listening ranking of a sample user and prints it as
// tab-indented JSON. If the request or the JSON encoding fails, the error is
// printed and nothing else is output, instead of printing an empty record as
// though the call had succeeded.
func main() {
	musicSpider := spider.NewMusic164Spider()
	record, err := musicSpider.GetPlayRecord("62947535")
	if err != nil {
		fmt.Println(err)
		return
	}
	jsonData, err := json.MarshalIndent(record, "", "\t")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(jsonData))
}
結果輸出