使用 Java 的 HttpClient 實作酷狗音樂 Top500 歌曲的下載
-
歌單URL
https://www.kugou.com/yy/rank/home/1-8888.html?from=rank
訪問這個 URL 可以看到 22 首歌曲的清單,把 1-8888 改成 2-8888 就可以看到接下來的 22 首
- 點一首歌進入播放頁,然後打開谷歌瀏覽器控制台,在頁面源碼中搜尋 mp3,就可以找到歌曲播放 URL
HttpClient 實作酷狗 Top500 音樂下載
但是用代碼抓取的時候,傳回源碼中是沒有這個MP3位址的,那肯定是因為網站使用了JS來加載MP3連結。
于是我們重新整理頁面,看看是哪個請求的響應中包含了這個MP3連結。
最終在
https://wwwapi.kugou.com/yy/index.php?r=play/getdata
&callback=jQuery19103526571885218994_1559220496485
&hash=448A90C4561C32FEC965970C9F401411
&album_id=10852208
&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0
&mid=9517924789f90af8e2d59c827583cdd2
&platid=4
&_=1559220496486
這個請求的響應中找到了MP3連結
我們把它複製到瀏覽器的網址列中進行請求
可以正常進行播放。
抓取步驟:
1. 訪問歌單頁
2. 找到歌單清單,對每首歌的播放頁面 URL 進行請求
3. 在響應中找到 hash 值(通過 hash 值確定是哪首歌)
var dataFromSmarty = [{"hash":"448A90C4561C32FEC965970C9F401411",
"timelength":"204002",
"audio_name":"\u674e\u4e3d - \u6070\u6070\u76f8\u53cd",
"author_name":"\u674e\u4e3d",
"song_name":"\u6070\u6070\u76f8\u53cd",
"album_id":0}],//目前頁面歌曲資訊
playType = "search_single";//目前播放
4. 將 hash 值填充到擷取 MP3 連結的 URL 中,進行請求
5. 擷取到這首歌的 MP3 連結後,進行下載、存儲,抓取完成
實作代碼:
package kugouspider;
import net.sf.json.JSONObject;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static kugouspider.Parse.parseTitleAndPlayurl;
/**
* 抓取類
*/
public class Crawler {
private static Log log = LogFactory.getLog(Crawler.class);
// 儲存歌單的路徑
public static String DOWNLOAD_FILEPATH = "I:\\Java\\6_project\\KuGouSpider\\music\\";
// 歌單連結. 通過改變PAGE來擷取下一頁的内容
public static String MUSICLIST_LINK = "https://www.kugou.com/yy/rank/home/PAGE-8888.html?from=rank";
// 擷取MP3連結的請求URL
public static String MP3_LINK = "https://wwwapi.kugou.com/yy/index.php?r=play/getdata" +
"&callback=jQuery19103526571885218994_TIME01" +
"&hash=HASH" +
"&album_id=10852208" +
"&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0" +
"&mid=9517924789f90af8e2d59c827583cdd2" +
"&platid=4" +
"&_=TIME02";
public static void test() {
String title = "陳雪凝 - 你的酒館對我打了烊";
String playHref = "https://www.kugou.com/song/tlk6517.html";
downLoad(title, playHref);
}
public static void main(String[] args) {
for (int i = 4; i <= 4; i++) {
// 把PAGE用i替換,以擷取每一頁的内容
// https://www.kugou.com/yy/rank/home/1-8888.html?from=rank 就是第一頁歌單
String link = MUSICLIST_LINK.replace("PAGE", i + "");
String content = getMusicList(link);
// String content = Utils.getFile("I:\\Java\\6_project\\KuGouSpider\\out\\top01.html");
// System.out.println(content);
Map songList = parseTitleAndPlayurl(content);
// 周遊songList,擷取歌名和歌曲播放連結
Set<Map.Entry<String, String>> entrys = songList.entrySet();
for (java.util.Map.Entry<String, String> entry : entrys) {
try {
downLoad(entry.getKey(), entry.getValue());
} catch (Exception e) {
log.error("--下載下傳出錯: " + e);
}
}
}
}
/**
* 進行get請求
*
* @param musicListLink
* @return
*/
public static String getRequest(String musicListLink) {
// 先建立一個 httpclient
HttpClient httpClient = HttpClients.custom().build();
try {
// 建立get方法執行個體,設定URL
HttpGet getMusicList = new HttpGet(musicListLink);
RequestConfig requestConfig = RequestConfig.custom()
.setSocketTimeout(5000)
.setConnectTimeout(50000)
.setConnectionRequestTimeout(50000)
.build();
getMusicList.setConfig(requestConfig);
// 設定請求頭
getMusicList.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
getMusicList.addHeader("Accept-Encoding", "gzip,deflate,sdch");
getMusicList.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
getMusicList.addHeader("Connection", "keep-alive");
getMusicList.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
// 進行請求
HttpResponse httpResponse = httpClient.execute(getMusicList);
if (httpResponse != null) {
String content = EntityUtils.toString(httpResponse.getEntity());
// System.out.println(content);
return content;
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
/**
* 擷取歌單清單
*
* @param musicListUrl
* @return
*/
public static String getMusicList(String musicListUrl) {
return getRequest(musicListUrl);
}
public static String getPlayPage(String playHref) {
return getRequest(playHref);
}
public static String getDownloadUrl(String mp3Link) {
return getRequest(mp3Link);
}
/**
* 獲得歌曲下載下傳連結進行下載下傳存儲
*
* @param title
* @param playHref
*/
public static void downLoad(String title, String playHref) {
// 獲得歌曲播放頁面的源碼
String content = getPlayPage(playHref);
String hash = "";
// 利用正規表達式擷取歌曲的Hash值
String regEx = "\"hash\":\"[0-9A-Z]+\"";
Pattern pattern = Pattern.compile(regEx);
Matcher matcher = pattern.matcher(content);
if (matcher.find()) {
hash = matcher.group();
hash = hash.replace("\"hash\":\"", "");
hash = hash.replace("\"", "");
}
// 拼接獲得MP3下載下傳連結的請求URL
String mp3Link = MP3_LINK.replace("HASH", hash);
mp3Link = mp3Link.replace("TIME01", System.currentTimeMillis() + "");
mp3Link = mp3Link.replace("TIME02", System.currentTimeMillis() + "");
// 獲得這個URL之後,進行請求,解析響應源碼,就能得到歌曲的下載下傳URL
content = getDownloadUrl(mp3Link);
content = content.substring(content.indexOf("(") + 1, content.length() - 2);
// System.out.println(content);
JSONObject jsonObject = JSONObject.fromObject(content);
String playUrl = jsonObject.getJSONObject("data").getString("play_url");
if("".equals(playUrl)){
System.out.println(title + " 是收費歌曲,不能下載下傳");
return;
}
// System.out.println("MP3: " + playUrl);
// 接下來就是下載下傳歌曲
System.out.println("--開始下載下傳: " + title);
MusicDownload musicDownload = new MusicDownload();
Boolean success = musicDownload.download(playUrl, DOWNLOAD_FILEPATH + title + ".mp3");
if (success) {
System.out.println(title + "---下載下傳完成---");
} else {
System.out.println(title + "---下載下傳失敗---");
}
}
}
package kugouspider;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Parser for the ranking page HTML.
 */
public class Parse {
    /**
     * Extracts song titles and play page URLs from a ranking page.
     *
     * <p>Each call returns a fresh map, so results of earlier pages never leak into
     * later ones. A {@link LinkedHashMap} keeps the songs in page order.
     *
     * @param content HTML source of a ranking page; may be {@code null}
     * @return map of song title to play page URL; empty when {@code content} is
     *         {@code null} or contains no {@code pc_temp_songlist} element
     */
    public static Map<String, String> parseTitleAndPlayurl(String content) {
        Map<String, String> songs = new LinkedHashMap<>();
        if (content == null) {
            return songs;
        }
        Document document = Jsoup.parse(content);
        // first() yields null instead of throwing when the container is missing
        Element songList = document.getElementsByClass("pc_temp_songlist").first();
        if (songList == null) {
            return songs;
        }
        for (Element item : songList.getElementsByTag("li")) {
            Element link = item.getElementsByTag("a").first();
            if (link == null) {
                // An <li> without a link has no play page to visit.
                continue;
            }
            songs.put(item.attr("title").trim(), link.attr("href"));
        }
        return songs;
    }
}
package kugouspider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
/**
 * Downloads a file over HTTP and stores it on disk.
 */
public class MusicDownload {
    // Number of times the request is tried before giving up.
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Downloads {@code url} to {@code path}, trying up to {@value #MAX_ATTEMPTS} times.
     *
     * <p>An attempt fails when the status is not 200 or when an exception occurs while
     * requesting or writing. Missing parent directories are created. The body is first
     * written to {@code path + ".part"} and moved into place only after it was written
     * completely, so a failed download never leaves a truncated file at {@code path}.
     *
     * @param url  address of the file to download
     * @param path destination file path
     * @return {@code true} when the file was stored completely, {@code false} otherwise
     */
    public boolean download(String url, String path) {
        boolean downloaded = false;
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(5000)
                .setConnectTimeout(5000).build();
        // The client is closed when the block exits, whatever the outcome.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            File target = new File(path);
            File parent = target.getAbsoluteFile().getParentFile();
            if (parent != null) {
                java.nio.file.Files.createDirectories(parent.toPath());
            }
            for (int attempt = 0; attempt < MAX_ATTEMPTS && !downloaded; attempt++) {
                HttpGet get = new HttpGet(url);
                get.setConfig(requestConfig);
                // Closing every response returns its connection to the pool before the next attempt.
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    System.out.println(result.getStatusLine());
                    if (result.getStatusLine().getStatusCode() == 200) {
                        writeBody(result, target);
                        downloaded = true;
                    }
                } catch (Exception e) {
                    // A failed attempt is reported and followed by the next one.
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return downloaded;
    }

    /**
     * Streams the response body to a temporary file and moves it onto {@code target}.
     */
    private static void writeBody(CloseableHttpResponse response, File target) throws java.io.IOException {
        if (response.getEntity() == null) {
            throw new java.io.IOException("Response has no body");
        }
        File partial = new File(target.getPath() + ".part");
        try {
            try (BufferedInputStream in = new BufferedInputStream(response.getEntity().getContent());
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(partial))) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) > -1) {
                    out.write(buffer, 0, len);
                }
            }
            // Only a completely written and closed file replaces the destination.
            java.nio.file.Files.move(partial.toPath(), target.toPath(),
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        } finally {
            // After a successful move the temporary file is gone; otherwise clean it up.
            if (partial.exists() && !partial.delete()) {
                System.err.println("Could not remove temporary file " + partial);
            }
        }
    }
}