使用java的HttpClient实现酷狗音乐Top500歌曲的下载
-
歌单URL
https://www.kugou.com/yy/rank/home/1-8888.html?from=rank
访问这个 URL 可以看到一页歌曲列表(每页 22 首);把 URL 中的 1-8888 改成 2-8888,就可以看到第 2 页,也就是接下来的 22 首。
- 点一首歌进入播放页,然后打开谷歌浏览器控制台,在页面源码搜索mp3,就可以找到歌曲播放URL
HttpClient 实现酷狗 Top500 音乐下载
但是用代码抓取播放页时,返回的源码中并没有这个 MP3 地址,这说明网站是通过 JS 发起额外请求来加载 MP3 链接的。
于是我们刷新页面,看看是哪个请求的响应中包含了这个MP3链接。
最终在
https://wwwapi.kugou.com/yy/index.php?r=play/getdata
&callback=jQuery19103526571885218994_1559220496485
&hash=448A90C4561C32FEC965970C9F401411
&album_id=10852208
&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0
&mid=9517924789f90af8e2d59c827583cdd2
&platid=4
&_=1559220496486
这个请求的响应中找到了MP3链接
我们把它复制到浏览器的地址栏中进行请求
可以正常进行播放。
抓取步骤:
1. 访问歌单页
2. 找到歌单列表,找到每首歌的播放页面 URL 进行请求
3. 在响应中找到 hash 值(通过 hash 值确定是哪首歌)
var dataFromSmarty = [{"hash":"448A90C4561C32FEC965970C9F401411",
"timelength":"204002",
"audio_name":"\u674e\u4e3d - \u6070\u6070\u76f8\u53cd",
"author_name":"\u674e\u4e3d",
"song_name":"\u6070\u6070\u76f8\u53cd",
"album_id":0}],//当前页面歌曲信息
playType = "search_single";//当前播放
4. 将 hash 值填充到获取 MP3 链接的 URL 中,进行请求
5. 获取到这首歌的 MP3 链接,进行下载、存储,抓取完成
实现代码:
package kugouspider;
import net.sf.json.JSONObject;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static kugouspider.Parse.parseTitleAndPlayurl;
/**
 * Crawler entry point.
 *
 * <p>Fetches a KuGou ranking page, extracts each song's title and play-page URL,
 * reads the song hash from the play page, asks the {@code play/getdata} endpoint
 * for the MP3 URL and hands that URL to {@link MusicDownload} for storage.
 */
public class Crawler {
    private static final Log log = LogFactory.getLog(Crawler.class);

    // Directory the downloaded songs are written to (must end with a separator).
    public static String DOWNLOAD_FILEPATH = "I:\\Java\\6_project\\KuGouSpider\\music\\";

    // Ranking page URL template; the PAGE token is replaced by the page number.
    public static String MUSICLIST_LINK = "https://www.kugou.com/yy/rank/home/PAGE-8888.html?from=rank";

    // URL template of the request that returns the MP3 link.
    // HASH, TIME01 and TIME02 are placeholders filled in by downLoad().
    public static String MP3_LINK = "https://wwwapi.kugou.com/yy/index.php?r=play/getdata" +
            "&callback=jQuery19103526571885218994_TIME01" +
            "&hash=HASH" +
            "&album_id=10852208" +
            "&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0" +
            "&mid=9517924789f90af8e2d59c827583cdd2" +
            "&platid=4" +
            "&_=TIME02";

    // Inclusive range of ranking pages processed by main().
    // NOTE(review): only page 4 is crawled; the accompanying article says each page
    // lists 22 songs, so the full Top 500 would presumably need pages 1..23 - confirm.
    private static final int FIRST_PAGE = 4;
    private static final int LAST_PAGE = 4;

    // Captures the song hash embedded in the play page source ("hash":"<HASH>").
    private static final Pattern HASH_PATTERN = Pattern.compile("\"hash\":\"([0-9A-Z]+)\"");

    // Characters that are not allowed in Windows file names (and '/' on every platform).
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Manual smoke test: downloads one hard-coded song.
     */
    public static void test() {
        String title = "陈雪凝 - 你的酒馆对我打了烊";
        String playHref = "https://www.kugou.com/song/tlk6517.html";
        downLoad(title, playHref);
    }

    /**
     * Crawls every ranking page in [FIRST_PAGE, LAST_PAGE] and downloads each listed song.
     * A failure on one page or one song is logged and does not stop the run.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            // Substitute the page number into the template,
            // e.g. https://www.kugou.com/yy/rank/home/1-8888.html?from=rank is page 1.
            String link = MUSICLIST_LINK.replace("PAGE", String.valueOf(page));
            String content = getMusicList(link);
            if (content == null) {
                // getRequest() already logged the cause; skip instead of feeding null to the parser.
                log.error("Failed to fetch ranking page: " + link);
                continue;
            }
            // The parser may be declared with a raw Map return type; its entries are String -> String.
            @SuppressWarnings("unchecked")
            Map<String, String> songList = parseTitleAndPlayurl(content);
            // key = song title, value = play-page URL
            for (Map.Entry<String, String> entry : songList.entrySet()) {
                try {
                    downLoad(entry.getKey(), entry.getValue());
                } catch (Exception e) {
                    // Pass the exception itself so the stack trace is not lost.
                    log.error("--下载出错: " + entry.getKey(), e);
                }
            }
        }
    }

    /**
     * Performs a GET request with browser-like headers and returns the response body.
     *
     * @param musicListLink the URL to request
     * @return the response body as text, or {@code null} if the request failed
     */
    public static String getRequest(String musicListLink) {
        // try-with-resources closes both the client and the response, releasing the connection.
        try (CloseableHttpClient httpClient = HttpClients.custom().build()) {
            HttpGet request = new HttpGet(musicListLink);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            request.setConfig(requestConfig);
            // Request headers imitating a desktop browser.
            request.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            request.addHeader("Accept-Encoding", "gzip,deflate,sdch");
            request.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
            request.addHeader("Connection", "keep-alive");
            request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            try (CloseableHttpResponse httpResponse = httpClient.execute(request)) {
                // UTF-8 is only the fallback used when the response declares no charset.
                return EntityUtils.toString(httpResponse.getEntity(), StandardCharsets.UTF_8);
            }
        } catch (Exception e) {
            log.error("GET request failed: " + musicListLink, e);
            return null;
        }
    }

    /**
     * Fetches the source of a ranking (song list) page.
     *
     * @param musicListUrl URL of the ranking page
     * @return the page source, or {@code null} if the request failed
     */
    public static String getMusicList(String musicListUrl) {
        return getRequest(musicListUrl);
    }

    /**
     * Fetches the source of a song's play page.
     *
     * @param playHref URL of the play page
     * @return the page source, or {@code null} if the request failed
     */
    public static String getPlayPage(String playHref) {
        return getRequest(playHref);
    }

    /**
     * Requests the endpoint whose response contains the MP3 link.
     *
     * @param mp3Link fully substituted {@link #MP3_LINK} URL
     * @return the raw response body, or {@code null} if the request failed
     */
    public static String getDownloadUrl(String mp3Link) {
        return getRequest(mp3Link);
    }

    /**
     * Resolves the MP3 URL of one song and stores the file under {@link #DOWNLOAD_FILEPATH}.
     *
     * <p>Returns without downloading (after logging) when the play page or the MP3-link
     * response cannot be fetched, when no hash is found, or when the response is not a
     * {@code callback(...)} wrapper. JSON errors from json-lib (for example a response
     * without a {@code data} object) still propagate to the caller.
     *
     * @param title    song title; used as the file name after illegal characters are replaced
     * @param playHref URL of the song's play page
     */
    public static void downLoad(String title, String playHref) {
        // Source of the play page, which embeds the song hash.
        String playPage = getPlayPage(playHref);
        if (playPage == null) {
            log.error("Failed to fetch play page: " + playHref);
            return;
        }
        Matcher matcher = HASH_PATTERN.matcher(playPage);
        if (!matcher.find()) {
            // Without a hash the MP3-link request cannot identify the song.
            log.error("No song hash found in play page: " + playHref);
            return;
        }
        String hash = matcher.group(1);

        // Fill the placeholders of the MP3-link request URL.
        String now = String.valueOf(System.currentTimeMillis());
        String mp3Link = MP3_LINK.replace("HASH", hash)
                .replace("TIME01", now)
                .replace("TIME02", now);

        String response = getDownloadUrl(mp3Link);
        if (response == null) {
            log.error("Failed to fetch MP3 link for: " + title);
            return;
        }
        // The response is wrapped as callback({...}); locate the outermost parentheses
        // instead of assuming a fixed two-character suffix.
        int open = response.indexOf('(');
        int close = response.lastIndexOf(')');
        if (open < 0 || close <= open) {
            log.error("Unexpected MP3 link response for: " + title);
            return;
        }
        JSONObject jsonObject = JSONObject.fromObject(response.substring(open + 1, close));
        String playUrl = jsonObject.getJSONObject("data").getString("play_url");
        if ("".equals(playUrl)) {
            System.out.println(title + " 是收费歌曲,不能下载");
            return;
        }

        System.out.println("--开始下载: " + title);
        // Titles can contain characters such as '/' or '?' that are invalid in file names.
        String fileName = ILLEGAL_FILE_CHARS.matcher(title).replaceAll("_");
        boolean success = new MusicDownload().download(playUrl, DOWNLOAD_FILEPATH + fileName + ".mp3");
        if (success) {
            System.out.println(title + "---下载完成---");
        } else {
            System.out.println(title + "---下载失败---");
        }
    }
}
package kugouspider;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Parser for the ranking page HTML.
 */
public class Parse {
    /**
     * Extracts song titles and play-page URLs from the source of a ranking page.
     *
     * <p>Each call returns a fresh map, so results of earlier pages never leak into
     * later ones. A {@link LinkedHashMap} keeps the songs in page order.
     *
     * @param content HTML source of the ranking page; may be {@code null}
     * @return map of song title to play-page URL, in page order; empty when
     *         {@code content} is {@code null} or holds no {@code pc_temp_songlist} element
     */
    public static Map<String, String> parseTitleAndPlayurl(String content) {
        Map<String, String> songs = new LinkedHashMap<>();
        if (content == null) {
            return songs;
        }
        // Turn the HTML source into a document that can be queried.
        Document document = Jsoup.parse(content);
        // first() yields null instead of throwing when the container is missing.
        Element songListElement = document.getElementsByClass("pc_temp_songlist").first();
        if (songListElement == null) {
            return songs;
        }
        Elements items = songListElement.getElementsByTag("li");
        for (Element item : items) {
            Element anchor = item.getElementsByTag("a").first();
            if (anchor == null) {
                // An <li> without a link has no play page to follow.
                continue;
            }
            // The title is read from the <li>'s "title" attribute, the URL from its first <a>.
            songs.put(item.attr("title").trim(), anchor.attr("href"));
        }
        return songs;
    }
}
package kugouspider;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
/**
 * Downloads a remote file over HTTP and stores it on disk.
 */
public class MusicDownload {
    // Number of times the request is issued before giving up on a non-200 status.
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Downloads {@code url} into the file at {@code path}.
     *
     * <p>The request is repeated up to {@link #MAX_ATTEMPTS} times while the server answers
     * with a status other than 200. Any exception (connection error, timeout, I/O error
     * while writing or closing) ends the download and yields {@code false}. Missing parent
     * directories of {@code path} are created.
     *
     * @param url  URL of the file to download
     * @param path destination file path
     * @return {@code true} if the whole body was written and the streams closed cleanly,
     *         {@code false} otherwise
     */
    public boolean download(String url, String path) {
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(5000)
                .setConnectTimeout(5000).build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        // try-with-resources closes the client, every response and both streams,
        // even when an earlier close() throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    System.out.println(result.getStatusLine());
                    if (result.getStatusLine().getStatusCode() != 200) {
                        // The response is closed on leaving this block, then the request is retried.
                        continue;
                    }
                    File file = new File(path);
                    File parent = file.getParentFile();
                    if (parent != null) {
                        // If this fails, opening the output stream below reports the error.
                        parent.mkdirs();
                    }
                    try (BufferedInputStream in = new BufferedInputStream(result.getEntity().getContent());
                         BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                        byte[] buffer = new byte[1024];
                        int len;
                        while ((len = in.read(buffer, 0, buffer.length)) > -1) {
                            out.write(buffer, 0, len);
                        }
                    }
                    // Reached only after the output stream was flushed and closed without error.
                    return true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            get.releaseConnection();
        }
        return false;
    }
}