天天看點

HttpClient 實作酷狗 Top500 音樂下載

使用 Java 的 HttpClient 實作酷狗音樂 Top500 歌曲的下載

  1. 歌單URL

    https://www.kugou.com/yy/rank/home/1-8888.html?from=rank

存取這個 URL 可以看到 22 首歌曲的清單,把 1-8888 改成 2-8888 就可以看到接下來的 22 首

  2. 點一首歌進入播放頁,然後打開 Chrome 瀏覽器的開發者工具,在頁面原始碼中搜尋 mp3,就可以找到歌曲播放 URL
    HttpClient 實作酷狗 Top500 音樂下載下傳

但是用代碼抓取的時候,傳回源碼中是沒有這個MP3位址的,那肯定是因為網站使用了JS來加載MP3連結。

于是我們重新整理頁面,看看是哪個請求的響應中包含了這個MP3連結。

最終在

https://wwwapi.kugou.com/yy/index.php?r=play/getdata
&callback=jQuery19103526571885218994_1559220496485
&hash=448A90C4561C32FEC965970C9F401411
&album_id=10852208
&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0
&mid=9517924789f90af8e2d59c827583cdd2
&platid=4
&_=1559220496486
           

這個請求的響應中找到了MP3連結

HttpClient 實作酷狗 Top500 音樂下載下傳

我們把它複制到浏覽器的位址欄中進行請求

HttpClient 實作酷狗 Top500 音樂下載下傳

可以正常進行播放。

抓取步驟:

  1. 通路歌單頁
  2. 找到歌單清單,找到每首歌的播放頁面URL進行請求
  3. 在響應中找到 hash 值(通過 hash 值確定是哪首歌)

var dataFromSmarty = [{"hash":"448A90C4561C32FEC965970C9F401411",

"timelength":"204002",

"audio_name":"\u674e\u4e3d - \u6070\u6070\u76f8\u53cd",

"author_name":"\u674e\u4e3d",

"song_name":"\u6070\u6070\u76f8\u53cd",

"album_id":0}],//目前頁面歌曲資訊

playType = "search_single";//目前播放

  4. 將 hash 值填充到擷取 MP3 連結的 URL 中,進行請求

  5. 擷取到這首歌的 MP3 連結,進行下載、儲存,抓取完成

HttpClient 實作酷狗 Top500 音樂下載下傳

實作代碼:

package kugouspider;

import net.sf.json.JSONObject;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static kugouspider.Parse.parseTitleAndPlayurl;

/**
 * Crawler entry point.
 *
 * <p>Fetches a KuGou ranking page, extracts each song's title and play-page URL,
 * reads the song hash from the play page, asks the {@code play/getdata} endpoint
 * for the MP3 URL and hands that URL to {@link MusicDownload}.
 */
public class Crawler {

    private static final Log log = LogFactory.getLog(Crawler.class);

    // Directory the MP3 files are written to; the file name is appended directly,
    // so the value must end with a path separator.
    public static String DOWNLOAD_FILEPATH = "I:\\Java\\6_project\\KuGouSpider\\music\\";
    // Ranking page URL template. The PAGE placeholder is replaced by the page number.
    public static String MUSICLIST_LINK = "https://www.kugou.com/yy/rank/home/PAGE-8888.html?from=rank";
    // URL template of the request whose JSONP response contains the MP3 link.
    // HASH, TIME01 and TIME02 are placeholders filled in by downLoad().
    public static String MP3_LINK = "https://wwwapi.kugou.com/yy/index.php?r=play/getdata" +
            "&callback=jQuery19103526571885218994_TIME01" +
            "&hash=HASH" +
            "&album_id=10852208" +
            "&dfid=2lP8VL1Bb0DP09ymCH4Wx9F0" +
            "&mid=9517924789f90af8e2d59c827583cdd2" +
            "&platid=4" +
            "&_=TIME02";

    // Matches the first "hash":"..." pair in the play page; group 1 is the hash itself.
    private static final Pattern HASH_PATTERN = Pattern.compile("\"hash\":\"([0-9A-Z]+)\"");
    // Characters that are illegal in Windows file names or would change the target directory.
    private static final Pattern ILLEGAL_FILENAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Manual smoke test: downloads one hard-coded song.
     */
    public static void test() {
        String title = "陳雪凝 - 你的酒館對我打了烊";
        String playHref = "https://www.kugou.com/song/tlk6517.html";
        downLoad(title, playHref);
    }

    /**
     * Downloads every song listed on the ranking pages selected by the loop bounds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): the bounds restrict the run to page 4 only; widen them to crawl more pages.
        for (int i = 4; i <= 4; i++) {
            // Replace PAGE with i to address one ranking page,
            // e.g. https://www.kugou.com/yy/rank/home/1-8888.html?from=rank is page one.
            String link = MUSICLIST_LINK.replace("PAGE", i + "");
            String content = getMusicList(link);
            if (content == null) {
                // getRequest() already logged the cause; parsing null would only throw.
                log.error("Could not fetch ranking page, skipping: " + link);
                continue;
            }
            // parseTitleAndPlayurl may be declared with a raw Map return type.
            @SuppressWarnings("unchecked")
            Map<String, String> songList = parseTitleAndPlayurl(content);
            // Key is the song title, value is the URL of its play page.
            for (Map.Entry<String, String> entry : songList.entrySet()) {
                try {
                    downLoad(entry.getKey(), entry.getValue());
                } catch (Exception e) {
                    // One broken song must not abort the rest of the page; keep the stack trace.
                    log.error("--下載下傳出錯: " + e, e);
                }
            }
        }
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param musicListLink URL to request
     * @return the response body, or {@code null} when the request fails
     */
    public static String getRequest(String musicListLink) {
        // A client is created per request; try-with-resources releases its connection
        // manager, which was previously leaked on every call.
        try (CloseableHttpClient httpClient = HttpClients.custom().build()) {
            HttpGet getMusicList = new HttpGet(musicListLink);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            getMusicList.setConfig(requestConfig);
            // Browser-like request headers.
            getMusicList.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            getMusicList.addHeader("Accept-Encoding", "gzip,deflate,sdch");
            getMusicList.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
            getMusicList.addHeader("Connection", "keep-alive");
            getMusicList.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            try (CloseableHttpResponse httpResponse = httpClient.execute(getMusicList)) {
                // "UTF-8" is only the fallback used when the response declares no charset;
                // the library default would be ISO-8859-1, which garbles Chinese titles.
                return EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            }
        } catch (Exception e) {
            log.error("GET request failed: " + musicListLink, e);
        }
        return null;
    }

    /**
     * Fetches the HTML of a ranking page.
     *
     * @param musicListUrl URL of the ranking page
     * @return the page source, or {@code null} when the request fails
     */
    public static String getMusicList(String musicListUrl) {
        return getRequest(musicListUrl);
    }

    /**
     * Fetches the HTML of a song's play page.
     *
     * @param playHref URL of the play page
     * @return the page source, or {@code null} when the request fails
     */
    public static String getPlayPage(String playHref) {
        return getRequest(playHref);
    }

    /**
     * Requests the endpoint that returns the song data (including the MP3 URL).
     *
     * @param mp3Link fully populated {@link #MP3_LINK}
     * @return the raw JSONP response, or {@code null} when the request fails
     */
    public static String getDownloadUrl(String mp3Link) {
        return getRequest(mp3Link);
    }

    /**
     * Resolves the MP3 URL of one song and stores the file under {@link #DOWNLOAD_FILEPATH}.
     *
     * <p>Failures to fetch or interpret a page are logged and end the method without
     * downloading; a malformed JSON payload still surfaces as an exception from the JSON parser.
     *
     * @param title    song title, used (after sanitising) as the file name
     * @param playHref URL of the song's play page
     */
    public static void downLoad(String title, String playHref) {
        // Source of the play page; null means the request failed.
        String playPage = getPlayPage(playHref);
        if (playPage == null) {
            log.error("Could not fetch play page for " + title + ": " + playHref);
            return;
        }

        // The hash identifies the song towards the getdata endpoint.
        Matcher matcher = HASH_PATTERN.matcher(playPage);
        if (!matcher.find()) {
            log.error("No song hash found in play page for " + title + ": " + playHref);
            return;
        }
        String hash = matcher.group(1);

        // Fill the timestamps first: they are digits only, so they can never be mistaken
        // for the HASH placeholder, whereas a hash could in theory contain "TIME01".
        String now = String.valueOf(System.currentTimeMillis());
        String mp3Link = MP3_LINK
                .replace("TIME01", now)
                .replace("TIME02", now)
                .replace("HASH", hash);

        String response = getDownloadUrl(mp3Link);
        if (response == null) {
            log.error("Could not fetch song data for " + title);
            return;
        }
        // The response is JSONP: callback({...}); locate the parentheses instead of
        // assuming a fixed two-character suffix.
        int open = response.indexOf('(');
        int close = response.lastIndexOf(')');
        if (open < 0 || close <= open) {
            log.error("Unexpected song data response for " + title + ": " + response);
            return;
        }
        JSONObject jsonObject = JSONObject.fromObject(response.substring(open + 1, close));
        // "data" is not an object when the endpoint reports an error.
        JSONObject data = jsonObject.optJSONObject("data");
        if (data == null) {
            log.error("Song data response carries no data object for " + title + ": " + response);
            return;
        }
        String playUrl = data.optString("play_url");
        if ("".equals(playUrl)) {
            System.out.println(title +  " 是收費歌曲,不能下載下傳");
            return;
        }

        System.out.println("--開始下載下傳: " + title);
        MusicDownload musicDownload = new MusicDownload();
        // The title comes from the web page: strip separators and other illegal characters
        // so it cannot break the write or escape the download directory.
        String fileName = sanitizeFileName(title, hash);
        boolean success = musicDownload.download(playUrl, DOWNLOAD_FILEPATH + fileName + ".mp3");
        if (success) {
            System.out.println(title + "---下載下傳完成---");
        } else {
            System.out.println(title + "---下載下傳失敗---");
        }
    }

    /**
     * Replaces characters that are illegal in file names with an underscore.
     *
     * @param name     proposed file name (without extension), may be {@code null}
     * @param fallback name to use when nothing usable remains
     * @return a file name free of path separators and reserved characters
     */
    private static String sanitizeFileName(String name, String fallback) {
        String cleaned = name == null
                ? ""
                : ILLEGAL_FILENAME_CHARS.matcher(name).replaceAll("_").trim();
        return cleaned.isEmpty() ? fallback : cleaned;
    }

}

           
package kugouspider;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Parser for KuGou ranking pages.
 */
public class Parse {

    /**
     * Extracts song titles and play-page URLs from the HTML of a ranking page.
     *
     * <p>Every call returns a fresh map. The result used to live in a shared static map,
     * so parsing a second page returned the songs of all earlier pages again.
     *
     * @param content HTML source of the ranking page; may be {@code null}
     * @return song title mapped to play-page URL, in page order (a {@link LinkedHashMap});
     *         empty when {@code content} is {@code null} or holds no song list
     */
    public static Map<String, String> parseTitleAndPlayurl(String content) {
        // LinkedHashMap keeps the songs in the order they appear on the page.
        Map<String, String> songs = new LinkedHashMap<>();
        if (content == null) {
            return songs;
        }
        // Build a DOM from the HTML source.
        Document document = Jsoup.parse(content);
        Elements songLists = document.getElementsByClass("pc_temp_songlist");
        if (songLists.isEmpty()) {
            // Layout changed or an error page was returned: nothing to extract.
            return songs;
        }
        for (Element item : songLists.first().getElementsByTag("li")) {
            // The song title is read from the <li> title attribute,
            // the play-page URL from its first <a>.
            Element anchor = item.getElementsByTag("a").first();
            if (anchor == null) {
                continue;
            }
            String title = item.attr("title").trim();
            String playHref = anchor.attr("href");
            if (title.isEmpty() || playHref.isEmpty()) {
                // Not a usable song entry.
                continue;
            }
            songs.put(title, playHref);
        }
        return songs;
    }
}

           
package kugouspider;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;

/**
 * Downloads a file over HTTP and stores it on disk.
 */
public class MusicDownload {

    // Number of requests made before giving up on a URL.
    private static final int MAX_ATTEMPTS = 3;

    /**
     * Downloads {@code url} to {@code path}, retrying while the server answers with a
     * status other than 200, up to {@link #MAX_ATTEMPTS} requests in total.
     *
     * <p>The HTTP client, every response and both streams are closed before returning.
     * A partially written file is removed when the download fails.
     *
     * @param url  address of the file to download
     * @param path target file; missing parent directories are created
     * @return {@code true} when the file was written completely, {@code false} otherwise
     */
    public boolean download(String url, String path) {

        boolean saved = false;
        // Set once this call has started writing the target file, so that a failed
        // download only ever deletes a file it created itself.
        boolean fileTouched = false;
        File file = new File(path);

        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(5000)
                .setConnectTimeout(5000).build();

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(url);
            get.setConfig(requestConfig);

            for (int i = 0; i < MAX_ATTEMPTS && !saved; i++) {
                // Closing the response on every attempt returns the connection to the pool;
                // otherwise repeated attempts exhaust the per-route limit and block.
                try (CloseableHttpResponse result = httpclient.execute(get)) {
                    System.out.println(result.getStatusLine());
                    if (result.getStatusLine().getStatusCode() != 200 || result.getEntity() == null) {
                        continue;
                    }
                    File parent = file.getParentFile();
                    if (parent != null && !parent.exists() && !parent.mkdirs()) {
                        System.out.println("Cannot create directory " + parent);
                        break;
                    }
                    fileTouched = true;
                    try (BufferedInputStream in = new BufferedInputStream(result.getEntity().getContent());
                         BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                        byte[] buffer = new byte[8192];
                        int len;
                        while ((len = in.read(buffer, 0, buffer.length)) > -1) {
                            out.write(buffer, 0, len);
                        }
                    }
                    // Reached only after both streams were flushed and closed successfully.
                    saved = true;
                }
            }
        } catch (Exception e) {
            // A failure after the file was fully written (e.g. while closing the client)
            // leaves saved == true on purpose.
            e.printStackTrace();
        }

        if (!saved && fileTouched && file.exists() && !file.delete()) {
            System.out.println("Cannot remove incomplete file " + file);
        }
        return saved;
    }


}