天天看點

java爬取jd的所有圖書類資訊

maven 依賴

<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
         <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>
           

java代碼 (先根據 BASE_FILE_PATH 建立 jd_book 檔案夾)

然後執行下面的代碼即可:sql 檔案會在 jd_book 檔案夾下生成,導入 mysql 即可。注意:價格查詢接口大約被訪問幾千次後,就會被 jd 禁止訪問一段時間。

ip 代理要花錢,所以沒有使用。價格問題的一個解決方案是:爬取時不查詢價格,而是在用 java 讀取資料時再發起請求查詢,這時的查詢量少,不會被禁止。

當然,也可以在每次查詢價格時讓線程休眠一段時間,以減少請求的次數。

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Crawls the JD book categories listed in {@link #URL_TYPE} and writes one
 * {@code insert into jd_book_info ...} statement per book into text files.
 *
 * <p>The categories are split into at most {@link #MAX_THREADS} slices; each slice is crawled by
 * its own thread and written to its own file ({@link #BASE_FILE_PATH} with the slice number
 * substituted). URLs that could not be processed are appended to {@link #BASE_ERR_PATH}, which is
 * shared by all threads.
 *
 * <p>The output is a SQL script, not a live database connection, so values are embedded as quoted
 * string literals; see {@link #sqlQuote(String)} for the escaping that is applied.
 *
 * @Author: panlf
 * @Date: 2019/9/27 11:43
 */
public class JDBook {
    private static final String BASE_LIST_URL = "https://list.jd.com/";
    private static final String BASE_FILE_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\sqlresult_%s.txt";
    private static final String BASE_ERR_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\errorUrl.txt";

    /**
     * Category list paths, separated by a backtick. For a quick single-category test run, replace
     * the whole value with {@code "list.html?cat=1713,3261,3359"}.
     */
    private static final String URL_TYPE =
            "list.html?cat=1713,3261,3359`/list.html?cat=1713,3258`/list.html?cat=1713,3259`/list.html?cat=1713,3260`/list.html?cat=1713,3261`" +
            "/list.html?cat=1713,12775`/list.html?cat=1713,12776`/list.html?cat=1713,13627`/list.html?cat=1713,13634`/list.html?cat=1713,3262`" +
            "/list.html?cat=1713,3263`/list.html?cat=1713,3267`/list.html?cat=1713,3266`/list.html?cat=1713,3264`/list.html?cat=1713,3265`" +
            "/list.html?cat=1713,13613`/list.html?cat=1713,3270`/list.html?cat=1713,3271`/list.html?cat=1713,9278`/list.html?cat=1713,9291`" +
            "/list.html?cat=1713,9301`/list.html?cat=1713,9309`/list.html?cat=1713,9314`/list.html?cat=1713,3269`/list.html?cat=1713,3272`" +
            "/list.html?cat=1713,3273`/list.html?cat=1713,3279`/list.html?cat=1713,3276`/list.html?cat=1713,3275`/list.html?cat=1713,3274`" +
            "/list.html?cat=1713,3277`/list.html?cat=1713,3280`/list.html?cat=1713,3281`/list.html?cat=1713,3284`/list.html?cat=1713,3287`" +
            "/list.html?cat=1713,3285`/list.html?cat=1713,9340`/list.html?cat=1713,9368`/list.html?cat=1713,3286`/list.html?cat=1713,9351`" +
            "/list.html?cat=1713,3288`/list.html?cat=1713,3289`/list.html?cat=1713,3282`/list.html?cat=1713,11047`/list.html?cat=1713,3290`" +
            "/list.html?cat=1713,3291`/list.html?cat=1713,3294`/list.html?cat=1713,4758`/list.html?cat=1713,4855`/list.html?cat=1713,6929`" +
            "/list.html?cat=1713,14669`/list.html?cat=1713,3296`/list.html?cat=1713,11745";

    private static final String PRICE_JD = "https://p.3.cn/prices/mgets?skuIds=J_";

    /** Upper bound on the number of crawler threads started by {@link #main(String[])}. */
    private static final int MAX_THREADS = 10;
    /** Timeout in milliseconds for category index and listing pages. */
    private static final int LIST_TIMEOUT_MS = 5000;
    /** Timeout in milliseconds for book detail pages. */
    private static final int DETAIL_TIMEOUT_MS = 50000;
    /** Timeout in milliseconds for the price endpoint. */
    private static final int PRICE_TIMEOUT_MS = 50000;
    /** Value stored in the {@code author} column when the detail page has no author element. */
    private static final String NO_AUTHOR = "無作者資訊";
    /** Matches the {@code page} query parameter inside a pager link. */
    private static final Pattern PAGE_PARAM = Pattern.compile("&page=[0-9]*&");

    /** Request headers sent with every price query. */
    private static final Map<String, String> HEADER = new HashMap<>();
    static {
        // The Host header carries a host name only, never a scheme.
        HEADER.put("Host", "p.3.cn");
        HEADER.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0");
        HEADER.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        HEADER.put("Accept-Language", "zh-cn,zh;q=0.5");
        HEADER.put("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        HEADER.put("Connection", "keep-alive");
    }

    /**
     * Maps a label fragment found in the detail page's parameter list to the column it fills.
     * Checked in insertion order; the first fragment contained in the label wins.
     *
     * NOTE(review): the fragments must match the page text character for character - confirm
     * against a live detail page that the script (traditional vs. simplified) is the right one.
     */
    private static final Map<String, String> FIELD_COLUMNS = new LinkedHashMap<>();
    static {
        FIELD_COLUMNS.put("出版社:", "publisher");
        FIELD_COLUMNS.put("ISBN", "isbn");
        FIELD_COLUMNS.put("出版時間", "book_time");
        FIELD_COLUMNS.put("頁數", "page_num");
        FIELD_COLUMNS.put("商品編碼", "jd_code");
    }

    /**
     * Splits the category list into at most {@link #MAX_THREADS} slices, starts one crawler thread
     * per slice, waits for all of them and then closes the shared error file.
     *
     * @param args unused
     * @throws Exception if an output file cannot be opened or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        List<String> categories = Arrays.asList(URL_TYPE.split("`"));
        // Ceiling division so that the number of slices never exceeds MAX_THREADS.
        int sliceSize = Math.max(1, (categories.size() + MAX_THREADS - 1) / MAX_THREADS);

        BufferedWriter errOut = getBufferedOut(BASE_ERR_PATH);
        List<Thread> workers = new ArrayList<>();
        try {
            int sliceNo = 0;
            for (int from = 0; from < categories.size(); from += sliceSize, sliceNo++) {
                int to = Math.min(from + sliceSize, categories.size());
                List<String> slice = new ArrayList<>(categories.subList(from, to));
                workers.add(startThread(
                        getBufferedOut(String.format(BASE_FILE_PATH, sliceNo)),
                        errOut,
                        slice,
                        String.format("第%d條線程", sliceNo)));
            }
        } finally {
            // Workers still write to errOut, so it may only be closed once they are all done.
            // If this wait is interrupted the exception propagates and errOut is left open.
            for (Thread worker : workers) {
                worker.join();
            }
            errOut.close();
        }
    }

    /**
     * Crawls every category in {@code list} and writes the generated statements to {@code out}.
     * A category that fails is recorded in {@code outErr} and the crawl continues with the next
     * one. {@code out} is always closed before this method returns; {@code outErr} is not.
     *
     * @param out    destination for the generated insert statements; closed by this method
     * @param outErr shared destination for URLs that could not be processed
     * @param list   category paths relative to {@link #BASE_LIST_URL}
     * @throws Exception if writing to {@code outErr} or closing {@code out} fails
     */
    private static void searchJD(BufferedWriter out, BufferedWriter outErr, List<String> list) throws Exception {
        int index = 0;
        try {
            for (String one : list) {
                String categoryUrl = listUrl(one);
                try {
                    index = crawlCategory(out, outErr, categoryUrl, index);
                } catch (Exception e) {
                    logFailure(outErr, categoryUrl, e);
                    outErr.flush();
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Walks every listing page of one category and writes one statement per book.
     *
     * @param index number of books written so far by the calling thread
     * @return the updated number of books written
     * @throws IOException if the category index page cannot be fetched or an error-log write fails
     */
    private static int crawlCategory(BufferedWriter out, BufferedWriter outErr, String categoryUrl, int index)
            throws IOException {
        Document indexPage = Jsoup.parse(new URL(categoryUrl), LIST_TIMEOUT_MS);
        String pagerHref = indexPage.select(".p-num").get(0).child(1).attr("href");
        int maxPage = Integer.parseInt(indexPage.select(".p-skip").get(0).child(0).select("b").html());

        // Visit every page.
        for (int page = 1; page <= maxPage; page++) {
            // Substitute the page number directly instead of going through String.format, which
            // would fail on any other '%' in the link (e.g. URL-encoded characters).
            String pageUrl = listUrl(PAGE_PARAM.matcher(pagerHref).replaceAll("&page=" + page + "&"));

            List<String> bookUrls;
            try {
                bookUrls = bookUrlsOn(pageUrl);
            } catch (Exception e) {
                logFailure(outErr, pageUrl, e);
                outErr.flush();
                continue;
            }

            // Visit every book on the page.
            for (String url : bookUrls) {
                try {
                    out.write(buildInsert(url) + "\r\n");
                    System.out.println(Thread.currentThread().getName() + " 第" + (++index) + "條");
                    out.flush();
                } catch (Exception e) {
                    logFailure(outErr, url, e);
                }
            }
            outErr.flush();
        }
        return index;
    }

    /** Returns the detail-page URLs of all books shown on one listing page. */
    private static List<String> bookUrlsOn(String pageUrl) throws IOException {
        Document pageDetail = Jsoup.parse(new URL(pageUrl), LIST_TIMEOUT_MS);
        return pageDetail.select("#plist").get(0).select(".gl-item").stream()
                .map(item -> "https:" + item.select(".p-name").get(0).child(0).attr("href"))
                .collect(Collectors.toList());
    }

    /**
     * Fetches one book detail page and builds its insert statement.
     *
     * @param url absolute URL of the book detail page
     * @return a single {@code insert into jd_book_info (...) values (...);} statement
     * @throws IOException if the detail page cannot be fetched
     */
    private static String buildInsert(String url) throws IOException {
        // Column name -> value, in the order the columns appear in the statement.
        Map<String, String> row = new LinkedHashMap<>();
        row.put("id", UUID.randomUUID().toString().replaceAll("-", ""));

        Document detail = Jsoup.parse(new URL(url), DETAIL_TIMEOUT_MS);
        Element crumb = detail.select(".crumb.fl.clearfix").get(0);

        // Book title.
        row.put("book_name", crumb.select(".item.ellipsis").get(0).html());

        // Category path, built from the breadcrumb links.
        List<Element> crumbLinks = crumb.select("a");
        if (crumbLinks.isEmpty()) {
            throw new IllegalStateException("no category links in breadcrumb: " + url);
        }
        row.put("book_type", crumbLinks.stream().map(Element::html).collect(Collectors.joining(">")));

        // Author.
        row.put("author", authorOf(detail));

        // Publisher, ISBN, publication date, page count and JD product code.
        for (Element param : detail.select("#parameter2").get(0).children()) {
            String column = columnFor(param.html());
            if (column != null) {
                // Keep the first occurrence so that a column is never listed twice.
                row.putIfAbsent(column, param.attr("title"));
            }
        }

        // The price is looked up by the product code found above.
        String price = getPrice(row.get("jd_code"));
        if (StringUtils.isNotBlank(price)) {
            row.put("price", price);
        }
        row.put("detail_url", url);

        String columns = String.join(",", row.keySet());
        String values = row.values().stream().map(JDBook::sqlQuote).collect(Collectors.joining(","));
        return String.format("insert into jd_book_info (%s) values (%s);", columns, values);
    }

    /** Returns the author markup of a detail page, or {@link #NO_AUTHOR} when there is none. */
    private static String authorOf(Document detail) {
        List<Element> holders = detail.select("#p-author");
        if (holders.isEmpty() || holders.get(0).children().isEmpty()) {
            return NO_AUTHOR;
        }
        return holders.get(0).child(0).html();
    }

    /** Returns the column for a parameter label, or {@code null} if the label is not of interest. */
    private static String columnFor(String label) {
        for (Map.Entry<String, String> field : FIELD_COLUMNS.entrySet()) {
            if (label.contains(field.getKey())) {
                return field.getValue();
            }
        }
        return null;
    }

    /**
     * Wraps a value in single quotes for use as a MySQL string literal, doubling embedded single
     * quotes and backslashes so that scraped text cannot terminate the literal early.
     * A {@code null} value is written as an empty string.
     */
    private static String sqlQuote(String value) {
        String text = value == null ? "" : value;
        return "'" + text.replace("\\", "\\\\").replace("'", "''") + "'";
    }

    /** Joins a site-relative path onto {@link #BASE_LIST_URL} without producing a double slash. */
    private static String listUrl(String relative) {
        String path = relative;
        while (path.startsWith("/")) {
            path = path.substring(1);
        }
        return BASE_LIST_URL + path;
    }

    /** Records a URL that could not be processed and prints the cause to standard error. */
    private static void logFailure(BufferedWriter outErr, String url, Exception cause) throws IOException {
        // One write call per line, so lines from different threads are not split across calls.
        outErr.write("錯誤url: " + url + "\r\n");
        cause.printStackTrace();
    }

    /**
     * Queries the price endpoint for one product code.
     *
     * @param bookOrder JD product code; may be blank when the detail page did not provide one
     * @return the {@code m} field of the first element of the JSON response, or {@code null} when
     *         the code is blank or the request or parsing fails
     */
    private static String getPrice(String bookOrder) {
        if (StringUtils.isBlank(bookOrder)) {
            // Without a product code the request cannot succeed; skip it rather than spend a call.
            return null;
        }
        String priceUrl = PRICE_JD + bookOrder;
        try {
            Connection connect = Jsoup.connect(priceUrl)
                    .ignoreContentType(true)
                    .headers(HEADER)
                    .timeout(PRICE_TIMEOUT_MS);
            String body = connect.execute().body();
            List<JSONObject> list = JSONArray.parseArray(body, JSONObject.class);
            return list.get(0).get("m").toString();
        } catch (Exception e) {
            // Best effort: a missing price must not prevent the book from being written.
            System.out.println(e.getMessage() + "url: " + priceUrl);
            return null;
        }
    }

    /**
     * Starts a named thread that crawls the given categories.
     *
     * @return the started thread, so that the caller can wait for it
     */
    private static Thread startThread(BufferedWriter out, BufferedWriter outErr, List<String> typeUrl, String threadName) {
        Thread worker = new Thread(() -> {
            try {
                searchJD(out, outErr, typeUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }, threadName);
        worker.start();
        return worker;
    }

    /**
     * Opens {@code path} for appending as UTF-8 text, creating the file and any missing parent
     * directories first.
     *
     * @param path file to append to
     * @return a buffered writer positioned at the end of the file
     * @throws IOException if the parent directory cannot be created or the file cannot be opened
     */
    private static BufferedWriter getBufferedOut(String path) throws IOException {
        File target = new File(path);
        File dir = target.getParentFile();
        if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("cannot create directory " + dir);
        }
        // An explicit charset keeps the Chinese text independent of the platform default encoding.
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target, true), StandardCharsets.UTF_8));
    }
}