Maven 依賴:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.8.1</version>
</dependency>
Java 代碼(請先根據 BASE_FILE_PATH 建立 jd_book 檔案夾)
然後執行下面的代碼即可:SQL 檔案會在 jd_book 檔案夾下生成,再導入 MySQL 就行了。價格查詢的接口大概訪問幾千次左右,就會被京東(JD)禁止訪問一段時間。
IP 代理要花錢,所以沒有使用。價格問題的解決方案是:改為在用 Java 讀取資料的時候才發起請求查詢,這時候的查詢量少,不會有問題。
當然,也可以在查詢價格的時候讓線程休眠一段時間,以減少請求的次數。
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Crawls JD book category listings and writes one SQL {@code insert} statement per book.
 *
 * <p>The category list in {@link #URL_TYPE} is split into at most {@link #MAX_THREADS} chunks.
 * Each chunk is processed by its own thread, which appends statements for the
 * {@code jd_book_info} table to its own {@code sqlresult_N.txt} file. URLs that could not be
 * processed are appended to a single shared error file.
 *
 * @author panlf
 * @since 2019/9/27
 */
public class JDBook {
    private static final String BASE_LIST_URL = "https://list.jd.com/";
    private static final String BASE_FILE_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\sqlresult_%s.txt";
    private static final String BASE_ERR_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\errorUrl.txt";
    /** Category list pages, separated by a backtick. */
    private static final String URL_TYPE =
            "list.html?cat=1713,3261,3359`/list.html?cat=1713,3258`/list.html?cat=1713,3259`/list.html?cat=1713,3260`/list.html?cat=1713,3261`" +
            "/list.html?cat=1713,12775`/list.html?cat=1713,12776`/list.html?cat=1713,13627`/list.html?cat=1713,13634`/list.html?cat=1713,3262`" +
            "/list.html?cat=1713,3263`/list.html?cat=1713,3267`/list.html?cat=1713,3266`/list.html?cat=1713,3264`/list.html?cat=1713,3265`" +
            "/list.html?cat=1713,13613`/list.html?cat=1713,3270`/list.html?cat=1713,3271`/list.html?cat=1713,9278`/list.html?cat=1713,9291`" +
            "/list.html?cat=1713,9301`/list.html?cat=1713,9309`/list.html?cat=1713,9314`/list.html?cat=1713,3269`/list.html?cat=1713,3272`" +
            "/list.html?cat=1713,3273`/list.html?cat=1713,3279`/list.html?cat=1713,3276`/list.html?cat=1713,3275`/list.html?cat=1713,3274`" +
            "/list.html?cat=1713,3277`/list.html?cat=1713,3280`/list.html?cat=1713,3281`/list.html?cat=1713,3284`/list.html?cat=1713,3287`" +
            "/list.html?cat=1713,3285`/list.html?cat=1713,9340`/list.html?cat=1713,9368`/list.html?cat=1713,3286`/list.html?cat=1713,9351`" +
            "/list.html?cat=1713,3288`/list.html?cat=1713,3289`/list.html?cat=1713,3282`/list.html?cat=1713,11047`/list.html?cat=1713,3290`" +
            "/list.html?cat=1713,3291`/list.html?cat=1713,3294`/list.html?cat=1713,4758`/list.html?cat=1713,4855`/list.html?cat=1713,6929`" +
            "/list.html?cat=1713,14669`/list.html?cat=1713,3296`/list.html?cat=1713,11745";
    private static final String PRICE_JD = "https://p.3.cn/prices/mgets?skuIds=J_";
    /** Upper bound on the number of crawler threads started by {@link #main}. */
    private static final int MAX_THREADS = 10;
    /** Matches the digits of the {@code page} query parameter, wherever it sits in the query. */
    private static final String PAGE_NUMBER_REGEX = "(?<=[?&]page=)\\d*";
    /** Request headers sent with every price lookup. */
    private static final Map<String, String> HEADER = new HashMap<>();
    static {
        // A Host header carries a bare host name; the previous value included the "http://" scheme.
        HEADER.put("Host", "p.3.cn");
        HEADER.put("User-Agent", " Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0");
        HEADER.put("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        HEADER.put("Accept-Language", "zh-cn,zh;q=0.5");
        HEADER.put("Accept-Charset", " GB2312,utf-8;q=0.7,*;q=0.7");
        HEADER.put("Connection", "keep-alive");
    }
    /** Label keyword to column name, built once instead of once per book. */
    private static final Map<String, String> FIELD_MAP = getFieldMap();

    /**
     * Splits the category list into chunks, crawls every chunk on its own thread and waits for
     * all of them before closing the shared error file.
     *
     * @param args unused
     * @throws Exception if an output file cannot be opened or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        List<String> categories = Arrays.asList(URL_TYPE.split("`"));
        // Ceiling division: plain size / 10 produced an extra, eleventh chunk for the remainder.
        int chunkSize = Math.max(1, (categories.size() + MAX_THREADS - 1) / MAX_THREADS);
        List<Thread> workers = new ArrayList<>();
        BufferedWriter errOut = getBufferedOut(BASE_ERR_PATH);
        try {
            for (int start = 0; start < categories.size(); start += chunkSize) {
                int chunkNo = start / chunkSize;
                int end = Math.min(start + chunkSize, categories.size());
                List<String> chunk = new ArrayList<>(categories.subList(start, end));
                BufferedWriter out = getBufferedOut(String.format(BASE_FILE_PATH, chunkNo));
                workers.add(startThread(out, errOut, chunk, String.format("第%d條線程", chunkNo)));
            }
            for (Thread worker : workers) {
                worker.join();
            }
        } finally {
            // Closed only after the workers are done with it (or after a start-up failure).
            errOut.close();
        }
    }

    /**
     * Crawls every category in {@code list} and appends the generated statements to {@code out}.
     * A category that fails is recorded in the error file and the next one is tried.
     *
     * @param out    destination for the SQL statements; always closed before this method returns
     * @param outErr shared error file; not closed here
     * @param list   category paths, resolved against {@link #BASE_LIST_URL}
     * @throws Exception if closing {@code out} fails
     */
    private static void searchJD(BufferedWriter out, BufferedWriter outErr, List<String> list) throws Exception {
        int index = 0;
        try {
            for (String one : list) {
                try {
                    index = crawlCategory(out, outErr, toListUrl(one), index);
                } catch (Exception e) {
                    logError(outErr, BASE_LIST_URL + one, e);
                }
            }
        } finally {
            out.close();
        }
    }

    /**
     * Walks all result pages of one category and writes one statement per listed book.
     *
     * @param index number of statements this thread has written so far
     * @return the updated statement count
     * @throws IOException if the first page of the category cannot be fetched
     */
    private static int crawlCategory(BufferedWriter out, BufferedWriter outErr, String categoryUrl, int index) throws IOException {
        Document indexPage = Jsoup.parse(new URL(categoryUrl), 5000);
        String pageHref = indexPage.select(".p-num").get(0).child(1).attr("href");
        int maxPage = Integer.parseInt(indexPage.select(".p-skip").get(0).child(0).select("b").html().trim());
        // Visit every page.
        for (int page = 1; page <= maxPage; page++) {
            // Plain substitution instead of String.format, which fails on any '%' in the href.
            String pageUrl = toListUrl(pageHref.replaceAll(PAGE_NUMBER_REGEX, Integer.toString(page)));
            try {
                Document pageDetail = Jsoup.parse(new URL(pageUrl), 5000);
                List<String> bookDetailList = pageDetail.select("#plist").get(0).select(".gl-item").stream()
                        .map(item -> "https:" + item.select(".p-name").get(0).child(0).attr("href"))
                        .collect(Collectors.toList());
                // Visit every book on the page.
                for (String url : bookDetailList) {
                    try {
                        out.write(buildInsertSql(url) + "\r\n");
                        out.flush();
                        System.out.println(Thread.currentThread().getName() + " 第" + (++index) + "條");
                    } catch (Exception e) {
                        logError(outErr, url, e);
                    }
                }
            } catch (Exception e) {
                // One broken page must not abandon the remaining pages of the category.
                logError(outErr, pageUrl, e);
            }
        }
        return index;
    }

    /**
     * Fetches one book detail page and renders it as an {@code insert} statement.
     *
     * @param url absolute URL of the book detail page
     * @return the statement, without a trailing line break
     * @throws IOException if the page cannot be fetched
     */
    private static String buildInsertSql(String url) throws IOException {
        List<String> paramList = new ArrayList<>();
        List<String> valueList = new ArrayList<>();
        paramList.add("id");
        valueList.add(UUID.randomUUID().toString().replaceAll("-", ""));
        Document detail = Jsoup.parse(new URL(url), 50000);
        Element crumb = detail.select(".crumb.fl.clearfix").get(0);
        // Book name.
        paramList.add("book_name");
        valueList.add(crumb.select(".item.ellipsis").get(0).html());
        // Category: the breadcrumb links joined with '>'.
        paramList.add("book_type");
        valueList.add(crumb.select("a").stream().map(Element::html).collect(Collectors.joining(">")));
        // Author, with a fixed placeholder when the page has no author element.
        Element authorBox = detail.select("#p-author").first();
        boolean hasAuthor = authorBox != null && !authorBox.children().isEmpty();
        paramList.add("author");
        valueList.add(hasAuthor ? authorBox.child(0).html() : "無作者資訊");
        // Parameter list: the label is the element's content, the value its title attribute.
        String bookOrder = "";
        for (Element item : detail.select("#parameter2").get(0).children()) {
            String column = columnFor(item.html());
            if (column == null || paramList.contains(column)) {
                continue; // not a wanted field, or the column is already filled
            }
            String value = item.attr("title");
            paramList.add(column);
            valueList.add(value);
            if ("jd_code".equals(column)) {
                bookOrder = value;
            }
        }
        // List price, looked up by the product code found above.
        String price = getPrice(bookOrder);
        if (StringUtils.isNotBlank(price)) {
            paramList.add("price");
            valueList.add(price);
        }
        paramList.add("detail_url");
        valueList.add(url);
        String values = valueList.stream().map(JDBook::sqlQuote).collect(Collectors.joining(","));
        return String.format("insert into jd_book_info (%s) values (%s);", String.join(",", paramList), values);
    }

    /**
     * Requests the price of one product.
     *
     * @param bookOrder product code; a blank code is not looked up
     * @return the {@code m} field of the first entry in the response, or {@code null} when the
     *         code is blank or the lookup fails
     */
    private static String getPrice(String bookOrder) {
        if (StringUtils.isBlank(bookOrder)) {
            return null; // no code found on the page: do not spend a request on it
        }
        try {
            Connection connect = Jsoup.connect(PRICE_JD + bookOrder).ignoreContentType(true).headers(HEADER).timeout(50000);
            Connection.Response execute = connect.execute();
            List<JSONObject> list = JSONArray.parseArray(execute.body(), JSONObject.class);
            return list.get(0).getString("m");
        } catch (Exception e) {
            System.out.println(e.getMessage() + "url: " + PRICE_JD + bookOrder);
            return null;
        }
    }

    /**
     * Builds the label-keyword to column-name table, in the order the keywords are tried.
     *
     * @return an insertion-ordered map from label keyword to {@code jd_book_info} column
     */
    private static Map<String, String> getFieldMap() {
        Map<String, String> fieldMap = new LinkedHashMap<>();
        fieldMap.put("出版社:", "publisher");
        fieldMap.put("ISBN", "isbn");
        fieldMap.put("出版時間", "book_time");
        fieldMap.put("頁數", "page_num");
        fieldMap.put("商品編碼", "jd_code");
        // NOTE(review): the pages are presumably served in Simplified Chinese, where the
        // Traditional keywords above cannot match - confirm against a live page.
        fieldMap.put("出版时间", "book_time");
        fieldMap.put("页数", "page_num");
        fieldMap.put("商品编码", "jd_code");
        return fieldMap;
    }

    /** Returns the column for the first keyword contained in {@code label}, or {@code null}. */
    private static String columnFor(String label) {
        for (Map.Entry<String, String> field : FIELD_MAP.entrySet()) {
            if (label.contains(field.getKey())) {
                return field.getValue();
            }
        }
        return null;
    }

    /** Renders {@code value} as a MySQL string literal, escaping backslashes and single quotes. */
    private static String sqlQuote(String value) {
        return "'" + StringUtils.defaultString(value).replace("\\", "\\\\").replace("'", "''") + "'";
    }

    /** Resolves a relative or absolute list path against {@link #BASE_LIST_URL}. */
    private static String toListUrl(String path) throws IOException {
        // URL resolution avoids the doubled slash that plain concatenation produced.
        return new URL(new URL(BASE_LIST_URL), path).toString();
    }

    /** Prints the stack trace and appends the failed URL to the shared error file. */
    private static void logError(BufferedWriter outErr, String url, Exception cause) {
        cause.printStackTrace();
        // The error writer is shared by all crawler threads: keep write and flush together.
        synchronized (outErr) {
            try {
                outErr.write("錯誤url: " + url + "\r\n");
                outErr.flush();
            } catch (IOException io) {
                System.err.println("cannot record failed url " + url + ": " + io.getMessage());
            }
        }
    }

    /**
     * Starts a named thread that crawls {@code typeUrl}.
     *
     * @return the started thread, so the caller can wait for it
     */
    private static Thread startThread(BufferedWriter out, BufferedWriter outErr, List<String> typeUrl, String threadName) {
        Thread worker = new Thread(() -> {
            try {
                searchJD(out, outErr, typeUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }, threadName);
        worker.start();
        return worker;
    }

    /**
     * Opens {@code path} for appending as UTF-8 text, creating the file and any missing parent
     * directories.
     *
     * @param path file to append to
     * @return a buffered writer positioned at the end of the file
     * @throws IOException if the directory or the file cannot be created or opened
     */
    private static BufferedWriter getBufferedOut(String path) throws IOException {
        File target = new File(path);
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("cannot create directory: " + parent);
        }
        // Explicit charset: FileWriter would use the platform default for the Chinese text.
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target, true), StandardCharsets.UTF_8));
    }
}