爬蟲順序
1.分析網站網絡請求
通過瀏覽器F12開發者工具檢視網站的内容擷取方式。
2.模拟HTTP請求,擷取網頁内容。
可以採用HttpClient。利用Java的HttpClient工具可以模擬HTTP GET、POST請求,用來擷取爬蟲需要的資料;Java的一些爬蟲框架底層擷取網頁的方式也都是HttpClient。
3.解析網頁HTML内容,擷取可用資料和下一條請求連結。
可以採用jsoup、正規表達式、XPath等。
實踐一:知乎
![](https://img.laitimes.com/img/_0nNw4CM6IyYiwiM6ICdiwiIx0DciV2dmADM30zd-cmbw5CRzUyYtJ2d1M1Tx0kaNVzYE5kerRVT0FFVPhXTU5keVRUT4FlaNBTQ65EeBpWT0dGRPhXSU9UN4MkT3NGVNdXS6xENnRVT5tGVPZ3YyI2cKJDT0ljMZVXTzold41WW15UbMFTRE1UeNhlWuZ0ViBXO5xkNNh0YwIFSh9CXt92YuM3YltWas5iclN3Ztl2Lc9CX6MHc0RHaiojIsJye.png)
檢視開發者工具可以看到知乎首頁的内容擷取有兩種:
一種是GET請求,請求位址為https://www.zhihu.com/
一種是POST請求,請求位址為https://www.zhihu.com/node/TopStory2FeedList
第一種GET請求即現實中使用者直接從瀏覽器位址欄輸入知乎的網址或點選連結進行請求,這時知乎會響應傳回一個只有數條内容的首頁給使用者。
第二種POST請求即現實中使用者向下滾動頁面,瀏覽器持續加載新内容。
第一種GET請求沒有參數,響應也是HTML,較為簡單。
第二種POST請求可以在開發者工具中檢視它的參數和響應。
可以看到有兩個請求參數
params:"{"offset":21,"start":"19"}"
method:"next"
響應為一段JSON,我們要的是下面的msg數組,所以代碼中會用到json-lib這個jar包,方便我們解析JSON。
分析完網站的網絡請求後就可以進行下一步,模拟HTTP請求
首先模拟GET請求
/**
 * Fetches the Zhihu home page with a GET request that imitates a logged-in browser.
 *
 * @return the response body as produced by {@code convertStreamToString}, or an
 *         empty string when the response carries no entity
 * @throws ClientProtocolException if the HTTP client reports a protocol error
 * @throws IOException if the request or reading the response fails
 */
public String doGet() throws ClientProtocolException, IOException {
    // One client per call; it is shut down in the finally block below.
    HttpClient httpClient = new DefaultHttpClient();
    try {
        // HTTPS is the address observed in the browser; it also keeps the login
        // cookie from being sent in clear text.
        HttpUriRequest httpUriRequest = new HttpGet("https://www.zhihu.com/");
        // Headers copied from a real browser request.
        httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
        // Placeholder: replace with the Cookie header of a logged-in browser session.
        httpUriRequest.setHeader("Cookie", "這裡的Cookie拷貝複制登入後請求頭裡的Cookie值");
        httpUriRequest.setHeader("DNT", "1");
        httpUriRequest.setHeader("Connection", "keep-alive");
        httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1");
        httpUriRequest.setHeader("Cache-Control", "max-age=0");

        HttpResponse response = httpClient.execute(httpUriRequest);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        // convertStreamToString closes the content stream once it has been read.
        return convertStreamToString(entity.getContent());
    } finally {
        // Release the connection resources held by this one-shot client.
        httpClient.getConnectionManager().shutdown();
    }
}
convertStreamToString為一個將流轉換為字串的方法
/**
 * Reads the whole stream as UTF-8 text and closes it.
 *
 * <p>Each line read is followed by a single {@code '\n'}, so the original line
 * terminators are normalised and any non-empty result ends with a newline.
 *
 * @param is the stream to drain; it is closed before this method returns
 * @return the decoded text, or an empty string if the stream has no data
 * @throws IOException if reading from the stream fails; the partial content is
 *         discarded instead of being returned as if it were complete
 */
public static String convertStreamToString(InputStream is)
        throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF8"));
    StringBuilder sb = new StringBuilder();
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');
        }
    } finally {
        try {
            // Closing the reader also closes the wrapped stream.
            reader.close();
        } catch (IOException ignored) {
            // Best effort: either everything was read already, or a read
            // failure is propagating and must not be masked by this one.
        }
    }
    return sb.toString();
}
View Code
模拟POST請求(兩個參數即為請求參數裡的兩個變量)
/**
 * Requests the next batch of home-page feed items with a POST request, the same
 * request the browser issues when the user scrolls down.
 *
 * @param offset value sent as the numeric {@code offset} field of the
 *               {@code params} JSON
 * @param start  value sent as the string {@code start} field of the
 *               {@code params} JSON
 * @return the response body as produced by {@code convertStreamToString}, or an
 *         empty string when the response carries no entity
 * @throws Exception if the request or reading the response fails
 */
public String doPost(int offset, int start) throws Exception {
    // One client per call; it is shut down in the finally block below.
    HttpClient httpClient = new DefaultHttpClient();
    try {
        // Same shape the browser sends: {"offset":<offset>,"start":"<start>"}
        String params = "{\"offset\":" + offset + ",\"start\":\"" + start + "\"}";
        HttpUriRequest httpUriRequest = RequestBuilder
                .post()
                .setUri("https://www.zhihu.com/node/TopStory2FeedList")
                .addParameter("params", params)
                .addParameter("method", "next")
                .build();
        // Headers copied from a real browser request.
        httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
        // Placeholder: replace with the X-Xsrftoken header of a logged-in browser session.
        httpUriRequest.setHeader("X-Xsrftoken", "這裡的X-Xsrftoken拷貝複制登入後請求頭裡的X-Xsrftoken值");
        httpUriRequest.setHeader("X-Requested-With", "XMLHttpRequest");
        httpUriRequest.setHeader("Referer", "https://www.zhihu.com/");
        // Placeholder: replace with the Cookie header of a logged-in browser session.
        httpUriRequest.setHeader("Cookie", "這裡的Cookie拷貝複制登入後請求頭裡的Cookie值");
        httpUriRequest.setHeader("DNT", "1");
        httpUriRequest.setHeader("Connection", "keep-alive");
        httpUriRequest.setHeader("Cache-Control", "max-age=0");

        HttpResponse response = httpClient.execute(httpUriRequest);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        // convertStreamToString closes the content stream once it has been read.
        return convertStreamToString(entity.getContent());
    } finally {
        // Release the connection resources held by this one-shot client.
        httpClient.getConnectionManager().shutdown();
    }
}
最後走一波main方法將資料儲存至TXT檔案中,在這之前要提取一下HTML中的資料
根據HTML解析資料
這裡用到的Document Elements Element 都是jsoup裡的元素
這段代碼首先拿到類名為feed-item-inner的HTML元素
遍歷所有feed-item-inner,拿到類名為feed-title的標題和標籤類型為textarea的内容
/**
 * Extracts title link, title text and body text of every feed item in an HTML
 * fragment, echoes them to standard output and returns them as one text block.
 *
 * <p>A feed item is an element whose {@code class} attribute is
 * {@code feed-item-inner}. Its title is taken from the {@code a} tags inside the
 * last {@code feed-title} element, and its body from its {@code textarea} tags.
 *
 * @param html HTML to scan; {@code null} is treated as empty input
 * @return the formatted records, or an empty string when no feed item is found
 */
public String unparsedData(String html) {
    if (html == null) {
        return "";
    }
    Document doc = Jsoup.parse(html);
    Elements feeds = doc.getElementsByAttributeValue("class", "feed-item-inner");
    // StringBuilder avoids re-copying the accumulated text for every feed item.
    StringBuilder writeStr = new StringBuilder();
    for (Element feed : feeds) {
        // If an item holds several feed-title elements, the last one wins.
        Elements title = new Elements();
        for (Element feedTitle : feed.getElementsByAttributeValue("class", "feed-title")) {
            title = feedTitle.getElementsByTag("a");
        }
        Elements content = feed.getElementsByTag("textarea");
        String titleHref = title.attr("href");
        String titleText = title.text().trim();
        String contentText = content.text().trim();
        // To keep only certain topics, test titleText here and skip the rest.
        System.out.println("--------------------");
        System.out.println("-----标題-----");
        System.out.println("連結:" + titleHref);
        System.out.println("内容:" + titleText);
        System.out.println("-----内容-----");
        System.out.println("内容:" + contentText);
        System.out.println("--------------------");
        writeStr.append("--------------------\n-----标題-----\n").append(titleHref)
                .append("\n").append(titleText).append("\n-----内容-----\n").append(contentText)
                .append("\n--------------------\n\n\n");
    }
    return writeStr.toString();
}
最後Main方法
/**
 * Crawls the home page plus the scrolled-in feed batches and stores the
 * extracted records in {@code D:\testFile\zhihu.txt}, replacing any previous file.
 *
 * <p>The file is written as UTF-8, the same charset the responses are decoded
 * with, so the result does not depend on the platform default encoding. Output
 * is flushed after every batch so an aborted run keeps what was fetched so far.
 *
 * @throws Exception if a request fails, the response is not the expected JSON,
 *         the output file cannot be created or written, or the pause between
 *         requests is interrupted
 */
public void downloadFile() throws Exception {
    // Fetch and parse the first page before touching the file system.
    String writeStr = unparsedData(doGet());

    String path = "D:\\testFile\\zhihu.txt";
    File target = new File(path);
    File parent = target.getParentFile();
    if (parent != null && !parent.exists() && !parent.mkdirs()) {
        throw new IOException("Cannot create output directory " + parent);
    }

    PrintWriter printWriter = new PrintWriter(target, "UTF-8");
    try {
        printWriter.write(writeStr);

        int offset = 10;
        int start = 9;
        // 101 POST requests (time = 0..100), each asking for the next batch.
        for (int time = 0; time <= 100; time++) {
            JSONObject jsonObject = JSONObject.fromObject(doPost(offset, start));
            // Only the "msg" array of the JSON response holds the HTML fragments.
            JSONArray jsonArray = jsonObject.getJSONArray("msg");
            for (Object array : jsonArray.toArray()) {
                printWriter.write(unparsedData(array.toString()));
            }
            // checkError() flushes, and reports failures PrintWriter would otherwise hide.
            if (printWriter.checkError()) {
                throw new IOException("Failed writing to " + path);
            }
            // Pause between requests (milliseconds), then advance the paging window.
            Thread.sleep(1000);
            offset = offset + 10;
            start = start + 10;
        }
    } finally {
        printWriter.close();
    }
}
View Code
完整代碼
package spider;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
/**
 * Small crawler for the Zhihu home-page feed.
 *
 * <p>It fetches the home page with a GET request, then repeatedly POSTs to the
 * feed endpoint to obtain further batches, extracts title and body of every
 * feed item with jsoup and writes the result to a text file.
 *
 * <p>The Cookie and X-Xsrftoken values are placeholders and must be replaced
 * with the values from a logged-in browser session.
 */
@SuppressWarnings("deprecation")
public class ZhihuSpider {

    /** Home page; HTTPS so the login cookie is not sent in clear text. */
    private static final String HOME_URL = "https://www.zhihu.com/";

    /** Endpoint the browser calls while the user scrolls the home page. */
    private static final String FEED_URL = "https://www.zhihu.com/node/TopStory2FeedList";

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0";

    /** Placeholder: replace with the Cookie header of a logged-in browser session. */
    private static final String COOKIE = "這裡的Cookie拷貝複制登入後請求頭裡的Cookie值";

    /** Placeholder: replace with the X-Xsrftoken header of a logged-in browser session. */
    private static final String XSRF_TOKEN = "這裡的X-Xsrftoken拷貝複制登入後請求頭裡的X-Xsrftoken值";

    /** File that receives the extracted records. */
    private static final String OUTPUT_PATH = "D:\\testFile\\zhihu.txt";

    /**
     * Fetches the Zhihu home page with a GET request that imitates a logged-in browser.
     *
     * @return the response body, or an empty string when the response carries no entity
     * @throws ClientProtocolException if the HTTP client reports a protocol error
     * @throws IOException if the request or reading the response fails
     */
    public String doGet() throws ClientProtocolException, IOException {
        HttpUriRequest request = new HttpGet(HOME_URL);
        addCommonHeaders(request);
        request.setHeader("Upgrade-Insecure-Requests", "1");
        return execute(request);
    }

    /**
     * Requests the next batch of feed items with a POST request.
     *
     * @param offset value sent as the numeric {@code offset} field of the
     *               {@code params} JSON
     * @param start  value sent as the string {@code start} field of the
     *               {@code params} JSON
     * @return the response body, or an empty string when the response carries no entity
     * @throws Exception if the request or reading the response fails
     */
    public String doPost(int offset, int start) throws Exception {
        // Same shape the browser sends: {"offset":<offset>,"start":"<start>"}
        String params = "{\"offset\":" + offset + ",\"start\":\"" + start + "\"}";
        HttpUriRequest request = RequestBuilder
                .post()
                .setUri(FEED_URL)
                .addParameter("params", params)
                .addParameter("method", "next")
                .build();
        addCommonHeaders(request);
        request.setHeader("X-Xsrftoken", XSRF_TOKEN);
        request.setHeader("X-Requested-With", "XMLHttpRequest");
        request.setHeader("Referer", "https://www.zhihu.com/");
        return execute(request);
    }

    /** Sets the browser-imitating headers shared by the GET and POST requests. */
    private static void addCommonHeaders(HttpUriRequest request) {
        request.setHeader("User-Agent", USER_AGENT);
        request.setHeader("Cookie", COOKIE);
        request.setHeader("DNT", "1");
        request.setHeader("Connection", "keep-alive");
        request.setHeader("Cache-Control", "max-age=0");
    }

    /** Runs the request on a one-shot client and returns the response body as text. */
    private static String execute(HttpUriRequest request) throws IOException {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpResponse response = httpClient.execute(request);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            // convertStreamToString closes the content stream once it has been read.
            return convertStreamToString(entity.getContent());
        } finally {
            // Release the connection resources held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Reads the whole stream as UTF-8 text and closes it.
     *
     * <p>Each line read is followed by a single {@code '\n'}, so the original line
     * terminators are normalised and any non-empty result ends with a newline.
     *
     * @param is the stream to drain; it is closed before this method returns
     * @return the decoded text, or an empty string if the stream has no data
     * @throws IOException if reading from the stream fails; the partial content is
     *         discarded instead of being returned as if it were complete
     */
    public static String convertStreamToString(InputStream is)
            throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF8"));
        StringBuilder sb = new StringBuilder();
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } finally {
            try {
                // Closing the reader also closes the wrapped stream.
                reader.close();
            } catch (IOException ignored) {
                // Best effort: either everything was read already, or a read
                // failure is propagating and must not be masked by this one.
            }
        }
        return sb.toString();
    }

    /**
     * Crawls the home page plus the scrolled-in feed batches and stores the
     * extracted records in the output file, replacing any previous file.
     *
     * <p>The file is written as UTF-8, the same charset the responses are decoded
     * with, so the result does not depend on the platform default encoding. Output
     * is flushed after every batch so an aborted run keeps what was fetched so far.
     *
     * @throws Exception if a request fails, the response is not the expected JSON,
     *         the output file cannot be created or written, or the pause between
     *         requests is interrupted
     */
    @Test
    public void downloadFile() throws Exception {
        // Fetch and parse the first page before touching the file system.
        String writeStr = unparsedData(doGet());

        File target = new File(OUTPUT_PATH);
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            throw new IOException("Cannot create output directory " + parent);
        }

        PrintWriter printWriter = new PrintWriter(target, "UTF-8");
        try {
            printWriter.write(writeStr);

            int offset = 10;
            int start = 9;
            // 101 POST requests (time = 0..100), each asking for the next batch.
            for (int time = 0; time <= 100; time++) {
                JSONObject jsonObject = JSONObject
                        .fromObject(doPost(offset, start));
                // Only the "msg" array of the JSON response holds the HTML fragments.
                JSONArray jsonArray = jsonObject.getJSONArray("msg");
                for (Object array : jsonArray.toArray()) {
                    printWriter.write(unparsedData(array.toString()));
                }
                // checkError() flushes, and reports failures PrintWriter would otherwise hide.
                if (printWriter.checkError()) {
                    throw new IOException("Failed writing to " + OUTPUT_PATH);
                }
                // Pause between requests (milliseconds), then advance the paging window.
                Thread.sleep(1000);
                offset = offset + 10;
                start = start + 10;
            }
        } finally {
            printWriter.close();
        }
    }

    /**
     * Extracts title link, title text and body text of every feed item in an HTML
     * fragment, echoes them to standard output and returns them as one text block.
     *
     * <p>A feed item is an element whose {@code class} attribute is
     * {@code feed-item-inner}. Its title is taken from the {@code a} tags inside the
     * last {@code feed-title} element, and its body from its {@code textarea} tags.
     *
     * @param html HTML to scan; {@code null} is treated as empty input
     * @return the formatted records, or an empty string when no feed item is found
     */
    public String unparsedData(String html) {
        if (html == null) {
            return "";
        }
        Document doc = Jsoup.parse(html);
        Elements feeds = doc.getElementsByAttributeValue("class",
                "feed-item-inner");
        // StringBuilder avoids re-copying the accumulated text for every feed item.
        StringBuilder writeStr = new StringBuilder();
        for (Element feed : feeds) {
            // If an item holds several feed-title elements, the last one wins.
            Elements title = new Elements();
            for (Element feedTitle : feed.getElementsByAttributeValue("class",
                    "feed-title")) {
                title = feedTitle.getElementsByTag("a");
            }
            Elements content = feed.getElementsByTag("textarea");
            String titleHref = title.attr("href");
            String titleText = title.text().trim();
            String contentText = content.text().trim();
            // To keep only certain topics, test titleText here and skip the rest.
            System.out.println("--------------------");
            System.out.println("-----标題-----");
            System.out.println("連結:" + titleHref);
            System.out.println("内容:" + titleText);
            System.out.println("-----内容-----");
            System.out.println("内容:" + contentText);
            System.out.println("--------------------");
            writeStr.append("--------------------\n-----标題-----\n").append(titleHref)
                    .append("\n").append(titleText).append("\n-----内容-----\n").append(contentText)
                    .append("\n--------------------\n\n\n");
        }
        return writeStr.toString();
    }
}
View Code