具體實作

第一步：pom.xml配置

<!-- Crawler -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.2</version>
    </dependency>
    <!-- Crawler: suited to HTML requests for pages that contain dynamic JavaScript -->
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.27</version>
    </dependency>
    <!-- Crawler: suited to requests for purely static HTML -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.6</version>
    </dependency>

基于HttpClient的爬蟲例子

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;

/**
 * Crawler example that downloads HTML with Apache HttpClient and parses it with Jsoup.
 *
 * <p>Only the static HTML returned by the server is processed; no JavaScript is executed.
 * Author's notes:
 * <ol>
 *   <li>Jsoup is not used for the request itself because the author considers it weaker than
 *       HttpClient in features, performance and efficiency.</li>
 *   <li>Jsoup can fetch a URL on its own, but see the point above.</li>
 *   <li>Jsoup is used here as the tool for parsing and querying the HTML.</li>
 * </ol>
 *
 * @author hxz
 */
public class HttpClient_Jsoup_Dome {
    /**
     * Fetches a page with HttpClient, then lists its image URLs with Jsoup.
     *
     * @param a command-line arguments (unused)
     * @throws Exception propagated from {@link #doJsoup(String, String)}
     */
    public static void main(String[] a) throws Exception {
        String url = "http://www.baidu.com";
        // Request the HTML through HttpClient.
        String content = HttpClientHtml(url);
        // Parse the HTML with Jsoup and print the image paths.
        doJsoup(content, url);
    }

    /**
     * Requests static HTML with HttpClient and returns the response body decoded as UTF-8.
     *
     * <p>The body is also printed to standard output. The client and the response are always
     * closed before returning so the underlying connection is released.
     *
     * @param URL address to request with HTTP GET
     * @return the response body, or an empty string when the request fails with an
     *         {@link IOException} or the response carries no entity
     */
    public static String HttpClientHtml(String URL) {
        String content = "";

        // try-with-resources closes the response first and then the client, even on failure.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(URL))) {
            HttpEntity entity = response.getEntity();
            // Some responses have no body; EntityUtils.toString rejects a null entity.
            if (entity != null) {
                content = EntityUtils.toString(entity, "utf-8");
                System.out.println(content);
            }
        } catch (IOException e) {
            // Best effort: report the failure and fall through to return the empty default.
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Parses HTML with Jsoup and prints, for every {@code img} element, its absolute
     * {@code src} URL followed by the file name taken from that URL.
     *
     * @param html HTML text to parse
     * @param url  base URI used to resolve relative {@code src} values into absolute URLs
     * @throws Exception declared for callers; kept for interface compatibility
     */
    public static void doJsoup(String html, String url) throws Exception {
        Document document = Jsoup.parse(html);
        // Relative URLs in the document can only be made absolute once a base URI is set.
        document.setBaseUri(url);

        // Select every img element.
        Elements images = document.select("img");
        for (Element image : images) {
            // Absolute form of the src attribute (empty when it cannot be resolved).
            String src = image.absUrl("src");
            System.out.println(src);

            // Image download is left disabled in this example:
            //URL urlSource = new URL(src);
            //URLConnection urlConnection = urlSource.openConnection();

            // The image name is everything after the last '/' of the URL.
            String imageName = src.substring(src.lastIndexOf("/") + 1);
            System.out.println(imageName);

            // Disabled: read the image from the URLConnection stream and save it to a file.
            /*InputStream in = urlConnection.getInputStream();
            OutputStream out = new FileOutputStream(new File("E:\\IDEA\\imgs\\", imageName));
            byte[] buf = new byte[1024];
            int l = 0;
            while ((l = in.read(buf)) != -1) {
                out.write(buf, 0, l);
            }*/
        }
    }
}

基于HtmlUnit的爬蟲例子

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.List;

/**
 * Crawler example that loads a page with HtmlUnit's {@code WebClient} and parses the
 * resulting markup with Jsoup, for pages whose HTML is built by JavaScript.
 *
 * <p>Author's notes:
 * <ol>
 *   <li>Jsoup is not used for the request itself because the author considers it weaker than
 *       HtmlUnit in features, performance and efficiency; most importantly it does not
 *       execute JavaScript.</li>
 *   <li>Jsoup can fetch a URL on its own, but see the point above.</li>
 *   <li>Jsoup is used here as the tool for parsing and querying the HTML.</li>
 * </ol>
 *
 * @author hxz
 */
public class HtmlUnit_Jsoup_Dome {
    /**
     * Loads the page with HtmlUnit, waits for background JavaScript, then prints the title
     * text and link of each feed card found in the rendered markup.
     *
     * <p>If the page cannot be loaded, the error is printed and the method returns without
     * parsing anything.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String[] a) {
        // Browser client that emulates Google Chrome.
        WebClient webClient = new WebClient(BrowserVersion.CHROME);

        // Do not throw when a script fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Do not throw on a non-200 HTTP status.
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        // The page is never displayed, so CSS is not needed.
        webClient.getOptions().setCssEnabled(false);
        // Important: enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // Important: enable AJAX support.
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());

        String pageXml;
        try {
            HtmlPage page = webClient.getPage("http://ent.sina.com.cn/film/");
            // Asynchronous JS takes time: wait (up to 30000 ms) for background scripts to
            // finish. This must happen while the client is still open.
            webClient.waitForBackgroundJavaScript(30000);
            // Serialize the fully loaded page to an XML string.
            pageXml = page.asXml();
        } catch (Exception e) {
            // Without a page there is nothing to parse.
            e.printStackTrace();
            return;
        } finally {
            // Close only after the page has been rendered and serialized.
            webClient.close();
        }

        // From here on it is ordinary string/HTML processing with Jsoup.
        Document document = Jsoup.parse(pageXml);
        Element feedContainer = document.getElementById("feedCardContent");
        if (feedContainer == null) {
            // The container is absent when the page layout changed or the scripts did not run.
            System.err.println("Element with id 'feedCardContent' not found");
            return;
        }

        List<Element> infoListEle = feedContainer.getElementsByAttributeValue("class", "feed-card-item");
        for (Element item : infoListEle) {
            Element heading = item.getElementsByTag("h2").first();
            if (heading == null) {
                // Skip cards that have no h2 heading.
                continue;
            }
            System.out.println(heading.getElementsByTag("a").text());
            System.out.println(heading.getElementsByTag("a").attr("href"));
        }
    }
}

爬山的蝸牛旅程：爬蟲 Jsoup+(HtmlUnit或HttpClient)實作具體實作

具體實作

繼續閱讀

v2ex的簡單爬蟲

Python漫畫爬蟲開源 66漫畫 AJAX，包含資料庫連接配接，圖檔下載下傳處理

requests子產品進行人人網模拟登陸

Python image.show() 出錯FSPathMakeRef(/Applications/Preview.app) failed with error -43

2023爬蟲學習筆記 -- 多線程操作

M團店鋪評價采集不到問題問題展示：解決方案：

Python爬蟲學習（1）

Python爬蟲學習進階

Python爬蟲（入門+進階）學習筆記 1-2 初識Python爬蟲

Python進階爬蟲——Class1：認識爬蟲

python爬蟲學習筆記-1

python學習之urllib使用小結

NOIp模拟題之肮髒的牧師（桶排序）

一篇文章教你如何在一個月内學會爬取大規模資料

Pyhton爬蟲實戰 - 抓取BOSS直聘職位描述和資料清洗Pyhton爬蟲實戰 - 抓取BOSS直聘職位描述和資料清洗

sort()函數到底是怎樣進行數字排序的