天天看點

java爬蟲(Jsoup)爬取某站點評論

本文是基于這一篇的:http://blog.csdn.net/disiwei1012/article/details/51614492

在上一篇中,我們抓取到了新聞的標題、超連結和摘要;這次我們通過新聞的超連結進入新聞的評論頁,然後爬取評論!

注:http://www.wumaow.com 這個網站的標籤寫得太混亂了,而且還有 js 報錯,到處都是廣告。要不是它的外國評論翻譯得及時,我就去看龍騰網(http://www.ltaaa.com)了。

先看下評論頁的标簽:

主要是先找到 id 為“art_content”的標籤,再在其中找到 id 為“text”的標籤,最後取出它下面的所有“div”標籤。

java爬蟲(Jsoup)爬取某站點評論

代碼:

/**
 * Simple mutable holder for one scraped news entry: a title, a link and a
 * summary text. No validation is performed on any value.
 */
public class News {

    private String title;
    private String href;
    private String content;

    /** Creates an entry whose three fields are all {@code null}. */
    public News() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param title   headline text
     * @param href    link to the article
     * @param content summary text
     */
    public News(String title, String href, String content) {
        this.title = title;
        this.href = href;
        this.content = content;
    }

    // ---- readers ----

    /** @return the headline text, or {@code null} if never set */
    public String getTitle() {
        return this.title;
    }

    /** @return the article link, or {@code null} if never set */
    public String getHref() {
        return this.href;
    }

    /** @return the summary text, or {@code null} if never set */
    public String getContent() {
        return this.content;
    }

    // ---- writers ----

    /** @param title new headline text */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param href new article link */
    public void setHref(String href) {
        this.href = href;
    }

    /** @param content new summary text */
    public void setContent(String content) {
        this.content = content;
    }
}
           
/**
 * Small jsoup-based scraper: reads the news list from the site's front page
 * and then prints the comment blocks found on each linked article page.
 */
public class JsoupTest {

    // Not read by any method in this class; kept unchanged in case other code refers to it.
    static String url="http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html";

    /** Front page that is scraped for news entries; also the fallback prefix for article links. */
    private static final String SITE_URL = "http://www.wumaow.com";

    /**
     * Fetches the news list and prints the comments of every listed article.
     *
     * @param args unused
     * @throws Exception if an article page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        ArrayList<News> newsList = getWuMaoW();
        getComments(newsList);
    }

    //=======================begin=======================================

    /**
     * Collects title, link and summary for every element with class "post"
     * on the front page.
     *
     * <p>Best effort: if the page cannot be fetched, the stack trace is printed
     * and the list gathered so far (possibly empty) is returned.
     *
     * @return the scraped news entries, never {@code null}
     */
    public static ArrayList<News> getWuMaoW() {
        ArrayList<News> newsList = new ArrayList<News>();
        try {
            Document doc = Jsoup.connect(SITE_URL).get();
            for (Element post : doc.getElementsByAttributeValue("class", "post")) {
                newsList.add(parseNews(post));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return newsList;
    }

    /**
     * Builds one {@link News} from a "post" element: title from its h4 tags,
     * link from the anchors inside those h4 tags, summary from its p tags.
     * When several candidates exist the last one wins.
     */
    private static News parseNews(Element post) {
        News news = new News();
        for (Element heading : post.getElementsByTag("h4")) {
            news.setTitle(heading.text());
            for (Element link : heading.getElementsByTag("a")) {
                // absUrl handles both relative and already-absolute hrefs; it yields ""
                // when no absolute URL can be built, in which case the plain prefix is used.
                String absolute = link.absUrl("href");
                news.setHref(absolute.isEmpty() ? SITE_URL + link.attr("href") : absolute);
            }
        }
        for (Element summary : post.getElementsByTag("p")) {
            news.setContent(summary.text());
        }
        return news;
    }

    /**
     * Visits each article and prints the non-empty text of every div found
     * under the element with id "text" inside the element with id "art_content".
     *
     * <p>Entries without a link, and pages lacking either container element,
     * are skipped.
     *
     * @param newsList entries whose links are followed
     * @return the printed comment texts in encounter order, never {@code null}
     * @throws IOException if an article page cannot be fetched
     */
    public static ArrayList<String> getComments(ArrayList<News> newsList) throws IOException {
        ArrayList<String> comments = new ArrayList<String>();
        for (News news : newsList) {
            String href = news.getHref();
            if (href == null || href.isEmpty()) {
                continue;
            }
            Document doc = Jsoup.connect(href).get();
            Element artContent = doc.getElementById("art_content");
            if (artContent == null) {
                continue;
            }
            Element text = artContent.getElementById("text");
            if (text == null) {
                continue;
            }
            // NOTE(review): getElementsByTag also matches nested divs (and the container
            // itself if it is a div), so text may be printed more than once — confirm
            // against the real page structure.
            for (Element block : text.getElementsByTag("div")) {
                String nr = block.text();
                // Compare by content; the former reference comparison with "" did not
                // reliably filter out blank blocks.
                if (!nr.isEmpty()) {
                    System.out.println(nr);
                    comments.add(nr);
                }
            }
        }
        return comments;
    }
}
//============================end=========================================
           

結果:

java爬蟲(Jsoup)爬取某站點評論