天天看點

使用Jsoup解析XML抓取新浪新聞文章

<?xml version="1.0" encoding="UTF-8"?>
<result>
    <status>
        <code>0</code>
    </status>
    <encoding>utf-8</encoding>
    <serverSeconds>1420343599</serverSeconds>
    <total>298076</total>
    <count>22</count>
    <last_time>1420334026</last_time>
    <data>
        <item>
            <id>1-1-31356907</id>
            <column>tpxw</column>
            <title>組圖:武漢舉辦“女神相親會”3000多名媛報名</title>
            <url>http://slide.news.sina.com.cn/s/slide_1_2841_79556.html</url>
            <keywords>武漢,女神相親會</keywords>
            <comment_channel />
            <img>http://www.sinaimg.cn/dy/slidenews/1_t500/2015_01/2841_532839_164977.jpg</img>
            <level>0</level>
            <createtime>1420334026</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新浪圖檔</media_name>
        </item>
        <item>
            <id>1-1-31356801</id>
            <column>tpxw</column>
            <title>組圖:鄭州一火鍋店牆上挂百萬現金作舉報獎</title>
            <url>http://slide.news.sina.com.cn/s/slide_1_2841_79546.html</url>
            <keywords>現金,火鍋店,百萬,地溝油,食品安全</keywords>
            <comment_channel />
            <img>http://www.sinaimg.cn/dy/slidenews/1_t500/2015_01/2841_532748_181307.jpg</img>
            <level>0</level>
            <createtime>1420331615</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新浪圖檔</media_name>
        </item>
        <item>
            <id>1-1-31356788</id>
            <column>spxw</column>
            <title>視訊:實拍兩男販9公斤冰毒被抓現場互推脫指控</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/082764468119.html</url>
            <keywords>毒販,冰毒,反目</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136905268_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420331230</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>齊魯網</media_name>
        </item>
        <item>
            <id>1-1-31356783</id>
            <column>shwx</column>
            <title>男子在3600米海拔雪地裡半裸求婚(圖)</title>
            <url>http://news.sina.com.cn/s/p/2015-01-04/082231356783.shtml</url>
            <keywords>半裸,求婚</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/p/2015-01-04/U10856P1T1D31356783F21DT20150104082241.jpg</img>
            <level>1</level>
            <createtime>1420330961</createtime>
            <old_level>1</old_level>
            <media_type>tw</media_type>
            <media_name>中國網</media_name>
        </item>
        <item>
            <id>1-1-31356712</id>
            <column>spxw</column>
            <title>視訊:監拍救護車來遲醫生遭家屬暴打 護士大哭</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/080764468075.html</url>
            <keywords>救護車,來遲,家屬,暴打</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136904998_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420330051</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>齊魯網</media_name>
        </item>
        <item>
            <id>1-1-31356710</id>
            <column>spxw</column>
            <title>視訊:湖北交警曝光男女車内熱吻親熱照引争議</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/080564468065.html</url>
            <keywords>交警,熱吻,親熱</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136885413_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420329921</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>廣西台</media_name>
        </item>
        <item>
            <id>1-1-31356697</id>
            <column>shwx</column>
            <title>男子元旦過後上班突然暈倒不幸離世</title>
            <url>http://news.sina.com.cn/s/2015-01-04/075031356697.shtml</url>
            <keywords>突發疾病,工傷</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420329018</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>四川線上-華西都市報</media_name>
        </item>
        <item>
            <id>1-1-31356690</id>
            <column>spxw</column>
            <title>視訊:哈爾濱大火緻樓體3次坍塌 前後畫面對比</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/074264468023.html</url>
            <keywords>哈爾濱,畫面,大火</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136885015_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420328533</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>東方衛視《看東方》</media_name>
        </item>
        <item>
            <id>1-1-31356715</id>
            <column>spxw</column>
            <title>視訊:5名犧牲消防員名單公布 年齡最小僅18歲</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/071564468095.html</url>
            <keywords>消防員,火災,年齡</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136904368_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420326920</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>東方衛視《看東方》</media_name>
        </item>
        <item>
            <id>1-1-31356567</id>
            <column>shwx</column>
            <title>80後女子放棄高薪回鄉創業賣魚面 年賺20多萬</title>
            <url>http://news.sina.com.cn/s/2015-01-04/070231356567.shtml</url>
            <keywords>創業</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420326165</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>重慶晨報</media_name>
        </item>
        <item>
            <id>1-1-31356566</id>
            <column>shwx</column>
            <title>民工偷床單禦寒 警察接警後送其兩床被子</title>
            <url>http://news.sina.com.cn/s/2015-01-04/070131356566.shtml</url>
            <keywords>偷床單</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420326090</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>揚子晚報</media_name>
        </item>
        <item>
            <id>1-1-31356563</id>
            <column>shwx</column>
            <title>1歲半小孩過馬路遭汽車齊腰碾壓無大礙(圖)</title>
            <url>http://news.sina.com.cn/s/2015-01-04/065831356563.shtml</url>
            <keywords>碾壓,汽車碾壓</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/2015-01-04/U11556P1T1D31356563F21DT20150104065850.jpg</img>
            <level>1</level>
            <createtime>1420325930</createtime>
            <old_level>1</old_level>
            <media_type>tw</media_type>
            <media_name>揚子晚報</media_name>
        </item>
        <item>
            <id>1-1-31356554</id>
            <column>qwys</column>
            <title>男子被甩後盜女友家得600元贓款</title>
            <url>http://news.sina.com.cn/s/2015-01-04/064631356554.shtml</url>
            <keywords>偷竊</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420325219</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>南方都市報</media_name>
        </item>
        <item>
            <id>1-1-31356551</id>
            <column>shwx</column>
            <title>外籍男子打車忘拿包報警後20分鐘找回</title>
            <url>http://news.sina.com.cn/s/2015-01-04/064331356551.shtml</url>
            <keywords>外籍男子</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420325030</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>南方都市報</media_name>
        </item>
        <item>
            <id>1-1-31356520</id>
            <column>shwx</column>
            <title>4人野外挖洞燒烤時塌方緻3人身亡</title>
            <url>http://news.sina.com.cn/s/2015-01-04/062931356520.shtml</url>
            <keywords>塌方</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>1</level>
            <createtime>1420324163</createtime>
            <old_level>1</old_level>
            <media_type />
            <media_name>南方都市報</media_name>
        </item>
        <item>
            <id>1-1-31356399</id>
            <column>shwx</column>
            <title>主人花300英鎊為便秘小金魚做手術(圖)</title>
            <url>http://news.sina.com.cn/s/p/2015-01-04/061031356399.shtml</url>
            <keywords>小金魚</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/p/2015-01-04/U11556P1T1D31356399F21DT20150104061042.jpg</img>
            <level>0</level>
            <createtime>1420323042</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>現代快報</media_name>
        </item>
        <item>
            <id>1-1-31356398</id>
            <column>shwx</column>
            <title>男實習醫生以看病為由施暴女網友被刑拘</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060931356398.shtml</url>
            <keywords>施暴</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322948</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>現代快報</media_name>
        </item>
        <item>
            <id>1-1-31356397</id>
            <column>shwx</column>
            <title>男子上樓取物車被人開跑 次日接電話被罵亂停車</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060731356397.shtml</url>
            <keywords>亂停車</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/2015-01-04/U10608P1T1D31356397F21DT20150104060808.jpg</img>
            <level>0</level>
            <createtime>1420322833</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新文化報</media_name>
        </item>
        <item>
            <id>1-1-31356396</id>
            <column>shwx</column>
            <title>女子為使皮膚好連啃3天豬腳下巴脫臼</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060431356396.shtml</url>
            <keywords>脫臼</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>1</level>
            <createtime>1420322648</createtime>
            <old_level>1</old_level>
            <media_type />
            <media_name>中國網</media_name>
        </item>
        <item>
            <id>1-1-31356391</id>
            <column>shwx</column>
            <title>女孩與父親争吵後失聯半個月 被找到時已身亡</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055931356391.shtml</url>
            <keywords>遇難,失聯</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322345</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>錢江晚報</media_name>
        </item>
        <item>
            <id>1-1-31356389</id>
            <column>shwx</column>
            <title>男子因與女兒争吵在高速上賭氣下車後迷路</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055931356389.shtml</url>
            <keywords>高速公路,争吵</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322345</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>錢江晚報</media_name>
        </item>
        <item>
            <id>1-1-31356338</id>
            <column>shwx</column>
            <title>女子爬欄杆要跳河被6旬老人拉住</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055831356338.shtml</url>
            <keywords>跳河</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322308</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>現代快報</media_name>
        </item>
    </data>
</result>
           
package ivyy.taobao.com.domain.xml;

import ivyy.taobao.com.utils.GlobalConstants;

import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Command-line demo that downloads one page of the Sina rolling-news list,
 * extracts every {@code <item>} entry with jsoup and prints the body of each
 * linked article.
 *
 * @author liangjilong
 * @version 1.0
 */
public class SinaNew {

	/**
	 * Fetches page 2 of the news list in XML format, then fetches and prints
	 * the content of every listed article.
	 *
	 * <p>A failure while fetching a single article is reported on
	 * {@code System.err} and the loop moves on to the next item, so one bad
	 * link does not abort the whole run.
	 *
	 * @param args unused
	 * @throws Exception if the news list itself cannot be downloaded or parsed
	 */
	public static void main(String[] args) throws Exception {
		String requestURL = GlobalConstants.getUrl(2, "xml");
		org.jsoup.nodes.Document doc = Jsoup.parse(new URL(requestURL), 3000);
		// Each <item> element describes one news entry.
		Elements items = doc.select("item");

		int index = 0;
		for (Element item : items) {
			index++;
			// text() yields the unescaped value; html() would turn '&' in a
			// query string into "&amp;" and corrupt the URL.
			String url = item.select("url").text();
			try {
				String newsText = GlobalConstants.getNewsContent(url); // fetch the article body
				System.out.println("==================第" + index + "篇==================" + newsText);
			} catch (Exception e) {
				// Isolate per-article failures (bad URL, timeout, unexpected page layout).
				System.err.println("Skipping article " + index + " (" + url + "): " + e);
			}
		}
	}
}
           
package ivyy.taobao.com.utils;

import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Helpers for building the Sina rolling-news request URL and for fetching the
 * body of a single news article.
 *
 * @author liangjilong
 * @version 1.0
 */
public class GlobalConstants {

	/**
	 * Builds the request URL for the Sina rolling-news list.
	 *
	 * @param page   page number, appended as the {@code page} query parameter
	 * @param format response format, appended as the {@code format} query
	 *               parameter (XML or JSON)
	 * @return the complete request URL
	 */
	public static String getUrl(Integer page, String format) {
		StringBuilder query = new StringBuilder("http://api.roll.news.sina.com.cn/zt_list?channel=news");
		query.append("&cat_1=shxw"); // show news
		query.append("&cat_2==zqsk||=qwys||=shwx||=fz-shyf");
		query.append("&level==1||=2"); // level
		query.append("&show_ext=1");
		query.append("&show_all=1"); // show all
		query.append("&show_num=22"); // number of items to show
		query.append("&tag=1");
		query.append("&format=").append(format);
		query.append("&page=").append(page);
		query.append("&callback=newsloader");
		return query.toString();
	}


	/**
	 * Fetches an article page and returns the inner HTML of the element whose
	 * id is {@code artibody}, which is where Sina article pages keep the
	 * article text.
	 *
	 * <p>Pages that have no such element (for example slide-show or video
	 * pages) yield an empty string instead of failing with a
	 * {@code NullPointerException}. If a malformed page contains several
	 * elements with that id, their HTML is joined with line breaks.
	 *
	 * @param url address of the article page
	 * @return the article body HTML, or an empty string when the page has no
	 *         {@code artibody} element
	 * @throws Exception if the URL is malformed or the page cannot be fetched
	 */
	public static String getNewsContent(String url) throws Exception {
		Document doc = Jsoup.parse(new URL(url), 3000);
		if (doc == null) {
			// Defensive guard kept from the original implementation.
			return "網絡異常";
		}
		// select(...).html() returns "" when nothing matches, whereas
		// getElementById(...) returns null for a missing id.
		return doc.select("#artibody").html();
	}
}
           
package ivyy.taobao.com.utils;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Minimal HTTP helper built on {@link HttpURLConnection}.
 *
 * @author liangjilong
 * @version 1.0
 */

public class HttpRequestUtils {
	/**
	 * Sends an HTTP request and returns the response body decoded as UTF-8.
	 *
	 * <p>The body is read line by line and the lines are concatenated without
	 * line separators. No request body is written, so the method only selects
	 * the HTTP verb. Redirects are followed and caches are bypassed.
	 *
	 * <p>This method never throws: any failure (malformed URL, non-HTTP URL,
	 * invalid method, I/O error) is printed via {@code printStackTrace()} and
	 * whatever was read so far (an empty string if nothing) is returned.
	 *
	 * @param requestUrl address to request
	 * @param method     HTTP method to use, e.g. {@code "GET"} or {@code "POST"}
	 * @return the response body, or the part read before a failure occurred
	 */
	public static String HttpURLConnRequest(String requestUrl, String method) {
		StringBuilder body = new StringBuilder();
		HttpURLConnection connection = null;
		try {
			URL url = new URL(requestUrl);
			connection = (HttpURLConnection) url.openConnection();
			connection.setDoInput(true);
			connection.setRequestMethod(method);
			connection.setUseCaches(false);
			connection.setInstanceFollowRedirects(true); // follow redirects
			connection.connect();

			// Convert the returned input stream into a string. try-with-resources
			// closes the stream and reader even when reading fails half-way.
			try (InputStream in = connection.getInputStream();
					BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
				String line;
				while ((line = reader.readLine()) != null) {
					body.append(line);
				}
			}
		} catch (Exception e) {
			// Best-effort contract: report the problem and fall through to return what we have.
			e.printStackTrace();
		} finally {
			// Release the connection on every path, not only on success.
			if (connection != null) {
				connection.disconnect();
			}
		}
		return body.toString();
	}
}