Jsoup參見百度http://baike.baidu.com/view/4066913.htm
官方網站:http://jsoup.org/
Jsoup解析HTML檔案基本步驟:
- 擷取Connection對象
- 通過Connection擷取文檔Document對象
- 根據需要解析文檔
例子1:
import java.io.IOException;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Minimal jsoup walk-through: open a connection, fetch the document, then
 * pick elements out of it.
 */
public class Demo1 {
    /** Address of the page that is downloaded and parsed. */
    private static final String URL = "http://aiilive.blog.51cto.com/";

    /**
     * Fetches {@link #URL} and prints the inner HTML of every {@code <ul>}
     * element in the returned document, one element per println call.
     *
     * @param args unused
     * @throws IOException propagated from the page fetch
     */
    public static void main(String[] args) throws IOException {
        // Step 1: obtain a connection for the URL.
        Connection connection = Jsoup.connect(URL);

        // Step 2: execute the GET request and obtain the parsed document.
        Document page = connection.get();

        // Step 3: select every <ul> element and dump its inner HTML.
        Elements lists = page.getElementsByTag("ul");
        final int total = lists.size();
        int index = 0;
        while (index < total) {
            System.out.println(lists.get(index).html());
            index++;
        }
    }
}
例子2:來自org.jsoup.examples.ListLinks.java
package org.jsoup.examples;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Command-line example that fetches a single URL and lists the media,
 * imports and links found in the page.
 */
public class ListLinks {
    /**
     * Fetches the page named by the only command-line argument and prints
     * three sections: elements carrying a {@code src} attribute, {@code <link href>}
     * imports, and {@code <a href>} links.
     *
     * @param args exactly one element: the URL to fetch
     * @throws IOException propagated from the page fetch
     */
    public static void main(String[] args) throws IOException {
        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
        final String target = args[0];
        print("Fetching %s...", target);

        final Document page = Jsoup.connect(target).get();
        final Elements anchors = page.select("a[href]");
        final Elements sources = page.select("[src]");
        final Elements linkTags = page.select("link[href]");

        print("\nMedia: (%d)", sources.size());
        for (Element item : sources) {
            final String tag = item.tagName();
            final String location = item.attr("abs:src");
            if (!"img".equals(tag)) {
                print(" * %s: <%s>", tag, location);
                continue;
            }
            // Images additionally report their dimensions and a shortened alt text.
            print(" * %s: <%s> %sx%s (%s)",
                    tag, location, item.attr("width"), item.attr("height"),
                    trim(item.attr("alt"), 20));
        }

        print("\nImports: (%d)", linkTags.size());
        for (Element item : linkTags) {
            print(" * %s <%s> (%s)", item.tagName(), item.attr("abs:href"), item.attr("rel"));
        }

        print("\nLinks: (%d)", anchors.size());
        for (Element item : anchors) {
            print(" * a: <%s> (%s)", item.attr("abs:href"), trim(item.text(), 35));
        }
    }

    /**
     * Formats the message with {@link String#format(String, Object...)} and
     * writes it to standard output followed by a line break.
     *
     * @param msg  format string
     * @param args format arguments
     */
    private static void print(String msg, Object... args) {
        final String line = String.format(msg, args);
        System.out.println(line);
    }

    /**
     * Shortens a string to at most {@code width} characters; a shortened
     * string keeps its first {@code width - 1} characters followed by ".".
     *
     * @param s     text to shorten
     * @param width maximum length of the result
     * @return {@code s} unchanged when it fits, otherwise the shortened form
     */
    private static String trim(String s, int width) {
        return s.length() > width ? s.substring(0, width - 1) + "." : s;
    }
}
在解析51CTO部落格中的友情連結的想法:
- 選擇一個部落格首頁位址,如:http://aiilive.blog.51cto.com
- 通過<1>中的位址,擷取到相對應的友情連結;
- 将友情連結作為<1>中的位址繼續擷取;
解析的前提是對部落格頁面進行分析,下面是通過FireFox的程式員開發工具分析圖:
友情連結在<div class="friendLink box">...</div>裡面。
Jsoup的優秀之處之一是提供了類似CSS選擇器的方法,可以對特定标簽進行選擇過濾。
友情連結的部分如下圖:
做法一:
- 篩選出div class;
- 篩選出屬性class的值為friendLink box
- 擷取<a>...</a>的HTML代碼
- 處理<3>字元串
代碼1:擷取<li>中的HTML代碼
/**
 * Locate the friend-link box (the div whose class attribute is exactly
 * "friendLink box") and hand its <li> elements to printElements.
 */
Element e = null;
Elements elements = doc.select("div[class]");
for (Element candidate : elements) {
    if ("friendLink box".equals(candidate.attr("class"))) {
        e = candidate;
        break;
    }
}
// e stays null when the page has no friend-link box; previously the last div
// (or null) was used in that case.
if (e != null) {
    // select("li") filters for <li> elements. Elements.tagName("li") would
    // instead rename every child element to <li>.
    printElements(e.select("li"));
}
代碼2:處理<3>中的字元串
/**
 * Passes the inner HTML of every element in the collection to getUrl.
 *
 * @param es elements whose inner HTML is processed
 */
private static void printElements(Elements es) {
    for (Element item : es) {
        getUrl(item.html());
    }
}
/**
 * Splits the given HTML on double quotes and prints every piece that
 * contains "http". Example input:
 * {@code <a href="http://21cnbao.blog.51cto.com" title="..." target="_blank">...</a>}
 *
 * @param str HTML fragment to scan
 */
private static void getUrl(String str) {
    for (String piece : str.split("\"")) {
        if (piece.contains("http")) {
            System.out.println(piece);
        }
    }
}
做法二:
做一個微小的改動:對上述代碼2中的printElements(Elements es){...}進行改動。
如下:
// Attribute key used to read a link's href; the "abs:" prefix asks jsoup for the absolute URL.
private static final String ATTRIBUTE_VALUE_ABS = "abs:href";

/**
 * Rewritten variant: reads the href of each element and stores it in
 * linkSetTemp when it passes filter51ctoBlog.
 *
 * @param es link elements to inspect
 */
private static void getUrlExt(Elements es) {
    for (Element link : es) {
        String target = link.attr(ATTRIBUTE_VALUE_ABS);
        // Keep 51CTO blog addresses only.
        if (!filter51ctoBlog(target)) {
            continue;
        }
        linkSetTemp.add(target);
    }
}
filter51ctoBlog(href)
做51CTO部落格篩選,是為了保證URL的格式符合51CTO部落格首頁格式。
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Demo4 {
//周遊最小深度
private static final int LEVEL_MIN = 1;
//周遊最大深度
private static final int LEVEL_MAX = 4;
//程式入口URL
private static final String IN_URL = "http://aiilive.blog.51cto.com/";
//友情連結标記
// private static final String TAGNAME = "li";
//DIV CLASS篩選條件
private static final String SELECT_DIV = "div[class]";
private static final String ATTRIBUTE = "class";
//友情連結CLASS的值
private static final String ATTRIBUTE_VALUE = "friendLink box";
//超連結篩選Query
private static final String ATTRIBUTE_VALUE_ABS = "abs:href";
private static final String SELECT_TAG = "a[href]";
//存放周遊的友情連結
private static Set<String> linkSet = new HashSet<String>();
private static Set<String> linkSetTemp = null;
static {
linkSet.add(IN_URL);
}
/**
* 第一種:擷取友情連結的URL
*
* @param es
*/
@SuppressWarnings("unused")
private static void getUrl(Elements es) {
for (int i = 0, j = es.size(); i < j; i++) {
Element e = es.get(i);
String str = e.html();
String[] strs = str.split("\"");
for (String strs ) {
if (href.indexOf("http") != -1) {
if (filter51ctoBlog(href)) {
linkSetTemp.add(href);
}
}
}
}
}
/**
* 過濾非51CTO部落格首頁的URL
* @param str
* @return
*/
private static boolean filter51ctoBlog(String str) {
return str.endsWith("blog.51cto.com");
}
/**
* 第二種:擷取友情連結的URL
*
* @param es
*/
private static void getUrlExt(Elements es) {
for (Element element : es) {
String href = element.attr(ATTRIBUTE_VALUE_ABS);
if (filter51ctoBlog(href)) {
linkSetTemp.add(href);
}
}
}
/**
* 添加友情連結
*
* @param url
* @throws IOException
*/
private static void addFriendLink(String url) throws IOException {
Document doc = Jsoup.connect(url).get();
Element e = null;
Elements elements = doc.select(SELECT_DIV);
for (int i = 0; i < elements.size(); i++) {
e = elements.get(i);
if (e.attr(ATTRIBUTE).equals(ATTRIBUTE_VALUE)) {
break;
}
}
// 擷取FRIEND_BOX的孩子結點,篩選出li标簽
// getUrl(e.children().tagName(TAGNAME));
Elements es = e.children().select(SELECT_TAG);
getUrlExt(es);
}
/**
* 添加友情連結
*
* @param set
* @throws IOException
*/
private static void addFriendLink(Set<String> set) throws IOException {
for (Iterator<String> iter = set.iterator(); iter.hasNext();) {
addFriendLink(iter.next());
}
}
/**
* 列印集合資訊
*
* @param set
*/
private static void printSet(Set<String> set) {
for (Iterator<String> iter = set.iterator(); iter.hasNext();) {
System.out.println(iter.next());
}
}
public static void main(String[] args) throws IOException {
linkSet.add(IN_URL);
for (int i = LEVEL_MIN; i <= LEVEL_MAX; i++) {
linkSetTemp = new HashSet<String>();
addFriendLink(linkSet);
linkSet.addAll(linkSetTemp);
linkSetTemp = null;
}
System.out.println(linkSet.size());
printSet(linkSet);
}
}