抓取位址:http://www.120ask.com
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Command-line entry point: walks a fixed list of listing URLs page by page
 * and hands every page to {@link HtmlMethod#getSubUrl}.
 *
 * @author darker
 *
 */
public class HtmlHttpClient {

    /**
     * Crawls pages 1..200 of each configured listing URL and prints the
     * elapsed milliseconds per listing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HtmlMethod crawler = new HtmlMethod();
        List<String> listingUrls = new ArrayList<String>(Arrays.asList(
                "http://www.120ask.com/list/jzxjb/",
                "http://www.120ask.com/list/tnbz/"));
        final int lastPage = 200;
        final String classify = "内科";
        final String section = "内分泌科";

        for (String listingUrl : listingUrls) {
            long startedAt = System.currentTimeMillis();
            // Page 1 is the bare listing URL; later pages live under "all/<n>".
            crawler.getSubUrl(classify, section, listingUrl, 1);
            for (int pageNo = 2; pageNo <= lastPage; pageNo++) {
                crawler.getSubUrl(classify, section, listingUrl + "all/" + pageNo, pageNo);
            }
            long elapsed = System.currentTimeMillis() - startedAt;
            System.out.println(listingUrl + "用時:" + elapsed);
        }
    }
}
主要代碼
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.ninemax.ak.base.SpringContextUtil;
import com.ninemax.ak.dao.CommonDao;
/**
 * Crawler logic: walks section pages, question listings and question detail
 * pages, extracts question/answer fields by tag and attribute, and stores
 * them through {@link CommonDao#saveList}.
 *
 * <p>NOTE(review): several match literals below (e.g. the "published by",
 * "hospital", "department" labels and the anchor titles) are written in
 * Traditional Chinese. Presumably the crawled pages use Simplified Chinese,
 * in which case these replacements never match — verify against live markup.
 */
public class HtmlMethod {
    public CommonDao commonDao = SpringContextUtil.getBean("commonDao");
    HtmlParser htmlParser = new HtmlParser();
    String webId = "120ASK";
    String qid = "";
    // Running counter used to build question ids
    int qcount = 0;
    // Running counter used to build answer ids; reset after every question
    int acount = 0;
    List<Map<String, Object>> q_result = new ArrayList<Map<String, Object>>();
    List<Map<String, Object>> a_result = new ArrayList<Map<String, Object>>();

    /**
     * Level-2 page: reads the section links from {@code second_url} and
     * crawls pages 1..200 of every section.
     *
     * @param classify   classification label stored with every question
     * @param second_url URL of the page listing the sections
     */
    public void getUrlAndSection(String classify, String second_url) {
        String second_html = HtmlUtil.getHtmlByURL(second_url);
        if (second_html == null) {
            // getHtmlByURL returns null on any fetch error; nothing to parse.
            System.err.println("fetch failed, skipped: " + second_url);
            return;
        }
        String second_details_html = htmlParser.get_TagNode_Html(second_html, "div", "class", "h-left fl");
        String second_ul_html = htmlParser.get_TagNode_Html(second_details_html, "ul", "class", "clears h-ul1");
        List<String> second_li_List = htmlParser.get_TagNode_HtmlList(second_ul_html, "li");
        // Number of listing pages crawled per section
        int totalPage = 200;
        if (HtmlUtil.isEmptyList(second_li_List)) {
            return;
        }
        for (String str_li : second_li_List) {
            String third_url = htmlParser.get_TagNode_attr(str_li, "a", "href");
            if (HtmlUtil.isEmptyTrim(third_url)) {
                // No usable href: concatenating would yield "nullall/2"-style URLs.
                continue;
            }
            String section = htmlParser.get_LinkTag_text(str_li, "a");
            for (int i = 1; i <= totalPage; i++) {
                // Page 1 is the bare section URL; later pages live under "all/<n>".
                String pageUrl = (i == 1) ? third_url : third_url + "all/" + i;
                getSubUrl(classify, section, pageUrl, i);
            }
        }
    }

    /**
     * Level-3 page: reads one question-listing page and crawls every
     * question linked from it.
     *
     * @param classify  classification label stored with every question
     * @param section   section label stored with every question
     * @param third_url URL of the listing page
     * @param page      page number stored with every question
     */
    public void getSubUrl(String classify, String section, String third_url, int page) {
        String third_html = HtmlUtil.getHtmlByURL(third_url);
        if (third_html == null) {
            System.err.println("fetch failed, skipped: " + third_url);
            return;
        }
        String third_details_html = htmlParser.get_TagNode_Html(third_html, "div", "class", "t13 h-main");
        String third_ul_html = htmlParser.get_TagNode_Html(third_details_html, "ul", "class", "clears h-ul3");
        List<String> third_li_List = htmlParser.get_TagNode_HtmlList(third_ul_html, "li");
        if (HtmlUtil.isEmptyList(third_li_List)) {
            return;
        }
        for (String str_li : third_li_List) {
            // Anchor that carries the question link and title
            String li_a_html = htmlParser.get_TagNode_Html(str_li, "a", "class", "q-quename");
            String suburl = htmlParser.get_TagNode_attr(li_a_html, "a", "href");
            if (HtmlUtil.isEmptyTrim(suburl)) {
                continue;
            }
            String title = htmlParser.get_LinkTag_text(li_a_html, "a");
            getDetails(classify, third_url, section, suburl, title, page);
        }
    }

    /**
     * Question detail page: extracts the question and all of its answers and
     * saves them to the tables {@code 120ask_q} and {@code 120ask_a}.
     *
     * @param classify classification label
     * @param url      URL of the listing page the question was found on
     * @param section  section label
     * @param suburl   URL of the question detail page
     * @param title    question title taken from the listing
     * @param page     listing page number
     */
    public void getDetails(String classify, String url, String section, String suburl, String title, int page) {
        String details_html = HtmlUtil.getHtmlByURL(suburl);
        if (details_html == null) {
            System.err.println("fetch failed, skipped: " + suburl);
            return;
        }
        String ask_html_information = htmlParser.get_TagNode_Html(details_html, "div", "class", "b_askbox");
        // <span> elements describing the asker (sex/age first, then candidates for the date)
        List<String> q_details_List = htmlParser.get_TagNode_HtmlList(
                htmlParser.get_TagNode_Html(ask_html_information, "div", "class", "b_askab1"), "span");

        String sex = "";
        String age = "";
        String editTime = "";
        if (!HtmlUtil.isEmptyList(q_details_List)) {
            for (int idx = 0; idx < q_details_List.size(); idx++) {
                String details = q_details_List.get(idx);
                if (idx == 0) {
                    String sexAndage = details.replace("<span>", "").replace("</span>", "");
                    // NOTE(review): split("")[1] is the 2nd character on Java 8+ but the
                    // 1st on Java 7 and earlier — confirm which one holds the sex.
                    String[] pieces = sexAndage.split("");
                    if (pieces.length > 1) {
                        sex = pieces[1];
                    }
                    // Guarded: the unguarded substring threw on strings shorter than 4.
                    if (sexAndage.length() >= 4) {
                        age = sexAndage.substring(4);
                    }
                } else if (idx <= 3) {
                    String time = details.trim().replace("<span>", "").replace("</span>", "");
                    // NOTE(review): only timestamps starting with "2017" are kept;
                    // questions from other years end up with an empty editTime.
                    if (time.startsWith("2017")) {
                        editTime = time;
                    }
                }
            }
        }

        // Question text with the wrapping markup and whitespace stripped.
        // NOTE(review): the two trailing whitespace replacements look identical;
        // one was possibly a non-breaking space originally — confirm.
        String question = htmlParser.get_TagNode_Html(ask_html_information, "p", "class", "crazy_new")
                .replace("<p class='crazy_new'>", "")
                .replace("<span>", "")
                .replace("</span>", "")
                .replace("</p>", "")
                .replace(" ", "")
                .replace(" ", "");

        // Asker name: first <span> of the answer area minus its embedded <a>.
        String answerAreaSpan = htmlParser.get_TagNode_Html(
                htmlParser.get_TagNode_Html(ask_html_information, "div", "class", "b_answerarea"), "span");
        String username = answerAreaSpan
                .replace(htmlParser.get_TagNode_Html(answerAreaSpan, "a"), "")
                .replace("<span>釋出人:", "")
                .replace("</span>", "")
                .replace("<var class=\"ask_Author\">", "")
                .replace("</var>", "")
                .replace(" ", "");

        Map<String, Object> qtmp = new HashMap<String, Object>();
        qid = webId + HtmlUtil.getStringByLength(12, ++qcount);
        qtmp.put("id", qid);
        qtmp.put("classify", classify);
        qtmp.put("section", section);
        qtmp.put("question", question);
        qtmp.put("title", title);
        qtmp.put("editTime", editTime);
        qtmp.put("url", url);
        qtmp.put("suburl", suburl);
        qtmp.put("username", username);
        qtmp.put("sex", sex);
        qtmp.put("age", age);
        qtmp.put("page", page);
        q_result.add(qtmp);
        save("120ask_q", q_result,
                "id,classify,section,question,title,editTime,url,suburl,username,sex,age,page");

        // One html fragment per answer
        List<String> a_details_List = htmlParser.get_TagNode_HtmlList(
                htmlParser.get_TagNode_Html(details_html, "div", "class", "b_answerbox t10"),
                "div", "class", "b_answerli");
        for (String answerHtml : a_details_List) {
            a_result.add(buildAnswer(answerHtml));
        }
        save("120ask_a", a_result,
                "id,qid,job,hospital,department,speciality,username,answer,quicktime,url,page");

        a_result = new ArrayList<Map<String, Object>>();
        q_result = new ArrayList<Map<String, Object>>();
        // Answer ids restart for the next question
        acount = 0;
    }

    /** Extracts one answer (and, when reachable, the answerer's profile) into a row map. */
    private Map<String, Object> buildAnswer(String answerHtml) {
        String answerDepartment = "";
        String answerHospital = "";
        String answerSpeciality = "";
        // Link to the answerer's profile page
        String answerUrl = htmlParser.get_TagNode_attr(answerHtml, "a", "href");
        System.out.println("url:" + answerUrl);

        // get_TagNode_attr yields "" (no <a>) or null (no href); skip both.
        if (!HtmlUtil.isEmptyTrim(answerUrl)) {
            String profileHtml = HtmlUtil.getHtmlByURL(answerUrl);
            if (profileHtml != null) {
                String answer_identification_html =
                        htmlParser.get_TagNode_Html(profileHtml, "div", "class", "contAbout");
                answerHospital = htmlParser.get_TagNode_Html(
                        htmlParser.get_TagNode_Html(answer_identification_html, "p", "class", "p2 clears"), "span")
                        .replace("<span><b>醫院:</b>", "")
                        .replace("</span>", "")
                        .trim();
                // Only the first matching paragraph carries the department
                List<String> departmentBlocks =
                        htmlParser.get_TagNode_HtmlList(answer_identification_html, "p", "class", "p2 clears p3");
                if (!departmentBlocks.isEmpty()) {
                    answerDepartment = htmlParser.get_TagNode_Html(departmentBlocks.get(0), "span")
                            .replace("<span><b>科室:</b>", "")
                            .replace("</span>", "")
                            .replace(" ", "");
                }
                answerSpeciality = htmlParser.get_TagNode_Html(answer_identification_html, "p", "class", "p4")
                        .replace("<p class=\"p4\"><b>擅長:</b>", "")
                        .replace("</p>", "")
                        .replace(" ", "");
            }
        }

        String nameSpan = htmlParser.get_TagNode_Html(answerHtml, "span", "class", "b_sp1");
        String answerName = htmlParser.get_LinkTag_text(nameSpan, "a");
        // Job title: the name span minus the profile anchor and wrapping markup
        String answerJob = nameSpan
                .replace(htmlParser.get_TagNode_Html(nameSpan, "a", "title", "檢視這個會員資訊"), "")
                .replace("<span class=\"b_sp1\"> ", "")
                .replace("</span>", "")
                .replace("<i>", "")
                .replace("</i>", "")
                .replace(" ", "");
        String answerContent = htmlParser.get_TagNode_Html(answerHtml, "p")
                .replace(" ", "")
                .replace("<p>", "")
                .replace("</p>", "")
                .replace("<br />", "")
                .replace(" ", "");
        String timeSpan = htmlParser.get_TagNode_Html(answerHtml, "span", "class", "b_anscont_time");
        // Answer time: the time span minus the "report" anchor and wrapping markup
        String answerTime = timeSpan
                .replace(htmlParser.get_TagNode_Html(timeSpan, "a", "title", "對這個回答進行投訴"), "")
                .replace("<span class=\"b_anscont_time\">", "")
                .replace("</span>", "")
                .trim();

        Map<String, Object> atmp = new HashMap<String, Object>();
        atmp.put("id", qid + HtmlUtil.getStringByLength(3, ++acount));
        atmp.put("qid", qid);
        atmp.put("job", answerJob);
        atmp.put("hospital", answerHospital);
        atmp.put("department", answerDepartment);
        atmp.put("speciality", answerSpeciality);
        atmp.put("answer", answerContent);
        atmp.put("quicktime", answerTime);
        atmp.put("url", answerUrl);
        atmp.put("username", answerName);
        // NOTE(review): always 1, unlike the question row which stores the real page — confirm.
        atmp.put("page", 1);
        return atmp;
    }

    /** Saves rows to a table; failures are printed and do not stop the crawl. */
    private void save(String table, List<Map<String, Object>> rows, String columns) {
        try {
            commonDao.saveList(null, table, rows, HtmlUtil.splitList(columns, ","));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
HTML解析類
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Thin helpers around org.htmlparser for pulling tags, attributes and link
 * text out of an html string. Parse failures are logged and reported as
 * "nothing found" rather than thrown.
 */
public class HtmlParser {
    public static Logger log = Logger.getLogger(HtmlParser.class);

    /**
     * Returns the html of the first tag with the given name.
     *
     * @param html        html text to search; null or blank yields ""
     * @param TagNodeName tag name
     * @return html of the first match, or "" when there is none
     */
    public String get_TagNode_Html(String html, String TagNodeName) {
        NodeList nodes = findNodes(html, new TagNameFilter(TagNodeName));
        return nodes.size() > 0 ? nodes.elementAt(0).toHtml() : "";
    }

    /**
     * Returns the html of the first tag with the given name and attribute value.
     *
     * @param html         html text to search; null or blank yields ""
     * @param TagNodeName  tag name
     * @param setAttrName  attribute name
     * @param setAttrValue attribute value
     * @return html of the first match, or "" when there is none
     */
    public String get_TagNode_Html(String html, String TagNodeName, String setAttrName, String setAttrValue) {
        NodeList nodes = findNodes(html, tagWithAttribute(TagNodeName, setAttrName, setAttrValue));
        return nodes.size() > 0 ? nodes.elementAt(0).toHtml() : "";
    }

    /**
     * Returns the html of every tag with the given name.
     *
     * @param html        html text to search; null or blank yields an empty list
     * @param TagNodeName tag name
     * @return html of each match, blank entries omitted; never null
     */
    public List<String> get_TagNode_HtmlList(String html, String TagNodeName) {
        return toHtmlList(findNodes(html, new TagNameFilter(TagNodeName)));
    }

    /**
     * Returns the html of every tag with the given name and attribute value.
     *
     * @param html         html text to search; null or blank yields an empty list
     * @param TagNodeName  tag name
     * @param setAttrName  attribute name
     * @param setAttrValue attribute value
     * @return html of each match, blank entries omitted; never null
     */
    public List<String> get_TagNode_HtmlList(String html, String TagNodeName, String setAttrName, String setAttrValue) {
        return toHtmlList(findNodes(html, tagWithAttribute(TagNodeName, setAttrName, setAttrValue)));
    }

    /**
     * Returns an attribute of the first tag with the given name.
     *
     * @param html        html text to search
     * @param TagNodeName tag name
     * @param getAttrName attribute to read
     * @return "" when no such tag exists; otherwise whatever
     *         {@code TagNode.getAttribute} returns for the first match
     *         (presumably null when the attribute is absent — callers should
     *         handle both)
     */
    public String get_TagNode_attr(String html, String TagNodeName, String getAttrName) {
        NodeList nodes = findNodes(html, new TagNameFilter(TagNodeName));
        if (nodes.size() == 0) {
            return "";
        }
        return ((TagNode) nodes.elementAt(0)).getAttribute(getAttrName);
    }

    /**
     * Returns the trimmed text of the first tag with the given name.
     *
     * @param html        html text to search
     * @param TagNodeName tag name, normally "a"
     * @return trimmed link text, or "" when no such tag exists
     */
    public String get_LinkTag_text(String html, String TagNodeName) {
        NodeList nodes = findNodes(html, new TagNameFilter(TagNodeName));
        if (nodes.size() == 0) {
            return "";
        }
        TagNode first = (TagNode) nodes.elementAt(0);
        if (first instanceof LinkTag) {
            return ((LinkTag) first).getLinkText().trim();
        }
        // Not an anchor: fall back to its plain text instead of failing the cast.
        String plain = first.toPlainTextString();
        return plain == null ? "" : plain.trim();
    }

    /** Builds a filter matching tags of the given name that carry attribute=value. */
    private NodeFilter tagWithAttribute(String tagName, String attrName, String attrValue) {
        return new AndFilter(new TagNameFilter(tagName), new HasAttributeFilter(attrName, attrValue));
    }

    /** Parses html and returns all nodes accepted by the filter; empty (never null) on blank input or parse failure. */
    private NodeList findNodes(String html, NodeFilter filter) {
        NodeList found = new NodeList();
        // Blank input is skipped up front: nothing to parse, and the parser
        // treats strings that are not markup as a URL or file name to open.
        if (HtmlUtil.isEmptyTrim(html)) {
            return found;
        }
        try {
            NodeList parsed = new Parser(html).extractAllNodesThatMatch(filter);
            if (parsed != null) {
                found = parsed;
            }
        } catch (ParserException e) {
            log.error("html:" + html, e);
        }
        return found;
    }

    /** Serialises every node to html, dropping blank results. */
    private List<String> toHtmlList(NodeList nodes) {
        List<String> result = new ArrayList<String>();
        for (int i = 0; i < nodes.size(); i++) {
            String s = nodes.elementAt(i).toHtml();
            if (!HtmlUtil.isEmptyTrim(s)) {
                result.add(s);
            }
        }
        return result;
    }
}
工具類
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Static helpers for the crawler: page download, html clean-up, and small
 * string/collection utilities.
 */
public class HtmlUtil {
    /**
     * Left-pads the string form of {@code count} with zeros to {@code length}.
     *
     * @param length target width; null or non-positive disables padding
     * @param count  value to format; null yields ""
     * @return the padded string, or the unpadded string when it is already
     *         at least {@code length} characters long
     */
    public static String getStringByLength(Integer length, Object count) {
        if (count == null) {
            return "";
        }
        String str = count.toString();
        if (length == null || str.length() >= length) {
            return str;
        }
        StringBuilder sb = new StringBuilder(length);
        for (int i = str.length(); i < length; i++) {
            sb.append('0');
        }
        return sb.append(str).toString();
    }

    /**
     * Downloads a page with an HTTP GET and returns it after {@link #clearHtml}.
     *
     * @param url address to fetch
     * @return the cleaned html, or null when the url is blank or the request fails
     */
    public static String getHtmlByURL(String url) {
        if (isEmptyTrim(url)) {
            return null;
        }
        HttpClient client = new HttpClient();
        HttpMethod method = null;
        try {
            // Constructed inside the try so a malformed url is reported like
            // any other fetch failure instead of escaping to the caller.
            method = new GetMethod(url);
            client.executeMethod(method);
            return clearHtml(method.getResponseBodyAsString());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back, even on failure.
            if (method != null) {
                method.releaseConnection();
            }
        }
        return null;
    }

    /**
     * Normalises html for parsing: non-breaking spaces become plain spaces,
     * then line breaks/tabs, comments and script blocks are removed.
     *
     * @param html raw html, may be null
     * @return cleaned html, or null when {@code html} is null
     */
    public static String clearHtml(String html) {
        if (null != html) {
            // Both the entity and the raw character are mapped to a space so
            // later literal matches see one kind of blank.
            return html.replace("&nbsp;", " ")
                    .replace('\u00A0', ' ')
                    .replaceAll("[\\f\\n\\r\\t\\v]", "")
                    .replaceAll("<!--(?!-->).*?-->", "")
                    .replaceAll("<script(?!</script>).*?</script>", "");
        }
        return null;
    }

    /**
     * Parses a decimal, an integer, or a fraction written as "a/b".
     *
     * @param s text to parse
     * @return the numeric value; for "a/b" the quotient a divided by b
     */
    public static double stringToDouble(String s) {
        if (s.indexOf("/") != -1) {
            // Fraction form
            String[] parts = s.split("/");
            double fenZi = Double.parseDouble(parts[0]);
            double fenMu = Double.parseDouble(parts[1]);
            return fenZi / fenMu;
        }
        // Plain decimal or integer
        return Double.parseDouble(s);
    }

    /**
     * Tells whether a string is null or has no characters.
     *
     * @param s string to test
     * @return true when {@code s} is null or ""
     */
    public static boolean isNullOrEmpty(String s) {
        return null == s || s.length() < 1;
    }

    /**
     * Tells whether a string is null, empty, or only whitespace.
     *
     * @param s string to test
     * @return true when {@code s} is null or trims to ""
     */
    public static boolean isEmptyTrim(String s) {
        return null == s || s.trim().length() < 1;
    }

    /**
     * Tells whether a list is null or has no elements.
     *
     * @param list list to test
     * @return true when {@code list} is null or empty
     */
    public static boolean isEmptyList(List<?> list) {
        return null == list || list.size() <= 0;
    }

    /**
     * Tells whether a map is null or has no entries.
     *
     * @param map map to test
     * @return true when {@code map} is null or empty
     */
    public static boolean isEmptyMap(Map<?, ?> map) {
        return null == map || map.size() <= 0;
    }

    /**
     * Splits a string and returns the trimmed, non-blank pieces.
     *
     * @param in    text to split; null yields an empty list
     * @param split delimiter, interpreted as a regular expression by {@code String.split}
     * @return the pieces in order; never null
     */
    public static List<String> splitList(String in, String split) {
        List<String> tags = new ArrayList<String>();
        if (in == null) {
            return tags;
        }
        for (String s : in.split(split)) {
            String piece = s.trim();
            if (piece.length() > 0) {
                tags.add(piece);
            }
        }
        return tags;
    }

    /**
     * Joins the trimmed, non-blank elements of a collection with a delimiter.
     *
     * @param in    elements to join; null or empty yields ""
     * @param split delimiter placed between elements
     * @return the joined string without a trailing delimiter
     */
    public static String splitCollection(Collection<String> in, String split) {
        StringBuilder sb = new StringBuilder();
        if (null != in && in.size() > 0) {
            for (String t : in) {
                if (null != t && t.trim().length() > 0) {
                    sb.append(t.trim()).append(split);
                }
            }
        }
        // Drop the delimiter appended after the last element.
        return sb.length() > 0 ? sb.substring(0, sb.lastIndexOf(split)) : "";
    }
}
Remark:QQ交流群:260052172
版權聲明:本文為CSDN部落客「weixin_33774883」的原創文章,遵循CC 4.0 BY-SA版權協定,轉載請附上原文出處連結及本聲明。
原文連結:https://blog.csdn.net/weixin_33774883/article/details/92040556