天天看点

java 获取指定网页内容_java获取url网页指定内容

在已获取网页源代码(获取 HTML 网页源代码的方法请参见相关文章)的前提下,根据网页源代码提取指定内容。

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Regex-based helpers for extracting elements and attribute values from HTML source text.
 *
 * <p>Matching is purely textual, not a real HTML parse, and has these limitations:
 * <ul>
 *   <li>An element is matched from its opening tag up to the <em>first</em> following closing
 *       tag with the same name. Nested elements of the same name are therefore cut short and
 *       the inner ones are not reported separately.</li>
 *   <li>Element content and attribute values must not contain line terminators.</li>
 *   <li>Element and attribute names are compared case-sensitively and are treated as literal
 *       text, never as regular-expression syntax.</li>
 *   <li>Attribute values must be quoted with {@code '} or {@code "}.</li>
 * </ul>
 */
public class GetTheSpecifyHtmlTagCode {

    /** Separator placed between the element code and the attribute value in {@link #match} entries. */
    private static final String VS = "--------";

    /** Capturing group holding the attribute value; group 1 is the opening quote character. */
    private static final int ATTR_VALUE_GROUP = 2;

    /**
     * Finds every {@code element} that carries the attribute {@code byAttr}.
     *
     * @param source  HTML text to search; {@code null} is treated as empty
     * @param element tag name, e.g. {@code "div"}
     * @param byAttr  attribute name that must be present on the opening tag
     * @return one entry per match, in document order, formatted as
     *         {@code <element code> + "--------" + <attribute value>}; empty if nothing matches
     */
    public static List<String> match(String source, String element, String byAttr) {
        List<String> result = new ArrayList<String>();
        if (source == null) {
            return result;
        }
        Matcher m = elementPattern(element, byAttr).matcher(source);
        while (m.find()) {
            result.add(m.group(0) + VS + m.group(ATTR_VALUE_GROUP));
        }
        return result;
    }

    /**
     * Returns the value of {@code byAttr} on the last {@code element} in {@code source} that
     * carries that attribute.
     *
     * @param source  HTML text to search; {@code null} is treated as empty
     * @param element tag name
     * @param byAttr  attribute name whose value is wanted
     * @return the attribute value of the last match, or {@code ""} if there is no match
     */
    public static String getAttrValueByAttr(String source, String element, String byAttr) {
        String result = "";
        if (source == null) {
            return result;
        }
        Matcher m = elementPattern(element, byAttr).matcher(source);
        while (m.find()) {
            result = m.group(ATTR_VALUE_GROUP);
        }
        return result;
    }

    /**
     * Selects the {@code element} whose attribute {@code byAttr1} equals {@code attrValue} and
     * returns the value of {@code byAttr2} on that same element.
     *
     * @param htmlCode  HTML text to search; {@code null} is treated as empty
     * @param element   tag name
     * @param byAttr1   attribute used to select the element
     * @param attrValue required value of {@code byAttr1}
     * @param byAttr2   attribute whose value is returned
     * @return the value of {@code byAttr2} on the last selected element, or {@code ""} if no
     *         element is selected or the selected element has no {@code byAttr2}
     */
    public static String getAttrValue(String htmlCode, String element, String byAttr1, String attrValue,
            String byAttr2) {
        String result = "";
        if (htmlCode == null) {
            return result;
        }
        Pattern secondAttr = openingTagPattern(element, byAttr2);
        Matcher m = elementPattern(element, byAttr1).matcher(htmlCode);
        while (m.find()) {
            if (m.group(ATTR_VALUE_GROUP).equals(attrValue)) {
                // Read byAttr2 from the selected element's own opening tag (anchored at its
                // start), not from some other element elsewhere in the document.
                Matcher tag = secondAttr.matcher(m.group(0));
                result = tag.lookingAt() ? tag.group(ATTR_VALUE_GROUP) : "";
            }
        }
        return result;
    }

    /**
     * Returns the full code (opening tag, content, closing tag) of the {@code element} whose
     * attribute {@code byAttr} equals {@code attrValue}.
     *
     * @param htmlCode  HTML text to search; {@code null} is treated as empty
     * @param element   tag name
     * @param byAttr    attribute used to select the element
     * @param attrValue required value of {@code byAttr}
     * @return the code of the last matching element, or {@code ""} if none matches
     */
    public static String getElementById(String htmlCode, String element, String byAttr, String attrValue) {
        String result = "";
        if (htmlCode == null) {
            return result;
        }
        Matcher m = elementPattern(element, byAttr).matcher(htmlCode);
        while (m.find()) {
            // Compare the captured value directly; splitting a joined string on the separator
            // fails for empty values and for content that contains the separator.
            if (m.group(ATTR_VALUE_GROUP).equals(attrValue)) {
                result = m.group(0);
            }
        }
        return result;
    }

    /**
     * Returns the content of the {@code element} whose attribute {@code byAttr} equals
     * {@code attrValue}, i.e. its code with the opening and closing tags removed.
     *
     * @param htmlCode  HTML text to search; {@code null} is treated as empty
     * @param element   tag name
     * @param byAttr    attribute used to select the element
     * @param attrValue required value of {@code byAttr}
     * @return the element content, or {@code ""} if no element matches
     */
    public static String getElementValueByAttr(String htmlCode, String element, String byAttr,
            String attrValue) {
        String elementCode = getElementById(htmlCode, element, byAttr, attrValue);
        return getElementValueByElementCode(elementCode, element, byAttr);
    }

    /**
     * Strips the tags from an element's code and returns what is left.
     *
     * <p>The first opening {@code element} tag carrying {@code byAttr} is removed, then every
     * occurrence of the closing tag is removed.
     *
     * @param elementCode code of a single element, as returned by {@link #getElementById};
     *                    {@code null} is treated as empty
     * @param element     tag name
     * @param byAttr      attribute present on the opening tag
     * @return the remaining content; the input minus closing tags if no opening tag matches
     */
    public static String getElementValueByElementCode(String elementCode, String element, String byAttr) {
        if (elementCode == null) {
            return "";
        }
        String content = elementCode;
        Matcher m = openingTagPattern(element, byAttr).matcher(elementCode);
        if (m.find()) {
            // Remove the element's own (first) opening tag by position, so a nested tag that
            // also matches is left untouched.
            content = elementCode.substring(0, m.start()) + elementCode.substring(m.end());
        }
        return content.replace(closingTag(element), "");
    }

    /** Builds the regex source for an opening {@code element} tag that carries {@code byAttr}. */
    private static String openingTagRegex(String element, String byAttr) {
        // (?=\s) stops "a" from matching "<abbr"; the mandatory \s before the attribute name
        // stops "id" from matching inside "data-id"; \1 requires the closing quote to be the
        // same character as the opening one so values like "it's" are captured whole.
        return "<" + Pattern.quote(element) + "(?=\\s)[^<>]*?\\s" + Pattern.quote(byAttr)
                + "=(['\"])(.*?)\\1[^>]*>";
    }

    /** Compiles a pattern matching only the opening tag. */
    private static Pattern openingTagPattern(String element, String byAttr) {
        return Pattern.compile(openingTagRegex(element, byAttr));
    }

    /** Compiles a pattern matching the opening tag, the content and the first closing tag. */
    private static Pattern elementPattern(String element, String byAttr) {
        return Pattern.compile(openingTagRegex(element, byAttr) + ".*?" + Pattern.quote(closingTag(element)));
    }

    /** Returns the literal closing tag for {@code element}. */
    private static String closingTag(String element) {
        return "</" + element + ">";
    }

}