天天看點

擷取省市區鎮爬蟲

1 package com.mock.utils;
  2 
  3 import java.io.IOException;
  4 import java.net.MalformedURLException;
  5 import java.util.ArrayList;
  6 import java.util.List;
  7 
  8 import org.jsoup.Jsoup;
  9 import org.jsoup.nodes.Document;
 10 import org.jsoup.nodes.Element;
 11 import org.jsoup.select.Elements;
 12 
 13 import com.gargoylesoftware.htmlunit.BrowserVersion;
 14 import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 15 import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
 16 import com.gargoylesoftware.htmlunit.WebClient;
 17 import com.gargoylesoftware.htmlunit.WebClientOptions;
 18 import com.gargoylesoftware.htmlunit.html.HtmlPage;
 19 import com.justsy.army.mgt.mock.model.City;
 20 
 21 public class NationalBureauOfStatics {
 22     private static final String ADDRESS = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
 23     private static final String fix = ".html";
 24 
 25     public static void main(String[] args) {
 26         List<City> provinceList = new ArrayList<>();
 27         List<City> cityList = new ArrayList<>();
 28         List<City> countyList = new ArrayList<>();
 29         List<City> townList = new ArrayList<>();
 30         provinceList = getTVMall(provinceList, new City(), ADDRESS, 0);
 31         for (City city : provinceList) {
 32             cityList = getTVMall(cityList, city, city.getHtmlAddr(), 1);
 33         }
 34         for (City city : cityList) {
 35             countyList = getTVMall(countyList, city, city.getHtmlAddr(), 2);
 36         }
 37         for (City city : countyList) {
 38             townList = getTVMall(townList, city, city.getHtmlAddr(), 3);
 39         }
 40 
 41         for (City city : townList) {
 42             System.out.println(city.toString());
 43         }
 44     }
 45 
 46     public static List<City> getTVMall(List<City> list, City city, String address, int type) {
 47         WebClient webClient = new WebClient(BrowserVersion.CHROME);
 48         // webclient參數載體
 49         WebClientOptions clientOptions = webClient.getOptions();
 50         // 設定webClient的相關參數
 51         clientOptions.setJavaScriptEnabled(true);
 52         clientOptions.setCssEnabled(false);
 53         webClient.setAjaxController(new NicelyResynchronizingAjaxController());
 54         clientOptions.setTimeout(35000);
 55         clientOptions.setThrowExceptionOnScriptError(false);
 56         try {
 57             HtmlPage htmlPage = webClient.getPage(address);
 58             Document dom = Jsoup.parse(htmlPage.asXml());
 59             Elements ele = null;
 60             if (type == 0) {
 61                 ele = dom.getElementsByClass("provincetable");
 62             } else if (type == 1) {
 63                 ele = dom.getElementsByClass("citytable");
 64             } else if (type == 2) {
 65                 ele = dom.getElementsByClass("countytable");
 66             } else if (type == 3) {
 67                 ele = dom.getElementsByClass("towntable");
 68             }
 69             dom = Jsoup.parse(ele.toString());
 70             ele = dom.getElementsByTag("tr");
 71             if (ele != null) {
 72                 getList(list, ele, city, type);
 73             }
 74         } catch (FailingHttpStatusCodeException e) {
 75             e.printStackTrace();
 76         } catch (MalformedURLException e) {
 77             e.printStackTrace();
 78         } catch (IOException e) {
 79             e.printStackTrace();
 80         }
 81         return list;
 82     }
 83 
 84     private static List<City> getList(List<City> list, Elements ele, City city, int type) {
 85         if (type == 0) {
 86             for (int i = 3; i < ele.size(); i++) {
 87                 Element item = ele.get(i);
 88                 Elements aElements = item.getElementsByTag("a");
 89                 for (int j = 0; j < aElements.size(); j++) {
 90                     City c = new City();
 91                     String html = aElements.get(j).attr("href");
 92                     String name = aElements.get(j).text();
 93                     c.setProvince(name);
 94                     c.setHtmlAddr(ADDRESS + html);
 95                     c.setCode(html.replace(fix, "0000000000"));
 96                     list.add(c);
 97                 }
 98             }
 99             return list;
100         }
101         for (int i = 0; i < ele.size(); i++) {
102             Element item = ele.get(i);
103             Elements aElements = item.getElementsByTag("a");
104             if (aElements.size() > 0) {
105                 City c = new City();
106                 String html = aElements.get(0).attr("href");
107                 String code = aElements.get(0).text();
108                 String name = aElements.get(1).text();
109                 if (type == 1) {
110                     c.setProvince(city.getProvince());
111                     c.setCity(name);
112                 } else if (type == 2) {
113                     c.setProvince(city.getProvince());
114                     c.setCity(city.getCity());
115                     c.setCounty(name);
116                 } else if (type == 3) {
117                     c.setProvince(city.getProvince());
118                     c.setCity(city.getCity());
119                     c.setCounty(city.getCounty());
120                     c.setTown(name);
121                 }
122                 c.setCode(code);
123                 String provinceCode = city.getCode().substring(0, 2);
124                 if (!html.startsWith(provinceCode + "/")) {
125                     html = provinceCode + "/" + html;
126                 }
127                 c.setHtmlAddr(ADDRESS + html);
128                 list.add(c);
129                 System.out.println(c.toString());
130             }
131         }
132         return list;
133     }
134 }