擷取資料的方法依然採用正則表達式(regex)比對;請求框架採用 Java,爬蟲腳本語言是 Groovy。程式在本地拼接好 SQL 語句後,發送到 MySQL 服務端完成存儲。
代碼如下:
package com.fan
2
3import com.fantest.httpclient.FanLibrary
4import com.fantest.mysql.MySqlTest
5import com.fantest.utils.Regex
6import net.sf.json.JSONObject
7
/**
 * Crawler that walks the paginated company listing, follows every detail link
 * found on each listing page, extracts ten labelled fields from the detail page
 * with regular expressions and stores them as one row of the {@code company}
 * table.
 *
 * HTTP helpers ({@code getHttpPost}, {@code getHttpGet}, {@code getHttpResponse}),
 * {@code output}, {@code sleep}, {@code testOver} and {@code EMPTY} are not
 * defined in this class; presumably they are inherited from {@code FanLibrary}.
 */
class Company extends FanLibrary {

    /** Scheme and host shared by the listing URL and the relative detail links. */
    private static final String HOST = "http://www.***.gov.cn"

    /** Number of the last listing page to crawl. */
    private static final int LAST_PAGE = 1060

    /**
     * Crawls listing pages 1 through {@code LAST_PAGE} in order, then calls
     * {@code testOver()}.
     *
     * To debug a single detail page, call {@code getInfo} directly with a
     * relative link such as
     * {@code "/eportal/ui?pageId=307900&t=toDetail&ZSBH=D311056737"}.
     */
    static void main(String[] args) {
        for (def i in 1..LAST_PAGE) {
            getPage(i)
        }
        testOver()
    }

    /**
     * Requests one listing page and processes every detail link found on it.
     *
     * @param page 1-based listing page number sent as {@code currentPage}
     * @return the response object of the listing request
     */
    static getPage(int page) {
        def url = HOST + "/eportal/ui?pageId=307900"
        def params = new JSONObject()
        // All filters are left empty so the listing is not narrowed down.
        params.put("filter_LIKE_QYMC", EMPTY)
        params.put("filter_LIKE_YYZZZCH", EMPTY)
        params.put("filter_LIKE_ZSBH", EMPTY)
        params.put("filter_LIKE_XXDZ", EMPTY)
        params.put("currentPage", page)
        params.put("pageSize", 15)
        params.put("OrderByField", EMPTY)
        params.put("OrderByDesc", EMPTY)
        def response = getHttpResponse(getHttpPost(url, params))
        def html = response.getString("content")
        // NOTE(review): the literal mixes a simplified and a traditional character;
        // confirm it matches the text the site actually serves.
        def rows = Regex.regexAll(html, "<td s.*?浏覽")
        // NOTE(review): index 0 is skipped, as in the original code; presumably the
        // first match is not a data row. Confirm against the page markup.
        for (int i = 1; i < rows.size(); i++) {
            def row = rows.get(i)
            // Undo the HTML entity encoding of '&' in the link ("&amp;" -> "&").
            def link = Regex.getRegex(row, "href=\".*?\"").replace("amp;", EMPTY)
            getInfo(link)
            // NOTE(review): the unit depends on which sleep() resolves here
            // (Groovy's built-in sleep takes milliseconds); confirm the intended delay.
            sleep(3)
        }
        return response;
    }

    /**
     * Downloads one detail page, extracts its fields and inserts them into the
     * {@code company} table. Any exception is reported through {@code output}
     * and the record is skipped; nothing is rethrown.
     *
     * @param url detail link relative to the site root
     */
    static getInfo(String url) {
        try {
            url = HOST + url;
            def response = getHttpResponse(getHttpGet(url))
            def content = response.getString("content")
            def all = Regex.regexAll(content, "<td class=\"label\".*?\n.*\n.*\n.*\n.*\n.*")

            // Cells 0..8 map, in order, to the columns
            // name, adress, money, sid, type, man, paper, level, gov.
            List<String> values = []
            for (int i = 0; i < 9; i++) {
                values.add(escapeSql(fieldValue(all.get(i))))
            }

            // Cell 9 holds the validity period as "start~end".
            def period = fieldValue(all.get(9)).split("~")
            values.add(escapeSql(period[0]))
            // A missing end date becomes an empty string instead of aborting the record.
            values.add(escapeSql(period.length > 1 ? period[1] : EMPTY))

            String sql = "INSERT INTO company (name,adress,money,sid,type,man,paper,level,gov,start,end) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\");"
            sql = String.format(sql, values.toArray())
            output(sql)
            MySqlTest.sendWork(sql)
        }
        catch (Exception e) {
            output(e)
        }
    }

    /**
     * Extracts the value part of one "label:value" table cell: tags, newlines and
     * spaces are removed, then everything after the first colon is returned.
     * A blank value (or a cell without a colon) yields {@code EMPTY} rather than
     * an exception, and colons inside the value are preserved.
     */
    private static String fieldValue(String cell) {
        def text = cell.replaceAll("<.*?>", EMPTY).replaceAll("(\n| )", EMPTY)
        def parts = text.split(":", 2)
        return parts.length > 1 ? parts[1] : EMPTY
    }

    /**
     * Escapes backslashes and double quotes so scraped text cannot terminate the
     * double-quoted SQL string literal it is embedded in.
     *
     * NOTE(review): this assumes the server uses MySQL's default backslash
     * escaping (NO_BACKSLASH_ESCAPES not set). If MySqlTest offers a
     * parameterized/prepared-statement API, prefer that over string building.
     */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"")
    }
}
第一頁的網頁結構如下:

第二頁詳情頁結構如下: