國家統計局網站相關分級頁面截圖
使用 Python 爬蟲爬取國家統計局網站的【統計用區劃和城鄉劃分代碼】,並存入 MySQL 資料庫

基本思路
爬取每個頁面的 a 標籤內容:先生成省、市兩級的資料字典(名稱 → 連結),再據此拼接出各市對應的區縣頁面連結,爬取第三層(區縣)的區劃代碼和名稱,最後結合省、市兩級名稱,生成「省·市·區縣」形式的標準名稱。
代碼
1 import pymysql
2 from bs4 import BeautifulSoup
3 import re
4 import requests
5 import lxml
6 import traceback
7 import time
8 import json
9 from lxml import etree
10
# Suffixes appended to abbreviated province names so that every record carries
# the full official name.
# NOTE(review): the site is served in simplified Chinese, so these
# traditional-character keys look like a transcoding artifact of the listing;
# confirm against the live page before relying on this table.
_PROVINCE_SUFFIXES = {
    "北京": "市",
    "天津": "市",
    "上海": "市",
    "重慶": "市",
    "甯夏": "回族自治區",
    "西藏": "自治區",
    "内蒙古": "自治區",
    "新疆": "維吾爾自治區",
    "廣西": "壯族自治區",
    "黑龍江": "省",
}


def _full_province_name(name):
    """Return *name* expanded to its full form (e.g. with a trailing "省")."""
    suffix = _PROVINCE_SUFFIXES.get(name)
    if suffix is not None:
        return name + suffix
    # Any other two-character name is treated as an abbreviated province.
    if len(name) == 2:
        return name + "省"
    return name


def _fetch_soup(url, headers, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup's lxml parser."""
    # requests.get's second positional parameter is ``params``; the headers
    # must be passed by keyword or they end up in the query string instead.
    response = requests.get(url, headers=headers, timeout=timeout)
    # The pages are decoded as GBK, as in the original implementation.
    response.encoding = 'GBK'
    return BeautifulSoup(response.text, 'lxml')


def get_area(year):
    """Scrape the county-level division codes published for *year*.

    The crawl walks three levels of pages: the index page (``provincetr``
    rows), one page per province (``citytr`` rows) and one page per city
    (``countytr`` rows).

    Args:
        year: Edition to scrape; converted with ``str()`` and used as a
            path segment of the URL.

    Returns:
        A list with one inner list per city page, in crawl order. Each inner
        list holds single-entry dicts mapping the first six characters of
        the division code to ``"province·city·county"``. A city page without
        ``countytr`` rows contributes an empty inner list.

    Raises:
        requests.RequestException: If a page cannot be downloaded (including
            a timeout).
    """
    year = str(year)
    base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }

    # Level 1: province name -> relative link, e.g. {"北京市": "11.html"}.
    index_url = base_url + "index.html"
    print(index_url)
    index_soup = _fetch_soup(index_url, headers)
    provinces = {}
    for row in index_soup.find_all('tr', class_='provincetr'):
        for link in row.find_all('a', href=True):
            provinces[link.text] = link['href']

    # Level 2: (full province name, city name, relative link) per city row.
    cities = []
    for province_name, province_href in provinces.items():
        full_name = _full_province_name(province_name)
        province_soup = _fetch_soup(base_url + province_href, headers)
        for row in province_soup.find_all('tr', class_='citytr'):
            cells = row.find_all('td')
            # The second cell carries the linked city name.
            link = cells[1].find('a', href=True) if len(cells) > 1 else None
            if link is None:
                continue
            cities.append((full_name, link.text, link['href']))
    print(len(provinces))

    # Level 3: county rows of every city page.
    area_list = []
    for province_name, city_name, city_href in cities:
        area_url = base_url + city_href
        print(area_url)
        area_soup = _fetch_soup(area_url, headers)
        city_areas = []
        for row in area_soup.find_all('tr', class_='countytr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue
            # Cells may or may not wrap their text in <a>; get_text covers
            # both layouts without exception-driven fallbacks.
            code = cells[0].get_text(strip=True)[0:6]
            area_name = cells[1].get_text(strip=True)
            city_areas.append(
                {code: province_name + "·" + city_name + "·" + area_name}
            )
        area_list.append(city_areas)
        # Pause between city pages to avoid hammering the server.
        time.sleep(1)
    return area_list
159
def into_mysql(year):
    """Scrape the division codes for *year* and insert them into ``std_area``.

    Every county of every city returned by ``get_area`` is inserted; cities
    without county rows are simply skipped. All rows are written in a single
    transaction: on a database error the transaction is rolled back, the
    error is reported on stdout/stderr and the function still returns.

    Args:
        year: Edition to load; converted with ``str()`` and stored in the
            ``year`` column.

    Returns:
        None.
    """
    year = str(year)
    # Scrape first so the database connection is not held open (idle) for
    # the whole duration of the crawl.
    res = get_area(year)

    rows = []
    for city_areas in res:
        for entry in city_areas:
            for code, name in entry.items():
                rows.append((year, code, name))

    # Placeholders let the driver escape the values instead of splicing
    # scraped text into the statement.
    sql = "insert into std_area (year,area_code, area_name) values (%s,%s,%s)"
    conn, cursor = get_mysql_conn()
    try:
        cursor.executemany(sql, rows)
        conn.commit()
        print(len(rows))
    except pymysql.MySQLError:
        conn.rollback()
        print("出現錯誤")
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return None
178
def query(sql, *args):
    """Run a query on a fresh connection and return all rows.

    Args:
        sql: Statement to execute; may contain ``%s`` placeholders.
        *args: Values bound to the placeholders. When no values are given
            the statement is executed verbatim (no ``%`` formatting), so a
            literal ``%`` in *sql* keeps working.

    Returns:
        The result of ``cursor.fetchall()``.
    """
    conn, cursor = get_mysql_conn()
    try:
        print(sql)
        cursor.execute(sql, args or None)
        return cursor.fetchall()
    finally:
        # Release the connection even when execute/fetch raises.
        close_conn(conn, cursor)
192 """
193 ------------------------------------------------------------------------------------
194 """
def get_mysql_conn(host="127.0.0.1", user="root", password="000429",
                   database="data_cleaning", charset="utf8"):
    """Open a MySQL connection and a cursor on it.

    Called without arguments it connects exactly as before; the keyword
    parameters allow pointing the script at another server or schema.

    NOTE(review): the default password is hard-coded in source; prefer
    passing it in from configuration or the environment.

    Args:
        host: MySQL server host.
        user: Login user.
        password: Login password.
        database: Schema to use.
        charset: Connection character set.

    Returns:
        A ``(connection, cursor)`` tuple.
    """
    # Open the connection.
    conn = pymysql.connect(host=host,
                           user=user,
                           password=password,
                           database=database,
                           charset=charset)
    # Create a cursor on it.
    cursor = conn.cursor()
    return conn, cursor
208
def close_conn(conn, cursor):
    """Close *cursor* and then *conn*, skipping whichever is falsy.

    The connection is closed even if closing the cursor raises; in that
    case the cursor's exception still propagates to the caller.
    """
    try:
        if cursor:
            cursor.close()
    finally:
        if conn:
            conn.close()
if __name__ == '__main__':
    # Scrape the 2009 edition of the division codes and load it into MySQL.
    into_mysql('2009')