天天看點

Python清單處理網頁表格資料

涉及知識點

  1. 正規表達式re
  2. 清單處理

代碼如下

import requests
import sys, io
import re

# Address of the page whose HTML table is scraped.
url = "http://www.nifdc.org.cn/CL0903/11390.html"
# Browser-like request headers; Referer is set to an empty string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
'Referer':'',
}

# Download the page as raw bytes (decoding happens later in the script).
# A timeout is passed because requests waits indefinitely by default, and
# raise_for_status() aborts on a 4xx/5xx answer instead of letting an error
# page flow into the table parsing below.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
html_doc = response.content

# Work around console encoding problems: replace sys.stdout with a text
# wrapper over the same underlying byte buffer that always encodes as UTF-8.
sys.stdout = io.TextIOWrapper(buffer=sys.stdout.buffer, encoding="utf-8")
# Turn the downloaded bytes into text (gb18030) so Chinese characters are
# readable.
html_doc = str(html_doc, "gb18030")
# print(html_doc)

# 分别擷取批号和生産企業
# pat_num = re.compile(r">((B|C)?2018[A-Za-z0-9]{5,6})<")
# # |R01[0-9]{4}|P3K7[1,2]1M|YHBV[A-Za-z0-9]{5,6}
# lst_num = pat_num.findall(html_doc)
# # lst_num = re.search(r">((B|C)?2018[A-Za-z0-9]{5,6})<", html_doc).group(1)

# print(lst_num)
# Every table cell of interest is a <span> with exactly this inline style;
# the single capture group is the span's inner text (non-greedy, one line).
# NOTE(review): the font name in the pattern is written in traditional
# characters - confirm it matches the page markup exactly, otherwise no
# cell is captured.
pat = re.compile(r'<span style="font-family:宋體;color:black;font-size:12px">(.+?)</span>')
# Flat list of cell texts in document order.
lst = [match.group(1) for match in pat.finditer(html_doc)]
# print(len(lst))
# print('table_data:\n', lst)
# print('*' * 120)
# For each serial number 1..150 (as text), record the position of its FIRST
# occurrence in the flat cell list. list.index returns the earliest match,
# so a number that also appears earlier as ordinary cell text yields that
# earlier position; it raises ValueError if the number is absent.
serial_no_lst = [lst.index(str(number)) for number in range(1, 151)]
# print(len(serial_no_lst))
# print(serial_no_lst)
# print('*' * 120)
# print(lst.count("中國食品藥品檢定研究院"))
# print(lst.index("中國食品藥品檢定研究院"))
# Walk every adjacent pair of recorded positions to look for entries that
# are not in strictly ascending order. Both branches are intentionally
# empty (reporting is disabled), so the loop only visits the pairs; it
# raises IndexError if fewer than 150 positions were recorded.
count = 0
while count < 149:
	if serial_no_lst[count] < serial_no_lst[count + 1]:
		# Pair is in ascending order.
		pass
	else:
		# Pair is out of order (equal or descending).
		pass
	count += 1
# print('4th: ', serial_no_lst[4])
# print('95th: ', serial_no_lst[95])
# print('*' * 120)
# lst_ck = []
# for item in serial_no_lst:
# 	elem = lst[item]
# 	lst_ck.append(elem)
# print('lst_ck--%s:\n%s' % (len(lst_ck),lst_ck))
# Work on a copy so the originally detected positions stay available.
serial_no_update_lst = list(serial_no_lst)
# Overwrite two entries with positions interpolated from their neighbours
# (presumably the ones found to be out of order - confirm against the page).
# The values are list indices, hence non-negative, so floor division gives
# the same result as truncating the true quotient.
# Entry 4: midpoint of entries 3 and 5.
serial_no_update_lst[4] = (serial_no_update_lst[3] + serial_no_update_lst[5]) // 2
# Entry 95: midpoint of entries 94 and 96, shifted one position further.
serial_no_update_lst[95] = (serial_no_update_lst[94] + serial_no_update_lst[96]) // 2 + 1
# print('序号1-150所在位置:\n', serial_no_update_lst)
# print('4th: ', serial_no_update_lst[4])
# print('95th: ', serial_no_update_lst[95])
# print('*' * 120)
# print('lst[81]:', lst[81])
# print('lst[1594]:', lst[1594])
print('*' * 120)
# Cut the flat cell list into table rows, using the serial-number positions
# in serial_no_update_lst as row boundaries.
# Everything before the first serial number is the header row.
title_row = lst[:serial_no_update_lst[0]]
table_row_lst = [title_row]
# Rows 1..149: cells from one serial-number position up to the next one.
row_starts = serial_no_update_lst[:149]
row_stops = serial_no_update_lst[1:150]
table_row_lst.extend(lst[start:stop] for start, stop in zip(row_starts, row_stops))
# Row 150: from the last serial-number position to the end of the list.
last_row = lst[serial_no_update_lst[149]:]
table_row_lst.append(last_row)
print('table_row_lst:\n', table_row_lst)