打開中國天氣首頁,選中某個城市,例如廣州。
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiYTMfhHLlN3XnxCM38FdsYkRGZkRG9lcvx2bjxCMy8VZ6l2cs0za65keJR0T1MmeNl3ZU9UQClGVF5UMR9Fd4VGdsATNfd3bkFGazxycykFaKdkYzZUbapXNXlleSdVY2pESa9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zRQpkLhdjYhR2YkBTZjBzYjRmN5QDOzQTOxYWZ4MWZ3gjZ0UzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)
進入城市天氣預報詳細頁面,選擇“7天”。
分析一周天氣預報的結構體系,查找一周天氣預報的根節點。根節點是id="7d"的div標籤。
解析每日天氣預報清單的節點細節。每日天氣預報清單是一個包含多個li標籤、class="t clearfix"的ul標籤。每日天氣預報包含的有效資訊為:日期、天氣、溫度、風向等4個基本要素。
由於中國天氣城市天氣預報的網頁是靜態頁面且頁面資料不是JavaScript處理生成的,擷取該頁面的源碼不算複雜。總體上,步驟相對簡單。使用urllib庫來擷取城市7天天氣預報的網頁源碼,通過scrapy的Selector和XPath解析每日天氣預報清單中的節點,提取天氣預報的4個基本要素。天氣預報的基本要素被包含在簡單的HTML標籤之中,只需將標籤替換掉即可。
最終的代碼如下:
"""
@author: MR.N
@created: 2021-08-22 12:30 AM Sun.
"""
import gzip
import ssl
import urllib
import urllib.request

from scrapy import Selector
# 7-day forecast page for Guangzhou on weather.com.cn (city code 101280101).
GUANG_ZHOU = 'http://www.weather.com.cn/weather/101280101.shtml'

# Browser-like request headers so the site serves the regular page.
# Accept-Encoding advertises ONLY gzip: the response body is decompressed
# with gzip.decompress(), which cannot handle brotli ('br') or deflate —
# advertising those (as the original did) could yield an undecodable body.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36',
    'Accept-Encoding': 'gzip',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Site': 'none',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive'
}
def get_local_weather(timeout=3):
    """Fetch the 7-day Guangzhou forecast from weather.com.cn and format it.

    Each forecast day becomes one line of the form
    "[<date>] <weather> <temperature> <wind>"; days are joined by newlines.

    :param timeout: socket timeout in seconds for the HTTP request
                    (the original ignored this and hard-coded 3).
    :return: the formatted multi-line forecast, or '' when the page could
             not be fetched or parsed.
    """
    # NOTE(review): this disables certificate verification process-wide.
    # The target URL is plain http, so it only matters on an https redirect.
    ssl._create_default_https_context = ssl._create_unverified_context

    req = urllib.request.Request(GUANG_ZHOU, headers=headers)
    opener = urllib.request.build_opener()
    res = opener.open(req, timeout=timeout)  # was: hard-coded timeout=3
    try:
        if res.getcode() != 200:
            print(str(res.getcode()))
            return ''
        raw = res.read()
    finally:
        res.close()  # the original never closed the response (leak)

    # The server may ignore Accept-Encoding and send an uncompressed body;
    # only gunzip when the gzip magic bytes are actually present.
    if raw.startswith(b'\x1f\x8b'):
        raw = gzip.decompress(raw)
    html = raw.decode('UTF-8', errors='strict')
    if len(html) < 100:
        print('no data', html)
        return ''

    sel = Selector(text=html)
    days = sel.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li')
    if not days:
        print('no match')
        return ''

    print('[data]', len(days))
    lines = []
    for day in days:
        # Relative XPath on each <li> selector replaces the original's
        # re-parse via Selector(text=group) for every list entry.
        date_str = day.xpath('.//h1/text()').get() or ''
        wea_str = day.xpath('.//p[@class="wea"]/text()').get() or ''
        # Collect the temperature text nodes directly instead of stripping
        # tags with chained str.replace() calls on the raw markup.
        temp_str = ''.join(
            part.strip()
            for part in day.xpath('.//p[@class="tem"]//text()').getall()
        )
        # .get() may be None when a day lacks a wind entry; the original
        # crashed with AttributeError on .replace() in that case.
        win_str = (day.xpath('.//p[@class="win"]/i/text()').get() or '').replace('<', '')
        lines.append('[' + date_str + '] ' + wea_str + ' ' + temp_str + ' ' + win_str)
    return '\n'.join(lines)