0. Extract the click count from a news URL, then wrap the steps in a function
- newsUrl
- newsId (re.search())
- clickUrl (str.format())
- requests.get(clickUrl)
- re.search() / .split()
- str.lstrip(), str.rstrip()
- int
- wrap the above into a function (a sketch follows this list)
- also wrap extracting the news publication time, plus its type conversion, into a function
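A minimal sketch of step 0 under two assumptions not stated above: the count API returns a fragment like $('#hits').html('5221');, and the helper name getClickCount is invented for illustration.

import re
import requests

def getClickCount(newsUrl):  # hypothetical helper name
    newsId = re.search(r'(\d+)\.html', newsUrl).group(1)  # digits just before .html
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resText = requests.get(clickUrl).text  # assumed shape: "...$('#hits').html('5221');"
    # variant 1: a regex capture
    count = re.search(r"\.html\('(\d+)'\)", resText).group(1)
    # variant 2: split plus lstrip/rstrip, the other tools listed above
    count = resText.split('.html')[-1].lstrip("('").rstrip("');")
    return int(count)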
1. Extract the news details from a news URL: a dict, anews
2. Extract the news URLs from a list-page URL: list.append(dict), alist
3. Generate the URLs of all list pages and fetch every news item: list.extend(list), allnews (append vs extend is illustrated after this list)
* Each student crawls the 10 list pages starting from the last digit of their student ID.
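Steps 2 and 3 differ only in how the list grows: append adds one dict as a single element, while extend splices in every element of another list. A toy illustration:

alist = []
alist.append({'newsTitle': 't1'})      # alist is now [{'newsTitle': 't1'}]
allnews = []
allnews.extend(alist)                  # splice in each element of alist
allnews.extend([{'newsTitle': 't2'}])  # two flat dicts, not nested lists
print(len(allnews))                    # 2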
4. Set a reasonable crawl interval
import time
import random
time.sleep(random.random()*3)
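Note that random.random()*3 can produce a pause close to zero; if you want a guaranteed minimum delay, random.uniform gives a bounded interval (an optional variant, not part of the original code):

time.sleep(random.uniform(1, 3))  # always pauses between 1 and 3 seconds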
5. Do simple data processing with pandas and save the result
Save to a csv or excel file
newsdf.to_csv(r'F:\duym\爬蟲\gzccnews.csv')
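A minimal sketch of step 5, assuming allnews is the list of dicts built in step 3; encoding='utf_8_sig' is an optional extra so Excel displays Chinese text correctly, and to_excel needs the openpyxl package installed:

import pandas as pd

newsdf = pd.DataFrame(allnews)  # one row per news dict
newsdf.to_csv(r'F:\duym\爬蟲\gzccnews.csv', encoding='utf_8_sig', index=False)
newsdf.to_excel(r'F:\duym\爬蟲\gzccnews.xlsx', index=False)  # excel alternative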
6. Code example
import re
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import pandas as pd
import time
import random
"""新聞點選次數"""
def newsClick(newsUrl):
newsId = re.findall('(\d+)', newsUrl)[-1]
clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
resClicks = requests.get(clickUrl).text
resClick = int(re.search("hits'[)].html[(]'(\d*)'[)]", resClicks).groups(0)[0])
return resClick
"""新聞釋出時間"""
def newsDateTime(showinfo):
newsDate = showinfo.split()[0].split(':')[1]
newsTime = showinfo.split()[1]
newsDateTime = newsDate + ' ' + newsTime
dateTime = datetime.strptime(newsDateTime, '%Y-%m-%d %H:%M:%S') #類型轉換
return dateTime
"""新聞字典"""
def newsDicts(newsUrl):
newsText = requests.get(newsUrl)
newsText.encoding = 'utf-8'
newsSoup = BeautifulSoup(newsText.text, 'html.parser')
newsDict = {}
newsDict['newsTitle'] = newsSoup.select('.show-title')[0].text
showinfo = newsSoup.select('.show-info')[0].text
newsDict['newsDateTime'] = newsDateTime(showinfo)
newsDict['newsClick'] = newsClick(newsUrl)
return newsDict
"""新聞清單"""
def newsList(newsUrl):
newsText = requests.get(newsUrl)
newsText.encoding = 'utf-8'
newsSoup = BeautifulSoup(newsText.text, 'html.parser')
newsList = []
for news in newsSoup.select('li'):
if len(news.select('.news-list-title')) > 0:
url = news.select('a')[0]['href']
newsDesc = news.select('.news-list-description')[0].text
newsDict = newsDicts(url)
newsDict['newsUrl'] = url
newsDict['description'] = newsDesc
newsList.append(newsDict)
return newsList
"""27-37頁新聞清單"""
def allNews():
allnews = []
for i in range(34,44):
newsUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
allnews.extend(newsList(newsUrl))
time.sleep(random.random() * 3) #爬取間隔
return allnews
newsDF = pd.DataFrame(allNews())
newsDF.to_csv('gzccnews.csv') #儲存為csv檔案
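To check the saved file and do the simple processing mentioned in step 5, a short sketch (column names follow the dict keys used above):

df = pd.read_csv('gzccnews.csv')
print(df.sort_values('newsClick', ascending=False).head(10))  # ten most-clicked items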
