作業要求來自:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002
0.從新聞url擷取點選次數,並整理成函數
- newsUrl
- newsId(re.search())
- clickUrl(str.format())
- requests.get(clickUrl)
- re.search()/.split()
- str.lstrip(),str.rstrip()
- int
- 整理成函數
- 擷取新聞釋出時間及類型轉換也整理成函數
def click(url):
    """Return the click (hit) count of the news article at *url*.

    The article id is the run of digits directly before ``.html`` in *url*.
    The id is sent to the counter API and the number found in the
    ``hits').html('N')`` fragment of the response body is returned as an int.

    Raises:
        ValueError: if *url* contains no ``/<digits>.html`` part, or the
            counter response contains no ``hits`` value.
    """
    idMatch = re.search(r'/(\d+)\.html', url)
    if idMatch is None:
        raise ValueError('no news id found in url: {}'.format(url))
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(idMatch.group(1))
    resClick = requests.get(clickUrl)
    # re.search needs the response body (a str), not the Response object.
    hitsMatch = re.search(r"hits'\)\.html\('(\d+)'\)", resClick.text)
    if hitsMatch is None:
        raise ValueError('no hit count found in response from: {}'.format(clickUrl))
    # group(1) is the captured digits; groups() would return a tuple.
    return int(hitsMatch.group(1))
def newsdt(showinfo):
    """Parse the publication date/time out of an article's info line.

    Args:
        showinfo: whitespace-separated text whose first token has the form
            ``<label>:YYYY-MM-DD`` and whose second token is ``HH:MM:SS``.

    Returns:
        A ``datetime`` built from those two tokens.

    Raises:
        IndexError: if the expected tokens or the ``:`` separator are missing.
        ValueError: if the extracted text is not a valid date/time.
    """
    tokens = showinfo.split()
    # The first token carries a label before the date; keep what follows ':'.
    newsDate = tokens[0].split(':')[1]
    newsTime = tokens[1]
    return datetime.strptime(newsDate + ' ' + newsTime, '%Y-%m-%d %H:%M:%S')
1.從新聞url擷取新聞詳情: 字典,anews
def anews(url):
    """Fetch one news article page and return its details as a dict.

    The returned dict has three keys:
        'newsTitle': text of the first ``.show-title`` element,
        'newsDT':    result of ``newsdt`` on the first ``.show-info`` text,
        'newsClick': result of ``click(url)``.

    Raises:
        IndexError: if the page has no ``.show-title`` or ``.show-info``
            element.
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')

    newsDetail = {}
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail
2.從清單頁的url擷取新聞url:清單append(字典) alist
def alist(url):
    """Collect the details of every news item linked from the list page *url*.

    Returns:
        A list of dicts, one per news item: the dict built by ``anews`` for
        the item's link, extended with 'newsUrl' (the link) and
        'description' (text of the item's ``.news-list-description``).
    """
    # Fetch the page passed in by the caller rather than a module-level name.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        # Only <li> items containing a .news-list-title are news entries.
        if not news.select('.news-list-title'):
            continue
        newsUrl = news.select('a')[0]['href']
        newsDest = news.select('.news-list-description')[0].text
        newsDict = anews(newsUrl)
        newsDict['newsUrl'] = newsUrl
        newsDict['description'] = newsDest
        newsList.append(newsDict)
    return newsList
# Example run: crawl the news items linked from the section's front list page.
listUrl= 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)
3.生成所有清單頁的url並擷取全部新聞 :清單extend(清單) allnews
*每個同學爬學號尾數開始的10個清單頁
# Accumulate the news dicts of list pages 16 through 25 into one flat list.
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    # extend (not append) so allnews stays a flat list of dicts.
    allnews.extend(alist(listUrl))
4.設定合理的爬取間隔
import time
import random

# Sleep for a random interval in [0, 3) seconds.
time.sleep(random.random()*3)

# The same pause repeated five times, as it would sit between requests.
for i in range(5):
    time.sleep(random.random()*3)
5.用pandas做簡單的資料處理並儲存
儲存到csv或excel檔案
# A list of dicts can be wrapped as a Series; the result is not kept here.
pd.Series(allnews)

# Build the table first: one row per news dict, one column per key.
newsdf = pd.DataFrame(allnews)

# to_csv is a method and must be called; assigning a path to it writes nothing.
newsdf.to_csv(r'F:\duym\爬蟲\gzccnews.csv')
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
運作結果:

6.完整代碼:
1 import requests
2 from bs4 import BeautifulSoup
3 from datetime import datetime
4 import re
5 import pandas as pd
6 import time
7 import random
8
def click(url):
    """Return the click (hit) count of the news article at *url*.

    The article id is the run of digits directly before ``.html`` in *url*.
    The id is sent to the counter API; the count is read from whatever
    follows the last ``.html`` in the response body, after stripping the
    surrounding ``('`` and ``');`` characters.

    Raises:
        ValueError: if *url* contains no ``/<digits>.html`` part, or the
            stripped response tail is not an integer.
    """
    idMatch = re.search(r'/(\d+)\.html', url)
    if idMatch is None:
        raise ValueError('no news id found in url: {}'.format(url))
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(idMatch.group(1))
    resClick = requests.get(clickUrl)
    # Tail after the last ".html" looks like ('N'); -- peel the wrapping off.
    hitsText = resClick.text.split('.html')[-1].lstrip("('").rstrip("');")
    return int(hitsText)

def newsdt(showinfo):
    """Parse the publication date/time out of an article's info line.

    Args:
        showinfo: whitespace-separated text whose first token has the form
            ``<label>:YYYY-MM-DD`` and whose second token is ``HH:MM:SS``.

    Returns:
        A ``datetime`` built from those two tokens.

    Raises:
        IndexError: if the expected tokens or the ``:`` separator are missing.
        ValueError: if the extracted text is not a valid date/time.
    """
    tokens = showinfo.split()
    # The first token carries a label before the date; keep what follows ':'.
    newsDate = tokens[0].split(':')[1]
    newsTime = tokens[1]
    return datetime.strptime(newsDate + ' ' + newsTime, '%Y-%m-%d %H:%M:%S')

def anews(url):
    """Fetch one news article page and return its details as a dict.

    The returned dict has three keys:
        'newsTitle': text of the first ``.show-title`` element,
        'newsDT':    result of ``newsdt`` on the first ``.show-info`` text,
        'newsClick': result of ``click(url)``.

    Raises:
        IndexError: if the page has no ``.show-title`` or ``.show-info``
            element.
    """
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')

    newsDetail = {}
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail

def alist(url):
    """Collect the details of every news item linked from the list page *url*.

    Returns:
        A list of dicts, one per news item: the dict built by ``anews`` for
        the item's link, extended with 'newsUrl' (the link) and
        'description' (text of the item's ``.news-list-description``).
    """
    # Fetch the page passed in by the caller rather than a module-level name.
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        # Only <li> items containing a .news-list-title are news entries.
        if not news.select('.news-list-title'):
            continue
        newsUrl = news.select('a')[0]['href']
        newsDest = news.select('.news-list-description')[0].text
        newsDict = anews(newsUrl)
        newsDict['newsUrl'] = newsUrl
        newsDict['description'] = newsDest
        newsList.append(newsDict)
    return newsList

listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# Read the number shown after '..' in the pager of the front list page.
# NOTE(review): pageCount is informational only -- the crawl below uses the
# fixed page range 16-25 and does not read it. Confirm the rstrip characters
# match the pager text served by the site.
res = requests.get(listUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
pageCount = int(soup.select('#pages')[0].text.split('..')[1].rstrip(' 下一頁 '))

# Crawl list pages 16 through 25 and gather every news dict into one list.
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    # Pause a random 0-3 seconds between list pages so the requests are
    # spaced out; sleeping only after the crawl would throttle nothing.
    time.sleep(random.random() * 3)

# One row per news dict, one column per key; save and show the table.
newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
print(newsdf)
View Code