The assignment requirements come from: https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002
0. Get the click count from a news URL, and wrap it in a function
- newsUrl
- newsId (re.search())
- clickUrl (str.format())
- requests.get(clickUrl)
- re.search()/.split()
- str.lstrip(), str.rstrip()
- int
- wrap it in a function
- also wrap getting the news publication time, and its type conversion, in a function
def click(url):
    # extract the numeric news id from a URL like .../1234.html
    newsId = re.search(r'/(\d+)\.html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    # the counter API returns text like "...html('1234');" -- peel off the wrapper
    newsClick = int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newsClick
def newsdt(showinfo):
    # showinfo looks like "发布时间:2019-04-01 11:02:03 作者:..."
    newsDate = showinfo.split()[0].split(':')[1]
    newsTime = showinfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt
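To see what the two helpers do, here is a minimal check on made-up sample strings; the counter-response shape and the show-info text are only assumptions based on the parsing code above:

# assumed shape of the counter API response
sample = "$('#hits').html('1234');"
print(int(sample.split('.html')[-1].lstrip("('").rstrip("');")))   # -> 1234
# assumed shape of the .show-info text on an article page
info = '发布时间:2019-04-01 11:02:03 作者:x 审核:x 来源:x'
print(newsdt(info))   # -> 2019-04-01 11:02:03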
1. Get the news details from a news URL: a dict, anews
def anews(url):
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)   # publication time as datetime
    newsDetail['newsClick'] = click(url)      # click count from the counter API
    return newsDetail
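A quick single-article check; the URL below is hypothetical and only illustrates the usual /<id>.html shape of an article link:

print(anews('http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11089.html'))
# -> {'newsTitle': '...', 'newsDT': datetime(...), 'newsClick': ...}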
2. Get news URLs from a list-page URL: list.append(dict), alist
def alist(url):
    res = requests.get(url)   # was requests.get(listUrl); use the parameter
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDest = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['newsUrl'] = newsUrl
            newsDict['description'] = newsDest
            newsList.append(newsDict)
    return newsList
listUrl= 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)
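Each element of the returned list is the dict built by anews, extended with two extra keys, so every record carries newsTitle, newsDT, newsClick, newsUrl and description.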
3. Generate the URLs of all list pages and fetch all the news: list.extend(list), allnews
*Each student crawls the 10 list pages starting from the tail digits of their own student ID (a sketch of deriving the range follows the loop below).
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
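A minimal sketch of deriving the page range from a student ID; student_id is a made-up example, the assignment only fixes the rule (10 pages starting at the tail digits of the ID):

student_id = '201606120016'    # hypothetical student number
start = int(student_id[-2:])   # last two digits -> first list page, 16 here
pages = range(start, start + 10)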
4. Set a reasonable crawl interval
import time
import random
time.sleep(random.random() * 3)   # one random 0-3 s pause
for i in range(5):
    time.sleep(random.random() * 3)
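A single sleep before (or after) the whole crawl does not space anything out; for the interval to matter, the pause has to sit inside the loop that fires the requests, e.g.:

for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    time.sleep(random.random() * 3)   # random 0-3 s pause between list pages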
5. Do simple data processing with pandas and save the result
Save to a csv or excel file
newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
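To verify the saved file, it can be read straight back with pandas (if Excel shows garbled Chinese text, passing encoding='utf_8_sig' to to_csv is a common workaround, though that is an extra assumption, not part of the assignment):

check = pd.read_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
print(check.head())   # first few saved records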
Run result:

6. Complete code:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as pd
import time
import random

def click(url):
    # extract the numeric news id from a URL like .../1234.html
    newsId = re.search(r'/(\d+)\.html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    # the counter API returns text like "...html('1234');" -- peel off the wrapper
    newsClick = int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newsClick

def newsdt(showinfo):
    # showinfo looks like "发布时间:2019-04-01 11:02:03 作者:..."
    newsDate = showinfo.split()[0].split(':')[1]
    newsTime = showinfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt

def anews(url):
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail

def alist(url):
    res = requests.get(url)   # was requests.get(listUrl); use the parameter
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDest = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['newsUrl'] = newsUrl
            newsDict['description'] = newsDest
            newsList.append(newsDict)
    return newsList

listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)

# read the total page count from the pager (informational; the loop below uses a fixed range)
res = requests.get(listUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
pageCount = int(soup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    time.sleep(random.random() * 3)   # random 0-3 s pause between list pages

newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
print(newsdf)