网址 https://movie.douban.com/top250
一共250部电影,有分页,获取每一部的详细信息
不采用框架,使用 urllib 读取网页,re 进行正则表达式匹配,lxml 进行 xpath 查找
1 from film import *
2 from urllib import request
3 import time,re
# Crawl the Douban Top 250 list: 10 list pages of 25 films each. For every
# film found on a list page, fetch its detail page and print a summary line.

BASE_URL = r'https://movie.douban.com/top250?start='
PAGE_SIZE = 25
PAGE_COUNT = 10
# Pause (seconds) after every film and after every list page.
REQUEST_DELAY = 3

HEADERS = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Connection': 'keep-alive'
}

# One list entry looks like:
#   <em class="">RANK</em> <a href="https://movie.douban.com/subject/ID/">
# Group 1 captures the rank, group 2 the detail-page URL.
ENTRY_PATTERN = re.compile(
    r'<em\sclass="">(\d+)</em>\s*'
    r'<a\shref="(https://movie\.douban\.com/subject/\d+/)">'
)


def page_url(page_index):
    """Return the list-page URL for the zero-based ``page_index``.

    The URL is always built from the constant base. The original code
    appended the offset to the same variable on every iteration, producing
    ``start=025``, ``start=02550``, ... for every page after the first.
    """
    return BASE_URL + str(page_index * PAGE_SIZE)


def parse_entries(page):
    """Return ``(rank, detail_url)`` string tuples found in list-page HTML.

    Returns an empty list when no entry matches.
    """
    return ENTRY_PATTERN.findall(page)


def fetch(url):
    """Download ``url`` with the browser-like headers; return UTF-8 text."""
    req = request.Request(url, headers=HEADERS)
    # The context manager closes the HTTP response once it has been read.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')


def main():
    """Walk every list page and print the details of each film on it."""
    for i in range(PAGE_COUNT):
        url = page_url(i)
        print(url)
        for no, rurl in parse_entries(fetch(url)):
            # The remaining fields are filled in by getall().
            filma = film(no, rurl, '', '', '', '', '', '')
            filma.getall()
            filma.detail()
            time.sleep(REQUEST_DELAY)
        time.sleep(REQUEST_DELAY)


if __name__ == '__main__':
    main()
film.py 如果要做数据的持久化,在这里实现
1 from urllib import request
2 from lxml import etree
class film:
    """One film from the list, identified by its rank and detail-page URL.

    The descriptive fields are whatever the caller passes to the
    constructor; ``getall()`` replaces them with values scraped from the
    detail page.
    """

    def __init__(self, no, url, name, year, score, director, classification, actor):
        """Store the rank ``no``, the detail-page ``url`` and the descriptive fields."""
        self.name = name
        self.year = year
        self.score = score
        self.director = director
        self.classification = classification
        self.actor = actor
        self.url = url
        self.no = no

    @staticmethod
    def _as_text(value):
        """Render a field for display.

        ``getall()`` stores XPath results, which are lists; join their
        stripped items with '/' so the summary does not show a Python list
        repr. Any other value is passed through ``str``.
        """
        if isinstance(value, (list, tuple)):
            return '/'.join(str(part).strip() for part in value)
        return str(value)

    def __str__(self):
        """Return the one-line summary that ``detail()`` prints."""
        text = self._as_text
        return "No:%s;url:%s;片名:%s;年份:%s;分数:%s;导演:%s;分级:%s;演员:%s;" % (
            self.no,
            self.url,
            text(self.name),
            text(self.year),
            text(self.score),
            text(self.director),
            text(self.classification),
            text(self.actor),
        )

    def detail(self):
        """Print the one-line summary of this film."""
        print(str(self))

    def getall(self):
        """Fetch ``self.url`` and fill the descriptive fields from the page.

        Each field is set to the result of an XPath query on the parsed
        page (a list of text nodes, empty when the path matches nothing).
        Errors raised by the HTTP request or by decoding are not caught.
        """
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Connection': 'keep-alive'
        }
        req = request.Request(self.url, headers=headers)
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(req) as response:
            page = response.read().decode('utf-8')
        # The leftover debug print of the whole HTML page was removed here.
        selector = etree.HTML(page)
        self.name = selector.xpath('/html/body/div[3]/div[1]/h1/span[1]/text()')
        self.year = selector.xpath('//*[@id="content"]/h1/span[2]/text()')
        self.score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
        self.director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
        self.classification = selector.xpath('//*[@id="info"]/span[5]/text()')
        self.actor = selector.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
34
35
