#coding=utf8
__author__ = 'Administrator'
import os
import re
import urllib.request
import pymysql
class Spider:
    """Fetch a page, extract regex matches from it, and persist the results.

    Attributes:
        url: Address of the page to download.
        retext: Regular-expression source applied to the decoded page.
        path: Directory that receives ``url.txt`` and downloaded images.
    """

    # Page initialisation
    def __init__(self, url, retext, path):
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create ``path`` (including missing parents) and return it unchanged."""
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(path, exist_ok=True)
        return path

    def getData(self):
        """Download ``self.url`` and return every match of ``self.retext``.

        Returns:
            The list produced by ``re.findall`` on the page decoded as UTF-8
            (undecodable bytes are dropped).
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        print(html)
        html = html.decode('utf-8', 'ignore')
        imgRe = re.compile(self.retext)
        return imgRe.findall(html)

    def saveImg(self, imgurl, imgname):
        """Download ``imgurl`` and store it as ``<self.path>/<imgname>.jpg``.

        A failure to open the URL is printed and the method returns without
        writing anything.
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            # Best effort: report the failure and skip this image.
            print(e)
            return
        with response:
            img = response.read()
        # os.path.join keeps an absolute self.path absolute; the previous
        # "./%s/%s.jpg" form silently re-rooted it under the current directory.
        target = os.path.join(self.path, "%s.jpg" % imgname)
        with open(target, 'wb') as f:
            f.write(img)

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Any exception raised while connecting or inserting is printed rather
        than propagated.
        """
        # One placeholder per column; values are bound by the driver instead
        # of being interpolated into the SQL text.
        insert = "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)"
        try:
            con = pymysql.connect(host='qdm***w.com',user='q****46',passwd='*******',db='qd*****db',port=3306,charset='utf8')
            try:
                with con.cursor() as cur:  # closes the cursor
                    cur.execute(insert, (title, url, catogary, content))
                # Without an explicit commit the insert is discarded on close.
                con.commit()
            finally:
                con.close()  # release database resources
        except Exception as e:
            print("發生異常:%s"%e)

    def getContent(self):
        """Run the crawl: print each match and write it to ``url.txt``.

        The output directory ``self.path`` is created first. Matches are
        written back to back, with no separator between them.
        """
        path = self.mkdir(self.path)
        data = self.getData()
        # Explicit UTF-8 so scraped non-ASCII text does not depend on the
        # platform's locale encoding.
        with open(os.path.join(path, 'url.txt'), 'w+', encoding='utf-8') as fp:
            for d in data:
                print(d)
                fp.write(d)
# Candidate pages; only `url` is crawled by the run below.
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
url = "http://www.zhihu.com/question/29649162"

# Candidate patterns: `retext` matches http links ending in .jpg or .png,
# `retext2` captures the inner text of <h2> tags whose class starts with
# "zm-item-title". Only `retext2` is used by the run below.
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'
retext = r'http://.*?\.jpg|http://.*?\.png'

# Crawl the page and record the matches under the output directory.
spider = Spider(
    url=url,
    retext=retext2,
    path="趙麗穎",
)
spider.getContent()