天天看點

python3學習爬蟲 正則以及url

#coding=utf8
__author__ = 'Administrator'

import os
import re
import urllib.request
import pymysql

class Spider:
    """Download a page, extract regex matches from it and store the results.

    Attributes:
        url: Address of the page to download.
        retext: Regular expression applied to the decoded page text.
        path: Directory that receives ``url.txt`` and any saved images.
    """

    def __init__(self, url, retext, path):
        """Remember the page address, the pattern and the output directory."""
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        """Create ``path`` (including missing parents) and return it.

        An already existing directory is not an error.
        """
        # exist_ok replaces the racy "check os.path.exists, then create" pair.
        os.makedirs(path, exist_ok=True)
        return path

    def getData(self):
        """Fetch ``self.url`` and return every match of ``self.retext``.

        Returns:
            The list produced by ``re.findall`` on the page decoded as UTF-8
            (undecodable bytes are dropped): strings for patterns with zero
            or one capture group, tuples for patterns with several groups.

        Raises:
            urllib.error.URLError: if the page cannot be retrieved.
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read()
        # Debug dump of the raw bytes, kept from the original implementation.
        print(html)
        html = html.decode('utf-8', 'ignore')
        return re.compile(self.retext).findall(html)

    def saveImg(self, imgurl, imgname):
        """Download ``imgurl`` and store it as ``<self.path>/<imgname>.jpg``.

        A failure to open the URL is printed and the image is skipped
        (best effort). The output directory must already exist.
        """
        try:
            response = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
            return
        with response:
            img = response.read()
        # os.path.join keeps absolute output directories intact, unlike the
        # former "./%s/%s.jpg" formatting which always produced a relative path.
        target = os.path.join(self.path, "%s.jpg" % imgname)
        with open(target, 'wb') as f:
            f.write(img)

    def saveMysql(self, title, url, catogary, content):
        """Insert one row into the ``pic`` table.

        Any database error is printed instead of raised, so a failed insert
        does not stop the crawl.
        """
        # Placeholders are filled in by the driver, which escapes the values.
        # The previous string-built statement had three placeholders for four
        # values (so it always raised TypeError) and was open to SQL injection.
        insert = "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)"
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46', passwd='*******', db='qd*****db', port=3306, charset='utf8')
            try:
                cur = con.cursor()
                try:
                    cur.execute(insert, (title, url, catogary, content))
                finally:
                    cur.close()  # close the cursor
                # PyMySQL does not autocommit by default; without this the
                # inserted row is discarded when the connection closes.
                con.commit()
            finally:
                con.close()  # release the database connection
        except Exception as e:
            print("發生異常:%s" % e)

    def getContent(self):
        """Run the crawl and write every match to ``<self.path>/url.txt``.

        The matches are echoed to stdout and written back to back, without a
        separator. Image download is currently disabled; to enable it, call
        ``self.saveImg(d, index)`` inside the loop for the matches that are
        image URLs.
        """
        path = self.mkdir(self.path)
        data = self.getData()
        with open(os.path.join(path, 'url.txt'), 'w+') as fp:
            for d in data:
                print(d)
                fp.write(d)


# Backward-compatible alias: this function used to live at module level
# (which made ``spider.getContent()`` fail); keep the old name importable.
getContent = Spider.getContent


# Extraction patterns: `retext` matches absolute .jpg/.png links, `retext2`
# captures the text inside <h2 class="zm-item-title..."> headings.
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'
retext = r'http://.*?\.jpg|http://.*?\.png'

# Candidate pages; only `url` is passed to the spider below.
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
url = "http://www.zhihu.com/question/29649162"

# Build the spider for `url` with the heading pattern and run the crawl.
spider = Spider(url=url, retext=retext2, path="趙麗穎")
spider.getContent()