python3學習爬蟲正則以及url

2023-05-14 21:11:39

#coding=utf8
__author__ = 'Administrator'

import os
import re
import urllib.request
import pymysql

class Spider:
    """Fetch one page, extract regex matches from it and store the results.

    Attributes:
        url: Address of the page to download.
        retext: Regular expression applied to the decoded page text.
        path: Directory that receives ``url.txt`` and any saved images.
    """

    def __init__(self,url,retext,path):
        """Remember the target URL, the extraction pattern and the output dir."""
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self,path):
        """Create ``path`` (including missing parents) and return it.

        An already existing directory is not an error.
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(path, exist_ok=True)
        return path

    def getData(self):
        """Download ``self.url`` and return every match of ``self.retext``.

        Returns:
            The list produced by ``re.findall`` on the page text, i.e. whole
            matches, or captured groups when the pattern defines groups.

        Raises:
            Whatever ``urllib.request.urlopen`` raises when the page cannot
            be fetched; download errors are not handled here.
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            raw = response.read()
        # Undecodable bytes are dropped rather than aborting the crawl.
        html = raw.decode('utf-8','ignore')
        return re.compile(self.retext).findall(html)

    def saveImg(self,imgurl,imgname):
        """Download ``imgurl`` and store it as ``<self.path>/<imgname>.jpg``.

        Best effort: a failed download is printed and skipped, and no file
        is created in that case.
        """
        try:
            with urllib.request.urlopen(imgurl) as response:
                img = response.read()
        except Exception as e:
            # Deliberately broad: one broken image must not stop the crawl.
            print(e)
            return
        # os.path.join keeps the target inside the directory mkdir() created,
        # for relative and absolute ``self.path`` alike.
        with open(os.path.join(self.path, "%s.jpg" % imgname), 'wb') as f:
            f.write(img)

    def saveMysql(self,title,url,catogary,content):
        """Insert one row into the ``pic`` table.

        Best effort: any database error is printed instead of raised.
        """
        # Placeholders are filled in by the driver, which escapes the values;
        # there is one placeholder per column (four in total).
        insert = "insert into pic(title,url,catogary,content) values (%s,%s,%s,%s)"
        try:
            con=pymysql.connect(host='qdm***w.com',user='q****46',passwd='*******',db='qd*****db',port=3306,charset='utf8')
            try:
                cur=con.cursor()
                try:
                    cur.execute(insert, (title,url,catogary,content))
                finally:
                    cur.close()  # close the cursor
                # PyMySQL does not autocommit by default; without this the
                # insert is discarded when the connection closes.
                con.commit()
            finally:
                con.close()  # release the database connection
        except  Exception as e:
            print("發生異常:%s"%e)

    def getContent(self):
        """Run the crawl: fetch the matches and write them to ``url.txt``.

        Each match is echoed to stdout and written on its own line to
        ``<self.path>/url.txt``, which is created or overwritten.
        """
        path = self.mkdir(self.path)
        data = self.getData()
        # Explicit UTF-8 so non-ASCII matches do not depend on the locale's
        # default encoding.
        with open(os.path.join(path, 'url.txt'), 'w+', encoding='utf-8') as fp:
            for d in data:
                print(d)
                # One match per line; otherwise entries run together.
                fp.write(d + '\n')


# Candidate pages to crawl; only `url` is handed to the spider below.
url = "http://www.zhihu.com/question/29649162"
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'

# Candidate patterns: `retext` matches http links ending in .jpg or .png,
# `retext2` captures the text inside <h2 class="zm-item-title..."> headings.
# Only `retext2` is used below.
retext = r'http://.*?\.jpg|http://.*?\.png'
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'

# Crawl the page and store the results in the "趙麗穎" directory.
spider = Spider(url=url, retext=retext2, path="趙麗穎")
spider.getContent()

python3學習爬蟲正則以及url

繼續閱讀

來自python的【條件控制/語句循環/break/continue/else/pass】一、條件控制二、語句循環

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入

python3學習爬蟲 正則以及url

繼續閱讀

python3學習爬蟲正則以及url