天天看點

python多線程寫入資料庫urllib_Python基于多線程實作抓取資料存入資料庫的方法

本文以實例講述 Python 基於多線程實現抓取資料並存入資料庫的方法,分享給大家參考,具體如下:

1. 資料庫類

"""

使用須知:

程式碼中的資料表名稱為 aces;如需更改表名,請同步修改所有 SQL 語句中的表名。

"""

import pymysql

class Database():
    """Thin wrapper around a PyMySQL connection for the ``aces`` table.

    Connection settings are plain class attributes, so they can be overridden
    on the class (or a subclass) before an instance is created. The table
    name ``aces`` is hard-coded in every SQL statement below; change it in all
    of them if a different table is needed.
    """

    # Local database connection settings.
    host = "localhost"
    user = "root"
    password = ""
    database = "test"
    port = 3306
    charset = "utf8"

    # Placeholders only; both are replaced with live objects in __init__.
    # ("connet" is kept as spelled because it is part of the public surface.)
    cursor = ''
    connet = ''

    def __init__(self):
        """Open the database connection and create a cursor on it."""
        self.connet = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            # Pass the configured port so that overriding ``port`` takes effect.
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.connet.cursor()

    def dropTables(self):
        """Drop the ``aces`` table if it exists."""
        # The statement must start with the SQL keyword itself; any stray
        # leading quote characters would be sent to the server as SQL text.
        self.cursor.execute('drop table if exists aces')
        print("删表")

    def createTables(self):
        """Create the ``aces`` table (``asin`` primary key, ``checked``) if missing."""
        self.cursor.execute(
            'create table if not exists aces\n'
            '(\n'
            'asin varchar(11) primary key not null,\n'
            'checked varchar(200));'
        )
        print("建表")

    def save(self, aceslist):
        """Insert one row and commit.

        ``aceslist[0]`` is stored in ``asin`` and ``aceslist[1]`` in
        ``checked``; both are passed as query parameters.
        """
        self.cursor.execute(
            "insert into aces ( asin, checked) values(%s,%s)",
            (aceslist[0], aceslist[1]),
        )
        self.connet.commit()

    def is_exists_asin(self, asin):
        """Return True if a row with this ``asin`` is in ``aces``, else False."""
        # Parameters are passed as a one-element tuple so the value is always
        # treated as a single bound parameter.
        self.cursor.execute('select * from aces where asin = %s', (asin,))
        return self.cursor.fetchone() is not None

# db =Database()

2. 多線程任務類

import urllib.parse

import urllib.parse

import urllib.request

from queue import Queue

import time

import random

import threading

import logging

import pymysql

from bs4 import BeautifulSoup

from local_data import Database

#一個模組中存放多個類別:AmazonSpider、ThreadCrawl(threading.Thread)、AmazonSpiderJob

class AmazonSpider():

def __init__(self):
    """Create a ``Database`` instance and keep it on ``self.db``."""
    handle = Database()
    self.db = handle

def randHeader(self):

head_connection = ['Keep-Alive', 'close'] head_accept = ['text/html, application/xhtml+xml, */*'] head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3'] head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',

'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',

'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',

'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',

'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',