本文以實例講述 Python 基於多線程實作抓取資料並存入資料庫的方法,分享給大家供大家參考,具體如下:
1. 資料庫類
"""
使用須知:
代碼中使用的資料表名稱為 aces;如需改用其他資料表名稱,請同步修改所有 SQL 語句中的表名
"""
import pymysql
class Database():
    """Small wrapper around one pymysql connection for the ``aces`` table.

    Connection settings are class attributes, so they can be overridden on
    the class (or a subclass) before an instance is created.  The table name
    ``aces`` is hard-coded in every SQL statement below; change them together
    if a different table is required.
    """

    # Local database connection settings (user name and password).
    host = "localhost"
    user = "root"
    password = ""
    database = "test"
    port = 3306
    charset = "utf8"
    # Placeholders; replaced with live objects by __init__.
    cursor = None
    connet = None

    def __init__(self):
        """Connect to the database and open a cursor on that connection."""
        self.connet = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            # Previously declared but never forwarded, so a non-default
            # port setting was silently ignored.
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.connet.cursor()

    def close(self):
        """Close the cursor and the connection if they were opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.connet is not None:
            self.connet.close()
            self.connet = None

    def dropTables(self):
        """Drop the ``aces`` table if it exists."""
        # The statement used to start with two stray quote characters
        # (a five-quote opener), which made the SQL itself invalid.
        self.cursor.execute("drop table if exists aces")
        print("删表")

    def createTables(self):
        """Create the ``aces`` table if it does not exist yet."""
        self.cursor.execute(
            """create table if not exists aces
            (
                asin varchar(11) primary key not null,
                checked varchar(200)
            )"""
        )
        print("建表")

    def save(self, aceslist):
        """Insert one row and commit.

        Args:
            aceslist: Indexable whose items 0 and 1 are stored in the
                ``asin`` and ``checked`` columns respectively.
        """
        self.cursor.execute(
            "insert into aces ( asin, checked) values(%s,%s)",
            (aceslist[0], aceslist[1]),
        )
        self.connet.commit()

    def is_exists_asin(self, asin):
        """Return True if a row with this ``asin`` exists, else False."""
        # Pass the parameter as a one-element tuple, the documented form
        # for positional placeholders.
        self.cursor.execute("select * from aces where asin = %s", (asin,))
        return self.cursor.fetchone() is not None
# db =Database()
2. 多線程任務類
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import time
import random
import threading
import logging
import pymysql
from bs4 import BeautifulSoup
from local_data import Database
#一個模組中存儲多個類:AmazonSpider、ThreadCrawl(threading.Thread)、AmazonSpiderJob
class AmazonSpider():
def __init__(self):
# Give this spider instance its own Database object (imported from local_data).
# NOTE(review): presumably the Database class from section 1, whose constructor
# opens a MySQL connection — confirm against local_data.
self.db = Database()
def randHeader(self):
head_connection = ['Keep-Alive', 'close'] head_accept = ['text/html, application/xhtml+xml, */*'] head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3'] head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',