天天看點

使用selenium爬取淘寶商品資訊

本文使用 selenium 爬取淘寶商品資訊:先進入淘寶首頁,在搜尋框中自動輸入 apple,頁面會自動跳轉到 apple 的搜尋結果頁,然後爬取其中的商品資訊。翻到第二頁之前,要先切換視窗句柄(window handle),否則爬取到的資訊仍然是第一頁的。

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import time


def get_productlist(browser,txtname,current_page):
    """Scrape every product card on the current result page into a text file.

    Args:
        browser: Selenium WebDriver positioned on a Taobao search-result page.
        txtname: Output file name without extension; '.txt' is appended.
        current_page: Label of the page being scraped, used only in the
            progress message (any value accepted by str()).

    The output file is overwritten on every call and receives one
    tab-separated line per product: name, shop, location, price, image URL.
    """
    print('正在爬取'+'第'+str(current_page)+'頁内容')
    productlist=browser.find_elements_by_xpath("//div[@id='mainsrp-itemlist']//div[@class='items']//div[@class='item J_MouserOnverReq  ']")

    # Both sub-trees of a product card are addressed relative to the card.
    ctx_box=".//div[@class='ctx-box J_MouseEneterLeave J_IconMoreNew']"
    pic_box=".//div[@class='pic-box J_MouseEneterLeave J_PicBox']"

    rows=[]
    for product in productlist:
        price=product.find_element_by_xpath(ctx_box+"//div[@class='price g_price g_price-highlight']//strong").text
        name=product.find_element_by_xpath(ctx_box+"//div[@class='row row-2 title']//a").text
        shop=product.find_element_by_xpath(ctx_box+"//div[@class='row row-3 g-clearfix']//div[@class='shop']//a").text
        location=product.find_element_by_xpath(ctx_box+"//div[@class='row row-3 g-clearfix']//div[@class='location']").text
        image=product.find_element_by_xpath(pic_box+"//div[@class='pic-box-inner']//div[@class='pic']//a//img").get_attribute('src')
        rows.append((name,shop,location,price,image))
        # get_attribute() returns None when the attribute is absent, so
        # convert explicitly instead of concatenating raw values.
        print(str(price)+' '+str(image))

    # Open the file once: reopening it in write mode for every row would
    # truncate it each time and keep only the last product.
    documentname=txtname+'.txt'
    with open(documentname,'w',encoding='utf-8') as f:
        for row in rows:
            f.write('\t'.join(str(field) for field in row)+'\n')
    
def scrollTo(browser):
    """Scroll the page down in five steps so more products get loaded.

    Each step runs a window.scrollTo script 1280 pixels further down than
    the previous one and then pauses for two seconds.
    """
    for step in range(1,6):
        offset=step*1280
        browser.execute_script('window.scrollTo( 800 ,%d)' % offset)
        time.sleep(2)
    
    
def next_page(browser):
    """Click the pager's "next" link, then pause five seconds."""
    # XPath of the anchor inside the pager's "next" list item.
    next_link_xpath=(
        "//div[@id='mainsrp-pager']"
        "//div[@class='m-page g-clearfix']"
        "//div[@class='wraper']"
        "//div[@class='inner clearfix']"
        "//ul[@class='items']"
        "//li[@class='item next']"
        "//a"
    )
    next_link=browser.find_element_by_xpath(next_link_xpath)
    next_link.click()
    time.sleep(5)
    
# Read the current page number from the pager
def get_currentpage(browser):
    """Return the text of the pager's active item, i.e. the current page number."""
    active_item_xpath=(
        "//div[@id='mainsrp-pager']"
        "//div[@class='m-page g-clearfix']"
        "//div[@class='wraper']"
        "//div[@class='inner clearfix']"
        "//ul[@class='items']"
        "//li[@class='item active']"
    )
    active_item=browser.find_element_by_xpath(active_item_xpath)
    return active_item.text


# Switch window handles, then move to the next page
def hanle(browser):
    """Focus every open browser window in turn and click "next page" in it.

    Args:
        browser: Selenium WebDriver whose windows are iterated.

    The driver is left focused on the last handle in browser.window_handles.
    """
    for handle in browser.window_handles:
        # switch_to.window() replaces the deprecated switch_to_window(),
        # which newer Selenium releases no longer provide.
        browser.switch_to.window(handle)
        next_page(browser)  # advance the now-focused window by one page
    

#browser.close()
if __name__=='__main__':
    # Script entry point: open Taobao in Chrome, search for "apple", then
    # scrape the first result pages into taobao1.txt, taobao2.txt, ...

    # Request headers. They only feed the PhantomJS capabilities below; the
    # Chrome session gets its settings from the ChromeOptions arguments.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
           'Accept-Charset': 'utf-8',
           'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
           'Connection': 'keep-alive'
           }

    cap = DesiredCapabilities.PHANTOMJS.copy()  # copy() so the library's shared dict is not modified

    for key, value in headers.items():
        cap['phantomjs.page.customHeaders.{}'.format(key)] = value

    url='https://www.taobao.com/'
    # PhantomJS alternative that would use the capabilities built above:
    #browser = webdriver.PhantomJS(desired_capabilities=cap)
    options = webdriver.ChromeOptions()  # Chrome start-up settings
    # Ask for a Chinese browser locale. The original argument text was lost
    # (the line was an unterminated string literal); restored from its
    # "set Chinese" comment.
    options.add_argument('lang=zh_CN.UTF-8')
    # Chrome counterparts of the request headers above
    options.add_argument('user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"')  # replace the user agent
    options.add_argument('Accept="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"')
    options.add_argument('Connection="keep-alive"')
    options.add_argument('Accept-Charset="utf-8"')
    options.add_argument('Accept-Language="zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"')
    browser=webdriver.Chrome(chrome_options=options)
    browser.set_window_size(1920,1080)
    browser.get(url)
    time.sleep(3)  # give the home page time to load

    # Type the keyword into the home-page search form and submit it, which
    # navigates to https://s.taobao.com/search?q=apple
    try:
        searchKey=browser.find_element_by_xpath("//form[@id='J_TSearchForm']//input[@id='q']")
        searchKey.send_keys('apple')

        searchButton=browser.find_element_by_xpath("//form[@id='J_TSearchForm']/div[@class='search-button']")
        searchButton.click()
    except Exception as e:
        # Best effort: report the failure and carry on.
        print(e)

    # One output file per scraped page; extend the list to scrape more pages.
    txtnamelist=['taobao1','taobao2','taobao3']
    start_page=0
    end_page=3
    for txtname in txtnamelist[start_page:end_page]:
        get_productlist(browser,txtname,get_currentpage(browser))
        hanle(browser)  # switch window handle and click "next page"

    # hanle() also runs after the last scrape, so this reports the page
    # following the last one written to disk.
    page_next=get_currentpage(browser)
    print("目前頁碼為:"+page_next)