天天看點

Selenium擷取網頁資料

# coding:utf-8

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os


def get_url_html(url: str) -> str:
    """Load *url* in headless Chrome and return the rendered page source.

    The chromedriver executable is expected to sit next to this file (the
    project root) under the name ``chromedriver``. Driver downloads:
    https://chromedriver.storage.googleapis.com/index.html

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the browser after a fixed 3-second
        wait following the page load.

    Raises:
        Whatever the Selenium driver raises while starting the browser or
        loading the page; the browser session is shut down before the
        exception propagates.
    """
    # Resolve the driver path relative to this file, not the current
    # working directory, so the function works no matter where it is run.
    driver_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "chromedriver"
    )

    # Browser options: no visible window, plus the two flags passed by the
    # original code (--disable-dev-shm-usage, --no-sandbox).
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")

    # Start the webdriver.
    # NOTE(review): `executable_path` / `chrome_options` look like the legacy
    # Selenium keyword names; newer Selenium releases appear to expect
    # `service=` / `options=` instead -- confirm against the installed version.
    session = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)

    try:
        # Visit the url.
        session.get(url)

        # Sleep 3 seconds after visiting the url; adjust as the situation
        # requires.
        time.sleep(3)

        # Grab the page source.
        return session.page_source
    finally:
        # Always quit the webdriver -- even when loading the page fails --
        # otherwise a chromedriver process is left running in the background.
        session.quit()