天天看点

Selenium获取网页数据

# coding:utf-8

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os


def get_url_html(url):
    """Load ``url`` in headless Chrome and return the page source.

    The ``chromedriver`` executable is looked up in the directory that
    contains this file. Drivers can be downloaded from
    https://chromedriver.storage.googleapis.com/index.html

    Args:
        url: Address of the page to load.

    Returns:
        The value of the driver's ``page_source`` after the page has been
        requested and a fixed 3-second wait has elapsed.

    Raises:
        Any exception raised while starting the driver or loading the page
        is propagated unchanged. Once the driver has started, it is always
        shut down before the exception leaves this function.
    """
    # Resolve the driver relative to this file, not the current working dir.
    driver_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "chromedriver"
    )

    # Browser options: no visible window, plus flags passed through to Chrome.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")

    # Start the webdriver.
    # NOTE(review): `executable_path` / `chrome_options` are the older keyword
    # names; newer Selenium releases expect `service=` / `options=` -- confirm
    # against the installed Selenium version before changing.
    session = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)

    try:
        # Request the page.
        session.get(url)

        # Fixed pause after the request; tune the duration to the target page.
        time.sleep(3)

        # Grab the page source.
        return session.page_source
    finally:
        # Always quit the webdriver -- even when loading fails -- otherwise a
        # chromedriver process is left running in the background.
        session.quit()