並不保證爬取所有想要的圖檔。
先對 up 主相簿頁爬取所有圖檔動態的位址,存到文本檔案中:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from tellwlib.py import download
from selenium.webdriver.common.by import By
# Crawl every page of the uploader's album and append the href of each
# picture post (elements with class "picture") to beyond.txt.
url = 'https://space.bilibili.com/177023891/album'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)
try:
    # Collect links from every album page; stop when there is no
    # "next page" button instead of crashing on the last page.
    for i in range(1, 9):  # the album has 8 pages at the time of writing
        WebDriverWait(driver, 20, 0.5).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'picture')))
        with open('beyond.txt', 'a') as f:
            for picture in driver.find_elements(By.CLASS_NAME, 'picture'):
                f.write(picture.get_attribute('href') + '\n')
        # Look for the pagination button labeled with the next page number.
        next_label = str(i + 1)
        next_button = None
        for page in driver.find_elements(By.CLASS_NAME, 'panigation'):
            if page.text == next_label:
                next_button = page
                break
        if next_button is None:
            # No further page button (we are on the last page) -- done.
            # The original clicked the last button here and raised an error.
            break
        ActionChains(driver).move_to_element(next_button).click(next_button).perform()
        time.sleep(5)  # crude wait for the next page of thumbnails to load
finally:
    # quit() (not close()) tears down the whole headless Chrome process.
    driver.quit()
原先的寫法爬到相簿頁的最後一頁時,因為沒有第 9 頁的按鈕,會點選到最後一個分頁按鈕而報錯,程式也應該在此時終止。
接下來從文本檔案依次取出圖檔動態頁的位址,訪問並爬取圖檔:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from tellwlib.py import download
from selenium.webdriver.common.by import By
# Read the picture-post links collected in beyond.txt, visit each one,
# and download every image under the ".images" container into beyond/.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
with open('beyond.txt', 'r') as f:
    contents = f.readlines()
# Launch ONE headless Chrome and reuse it for every link, instead of
# starting (and leaking) a fresh browser per link.
driver = webdriver.Chrome(options=chrome_options)
try:
    for idx, content in enumerate(contents):
        link = content.strip()
        print('processing %dth link %s'%(idx+1, link))
        driver.get(link)
        try:
            WebDriverWait(driver, 20, 0.5).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'images')))
        except TimeoutException:
            # One slow/broken post should not abort the whole run.
            print('timed out waiting for %s, skipping' % link)
            continue
        for pic in driver.find_elements(By.CSS_SELECTOR, '.images > img'):
            picurl = pic.get_attribute('src')
            # Save under beyond/ using the last path component as filename.
            download.download_file(picurl, 'beyond/' + picurl.split('/')[-1])
finally:
    # quit() fully terminates the browser and the chromedriver process.
    driver.quit()
目前做不出 Chrome 的單例類或者類變量,所以還是有些耗資源的,希望自己未來能有所進步。
參考連結:
Python+Selenium+ChromeDriver之浏覽器爬蟲入門