1、某某**集招**資訊
2、使用到了requests、bs4、openpyxl、time等模組
(1)分析頁面,發送請求,並把「發送請求並傳回回應內容」定義為一個函式。
url = 'https://***.*****.com/company/{0}/jobs/?n={1}'.format(qiye_id, page)
(2)提取資料,BeautifulSoup解析。
bs = BeautifulSoup(html, 'html.parser')
job_list = bs.find_all('div', class_='c-job-list')
for item in job_list: # 分别周遊每一個崗位
name = item.find('h3').text
(3)存儲資料,openpyxl對excel操作。經典六步驟:
def save_excel(list_1):
    """Write every row of ``list_1`` to a new workbook and save it to disk.

    Args:
        list_1: Iterable of rows; each row is appended to the active sheet
            via ``sheet.append``.
    """
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    for row in list_1:
        worksheet.append(row)
    # Persist the workbook under a fixed file name.
    workbook.save('06-招**資訊1.xlsx')
(4)開始調用,循環調用上面的函數。注意每次請求之間使用time.sleep()暫停(下方程式碼暫停2秒)。
def start(id, pages):
    """Fetch pages 1..``pages`` for one company, parse them, then save.

    Args:
        id: Company identifier forwarded to ``send_request``.
        pages: Number of listing pages to fetch, starting from page 1.
    """
    for page in range(1, pages + 1):
        # Pass the loop variable, not the total: using ``pages`` here would
        # request the last page on every iteration.
        resp_data = send_request(id, page)
        parse_html(resp_data)
        # Pause between requests.
        time.sleep(2)
    # Save once, after all pages have been collected into ``lst``.
    save_excel(lst)
import requests
from bs4 import BeautifulSoup
import openpyxl
import time
lst = []
def send_request(qiye_id, page):
    """Fetch one page of a company's job listings and return the HTML text.

    Args:
        qiye_id: Company identifier inserted into the URL path.
        page: Page number sent as the ``n`` query parameter.

    Returns:
        The response body as text (``resp.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout below.
    """
    url = 'https://www.*****.com/company/{0}/jobs/?n={1}'.format(qiye_id, page)
    headers = {
        # Well-formed desktop Chrome User-Agent. The previous value had
        # "KABUL" where "KHTML" belongs and lacked the spaces that separate
        # the product tokens.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.198 Safari/537.36"
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    return resp.text
# 提取資料
def parse_html(html):
    """Extract job rows from a listing page and append them to ``lst``.

    Each ``div.c-job-list`` element yields one row of the form
    ``[name, span0, span1, span2, span3, span4, url]``, where the spans are
    the first five ``<span>`` texts inside ``div.job-desc`` and ``url`` is
    the site prefix plus the ``href`` of ``a.job-name``.

    Cards without an ``<h3>`` title are skipped. Any other missing piece is
    stored as an empty string, so a single malformed card does not raise and
    abort the whole run before anything is saved.

    Args:
        html: HTML text of one listing page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for card in soup.find_all('div', class_='c-job-list'):
        title_tag = card.find('h3')
        if title_tag is None:
            # No title: nothing useful to record for this card.
            continue

        desc_tag = card.find('div', class_='job-desc')
        spans = desc_tag.find_all('span') if desc_tag is not None else []
        # Always emit exactly five description columns so rows stay aligned.
        fields = [span.text for span in spans[:5]]
        fields += [''] * (5 - len(fields))

        # The link points at the detailed job description page.
        link_tag = card.find('a', class_='job-name')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https://www.*****.com' + href if href else ''

        lst.append([title_tag.text, *fields, url])
def save_excel(list_1):
    """Write every row of ``list_1`` to a new workbook and save it to disk.

    Args:
        list_1: Iterable of rows; each row is appended to the active sheet
            via ``sheet.append``.
    """
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    for row in list_1:
        worksheet.append(row)
    # Persist the workbook under a fixed file name.
    workbook.save('06-**資訊.xlsx')
def start(id, pages):
    """Fetch pages 1..``pages`` for one company, parse them, then save.

    Args:
        id: Company identifier forwarded to ``send_request``. The name
            shadows the builtin but is kept for caller compatibility.
        pages: Number of listing pages to fetch, starting from page 1.
    """
    for page in range(1, pages + 1):
        # Pass the loop variable, not the total: using ``pages`` here would
        # request the last page on every iteration and never fetch the
        # earlier ones.
        resp_data = send_request(id, page)
        parse_html(resp_data)
        # Pause between requests.
        time.sleep(2)
    # Save once, after all pages have been collected into ``lst``.
    save_excel(lst)
if __name__ == '__main__':
    # Descriptive names avoid shadowing the builtin ``id`` at module scope
    # and make clear that the second value is a page count, not a page number.
    company_id = '16021570'
    page_count = 3
    start(company_id, page_count)