# coding:utf-8
import requests
from lxml import etree
from bs4 import BeautifulSoup
import chardet
BASE_DOMAIN = "http://www.ygdy8.net"
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
}
def get_detailed_urls(url):
    """Fetch one movie-list page and print/collect each table row's text.

    Parameters:
        url: absolute URL of a ygdy8.net movie-list page.

    Returns:
        A list with one entry per ``<tr>`` element; each entry is the list
        of whitespace-stripped strings found inside that row.  The rows
        are also printed as a side effect (preserving original behavior).
    """
    # 1. Fetch the raw page bytes.
    response = requests.get(url, headers=HEADERS)
    # The site declares its charset as gb2312; decode with "gbk" (a
    # superset of gb2312) and drop undecodable bytes rather than crash.
    # (The original also ran chardet.detect() here but never used the
    # result, so that dead computation has been removed.)
    text = response.content.decode("gbk", "ignore")
    # 2. Parse the decoded HTML into an element tree.
    soup = BeautifulSoup(text, "lxml")
    # 3. Extract the visible text of every table row.
    rows = []
    for tr in soup.select("tr"):
        infos = list(tr.stripped_strings)
        print(infos)  # parenthesized form works in both Python 2 and 3
        rows.append(infos)
    return rows
def spider():
    """Crawl pages 1-7 of the "latest movies" listing.

    Pages follow the pattern ``list_23_<n>.html``; each page URL is built
    from the template and handed to get_detailed_urls for extraction.
    """
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for page in range(1, 8):
        # Bug fix: a leftover debug `break` previously stopped the loop
        # after the first iteration, so only page 1 was ever crawled and
        # the range(1, 8) was dead code.  Removed so all 7 pages run.
        url = base_url.format(page)
        get_detailed_urls(url)
# Script entry point: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    spider()