天天看點

BeautifulSoup解析工具與css選擇器使用簡介

# coding:utf-8

import requests
from lxml import etree
from bs4 import BeautifulSoup
import chardet
# Root address of the site being scraped.
BASE_DOMAIN = "http://www.ygdy8.net"

# HTTP headers sent with every request; the User-Agent is that of a desktop
# Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.79 Safari/537.36"
    ),
}


def get_detailed_urls(url):
    """Download a page and print the text fragments of each of its table rows.

    The page is requested with the module-level ``HEADERS``, its body is
    decoded as GBK (bytes that cannot be decoded are dropped) and the result
    is parsed by BeautifulSoup with the ``lxml`` parser. For every element
    matched by the CSS selector ``"tr"``, the list of its whitespace-stripped
    strings is printed.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted strings are written to standard output.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when the server does not respond within the timeout.
    """
    # 1. Fetch the page. The timeout keeps an unresponsive server from
    # blocking the crawl indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=10)

    # The original author's inspection found the page declares charset
    # "gb2312"; it is decoded as GBK (a superset of GB2312) and undecodable
    # bytes are ignored.
    text = response.content.decode("gbk", "ignore")

    # 2. Parse the decoded markup into an element tree.
    soup = BeautifulSoup(text, "lxml")

    # Further CSS selector examples for ``soup.select``:
    #   soup.select("tr")             -> every <tr> element
    #   soup.select("tr")[1]          -> the second <tr> element
    #   soup.select("table.tbspan")   -> <table> elements with class "tbspan"
    #   tag["href"] for tag in soup.select("a")
    #                                 -> the href attribute of every <a>

    # 3. Print the text content of every table row as a list of strings.
    # ``print(...)`` with a single argument behaves the same on Python 2
    # and Python 3.
    for row in soup.select("tr"):
        print(list(row.stripped_strings))


def spider(max_pages=1):
    """Run ``get_detailed_urls`` on the numbered listing pages of the site.

    Listing pages are numbered 1 through 7 and are visited in ascending
    order.

    Args:
        max_pages: Maximum number of listing pages to process. The default
            of 1 processes only the first page, which is what the original
            loop did (it stopped after its first iteration). Pass ``None``
            to process all seven pages.

    Returns:
        None.
    """
    # NOTE(review): the original author also left this alternative listing
    # URL in a comment: "http://www.ygdy8.net/html/gndy/dyzz/index.html"
    page_template = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"

    page_numbers = range(1, 8)
    if max_pages is not None:
        # Clamp at zero so a negative limit processes nothing instead of
        # being interpreted as a slice counted from the end.
        page_numbers = page_numbers[:max(max_pages, 0)]

    for page_number in page_numbers:
        get_detailed_urls(page_template.format(page_number))


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    spider()