天天看點

練:python爬取小說

  • 新人首發,還望多多指教 >_<
# -*- coding:utf-8 -*-
'''
    爬取小說 by @asdfv
    将每部小說的章節内容儲存至本地
'''

import urllib2,re
from bs4 import BeautifulSoup
import threading

# Request a URL and return the raw page source (bytes).
def get_html_content(url):
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    req = urllib2.Request(url=url, headers={'User-Agent': ua})
    return urllib2.urlopen(req).read()

# [novel name, novel url] pairs; module-level for backward compatibility
# with any code that reads it directly.
novel_list = []
def get_novels_list(html):
    """Parse the catalogue page and return ``[[name, url], ...]`` pairs.

    Bug fix: the original appended into the shared module-level list and
    returned it, so every repeated call accumulated duplicate entries.
    Each call now builds a fresh list; the global is still extended so
    external readers keep working.
    """
    found = []
    soup_novels = BeautifulSoup(html, 'html.parser')
    # attrs="l" matches elements whose class is "l" (site-specific markup)
    for section in soup_novels.find_all(attrs="l"):
        for anchor in section.find_all(attrs="clearfix stitle"):
            found.append([anchor.get_text().encode('utf-8'), anchor.get('href')])
    novel_list.extend(found)
    return found

# Follow a novel's landing page to the URL of its chapter index page.
def turn2novel(novel_chapters_url):
    page = get_html_content(novel_chapters_url)
    if not page:
        return None
    soup = BeautifulSoup(page, 'html.parser')
    # attrs="reader" matches the "start reading" link on the landing page
    return soup.find(attrs="reader").get('href')  # chapter index url

# Get every chapter name and link for one novel.
def novel_chapters_content(chapter):
    """Return ``[(chapter_href, chapter_name), ...]`` for one novel.

    Bug fix: the original referenced ``url_chapters_name`` even when the
    page fetch returned nothing, raising NameError; an empty list is now
    returned instead.  The unused ``bookname`` extraction was removed.
    """
    html = get_html_content(chapter)
    if not html:
        return []
    reg = re.compile(r'<li><a href="(.*?)" target="_blank" rel="external nofollow"  title=".*?">(.*?)</a></li>')
    return re.findall(reg, html)

# Get the plain text of one chapter page.
def get_chapter_novel_content(chapter_txt_url):
    """Fetch one chapter page and return its text content.

    Bug fixes:
    - ``re.findall(reg, html)[]`` was a syntax error; the first match
      (``[0]``) is what was intended.
    - A page with no matching content now yields ``''`` instead of
      raising IndexError.
    """
    html = get_html_content(chapter_txt_url)
    if not html:
        return ''
    # Site serves GBK; normalize to UTF-8 before matching.
    html = html.decode('gbk').encode('utf-8')
    reg = re.compile(r'</script>(.*?)<script type="text/javascript">')
    matches = re.findall(reg, html)
    if not matches:
        return ''
    content = matches[0]  # raw chapter markup
    # Turn HTML indent entities and line breaks into plain text.
    return content.replace('&nbsp;&nbsp;&nbsp;&nbsp;', '    ').replace('<br />', '\n')

# 儲存
def download_novel(url):
    html = get_html_content(url)
    listnovel = get_novels_list(html)
    for item in listnovel:
        chapters_url = turn2novel(item[])
        with open('F:\\paqu\\txt\\' + item[].decode('UTF-8').encode('GBK')  + '.txt','a') as f:
            print item[]
            try:
                for item_chapter in novel_chapters_content(chapters_url):
                    # f.writelines(item_chapter[1] + '\n')
                    txt_url = chapters_url + '/' +item_chapter[]
                    # print get_chapter_novel_content(txt_url)
                    f.writelines(' ' + item_chapter[] + '\n\n' + get_chapter_novel_content(txt_url).decode('UTF-8').encode('GBK') + '\n\n')
            except:
                print '----error here!----'
                continue
def th(event):
    """Run ``download_novel(event)`` on a background thread.

    Bug fix: the original used ``target=download_novel(event)``, which
    CALLED download_novel synchronously on the current thread and handed
    its return value (None) to Thread — the thread then did nothing.
    The URL is now passed via ``args`` so the work actually happens on
    the new thread.  Returns the started Thread so callers may join it.
    """
    thr = threading.Thread(target=download_novel, args=(event,))
    thr.start()
    return thr

# Script entry point: scrape the first catalogue page.  Guarded so that
# importing this module no longer kicks off a download as a side effect.
if __name__ == '__main__':
    url = 'http://www.quanshu.net/list/2_1.html'
    th(url)
           
  • 爬取結果顯示:
    練:python爬取小說