# -*- coding:utf-8 -*-
'''
Crawl novels by @asdfv
Save each novel's chapters to local text files
'''
import re
import threading
import urllib2

from bs4 import BeautifulSoup
# Fetch a page and return its raw HTML source
def get_html_content(url):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    header = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=header)
    html = urllib2.urlopen(request).read()
    return html
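
# A hedged sketch (not part of the original flow): urllib2.urlopen raises
# urllib2.URLError on network failures, so a retrying wrapper can keep a long
# crawl alive; the retry count of 3 is an arbitrary assumption.
def get_html_content_with_retry(url, retries=3):
    for _ in range(retries):
        try:
            return get_html_content(url)
        except urllib2.URLError:
            pass
    return None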
# Each entry is [novel name, novel url]
novel_list = []

# Parse a list page and collect every novel's title link
def get_novels_list(html):
    soup_novels = BeautifulSoup(html, 'html.parser')
    for string in soup_novels.find_all(attrs="l"):  # blocks with class "l"
        for str_name in string.find_all(attrs="clearfix stitle"):  # title anchors
            novel_list.append([str_name.get_text().encode('utf-8'), str_name.get('href')])
    return novel_list
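
# For illustration only, the accumulated structure looks like
# (values hypothetical): [['<novel name>', '<novel url>'], ...]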
# Follow a novel's entry page to the link of its chapter-list page
def turn2novel(novel_chapters_url):
    html = get_html_content(novel_chapters_url)
    if html:
        soup_novel = BeautifulSoup(html, 'html.parser')
        return soup_novel.find(attrs="reader").get('href')  # chapter-list url
# Get every chapter name and link for one novel
def novel_chapters_content(chapter):
    html = get_html_content(chapter)
    if html:
        reg_bookname = re.compile(r'<div class="chapName"><span class="r">(.*?)</span>')
        bookname = re.findall(reg_bookname, html)  # book title (captured but unused)
        reg = re.compile(r'<li><a href="(.*?)" target="_blank" title=".*?">(.*?)</a></li>')
        url_chapters_name = re.findall(reg, html)
        return url_chapters_name
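
# Note the tuple order: the regex captures href before the link text, so each
# item is (chapter url, chapter name); e.g. (illustrative values only):
# [('<chapter page>.html', '<chapter title>'), ...]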
# Get the plain-text content of one chapter
def get_chapter_novel_content(chapter_txt_url):
    html = get_html_content(chapter_txt_url)
    if html:
        html = html.decode('gbk').encode('utf-8')  # the site serves GBK-encoded pages
        reg = re.compile(r'</script>(.*?)<script type="text/javascript">')
        content = re.findall(reg, html)[0]  # chapter body text
        return content.replace('&nbsp;', ' ').replace('<br />', '\n')
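
# For illustration (hypothetical input), the replace chain turns
# '&nbsp;&nbsp;First line<br />Second line' into '  First line\nSecond line'.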
# Save: write every chapter of every novel on a list page to a local .txt file
def download_novel(url):
    html = get_html_content(url)
    listnovel = get_novels_list(html)
    for item in listnovel:  # item is [name, url]
        chapters_url = turn2novel(item[1])
        # Filenames are GBK-encoded for the Windows filesystem
        with open('F:\\paqu\\txt\\' + item[0].decode('UTF-8').encode('GBK') + '.txt', 'a') as f:
            print item[0]
            try:
                for item_chapter in novel_chapters_content(chapters_url):
                    # item_chapter is (relative chapter url, chapter name)
                    txt_url = chapters_url + '/' + item_chapter[0]
                    f.writelines(' ' + item_chapter[1] + '\n\n' + get_chapter_novel_content(txt_url).decode('UTF-8').encode('GBK') + '\n\n')
            except Exception:
                print '----error here!----'
                continue
def th(event):
    # Pass the callable and its argument separately; writing
    # target=download_novel(event) would run the download synchronously
    # and hand Thread its return value (None) as the target
    thr = threading.Thread(target=download_novel, args=(event,))
    thr.start()

url = 'http://www.quanshu.net/list/2_1.html'
th(url)
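
# A hedged sketch: to crawl several list pages concurrently, one thread per
# page could be spawned. The list/2_<n>.html pagination pattern is an
# assumption extrapolated from the URL above:
# for page in range(1, 4):
#     th('http://www.quanshu.net/list/2_%d.html' % page)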
- Crawl results: