天天看點

python爬取筆趣閣小說的代碼微小調整修改

原文來自:  https://blog.csdn.net/u012717715/article/details/92811743

本人說明:嚴格來說這不算轉載,實際上更應該算是原創!原文代碼不能正常運行(我推測,準確地說應該是:原程式是在 Mac 中開發的,在 Windows 中有若干問題),所以才有本文。但即使在 Windows 中,仍有問題沒有完全解決:如果儲存資料的檔案夾和檔案已經存在,就會發生錯誤,所以大家使用時如果碰到,請手工刪除它們。本來是想發在原文的評論中,但是評論篇幅有限,只能發 40 行代碼!沒辦法才單獨發的。

程式的原理等內容請看原文。

環境:win10 20h2 19042.804,64位簡體中文版,python-3.9.2,VScode x64 1.53.2

一、需要先安裝相依套件:pip install lxml、pip install requests,以及代碼中 from bs4 import BeautifulSoup 所需的 pip install beautifulsoup4

二、完整代碼修改後如下

# -*- coding:UTF-8 -*-
import os
import shutil
import sys
import threading
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
 
"""
類說明:下載下傳《筆趣看》網小說《一念永恒》
Parameters:
    無
Returns:
    無
Modify:
    2017-09-13
"""
class downloader(object):
    """Scrape the chapters of one novel from the biqukan.com site.

    ``get_download_url`` fills ``names``, ``urls`` and ``nums`` from the
    table-of-contents page; ``get_contents`` fetches one chapter;
    ``writer`` appends a chapter to a text file.

    Modify:
        2017-09-13
    """

    # Number of leading links in the chapter list that are ignored.
    # NOTE(review): presumably the "latest chapters" shortcuts that repeat
    # entries listed further down -- confirm against the live page.
    SKIPPED_LINKS = 13

    # Seconds to wait for the server; without a timeout a stalled
    # connection would block its worker thread forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []            # chapter titles
        self.urls = []             # chapter links
        self.nums = 0              # number of chapters

    def _fetch(self, url):
        """Download *url* and return it parsed with the lxml parser.

        Raises:
            requests.RequestException - on timeout, connection failure or
                an HTTP error status.
        """
        response = requests.get(url=url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        # The raw bytes (not ``.text``) are parsed, as in the original code,
        # so the parser decides how to decode the page.
        return BeautifulSoup(response.content, 'lxml')

    def get_download_url(self):
        """Collect the chapter titles and their download links.

        Parameters:
            none
        Returns:
            none; the results are stored in ``names``, ``urls``, ``nums``.
        Raises:
            IndexError - the page has no ``div`` of class ``listmain``.
        Modify:
            2017-09-13
        """
        page = self._fetch(self.target)
        listing = page.find_all('div', class_='listmain')[0]
        links = listing.find_all('a')[self.SKIPPED_LINKS:]
        # get_text() always yields a str, whereas ``.string`` is None for a
        # link that wraps more than one child node.
        self.names = [link.get_text() for link in links]
        # urljoin copes with root-relative, relative and absolute hrefs and
        # avoids the doubled slash that plain concatenation can produce.
        self.urls = [urljoin(self.server, link.get('href')) for link in links]
        self.nums = len(links)

    def get_contents(self, target):
        """Fetch the text of one chapter.

        Parameters:
            target - chapter link (string)
        Returns:
            texts - chapter text (string); every run of eight
                non-breaking spaces is replaced by a blank line.
        Raises:
            IndexError - the page has no ``div`` of class ``showtxt``.
        Modify:
            2017-09-13
        """
        page = self._fetch(target)
        body = page.find_all('div', class_='showtxt')[0].text
        return body.replace('\xa0' * 8, '\n\n')

    def writer(self, name, path, text):
        """Append one chapter to a text file.

        Parameters:
            name - chapter title (string)
            path - file to append to (string)
            text - chapter text (string)
        Returns:
            none
        Modify:
            2017-09-13
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n' + text)
            f.write('\n\n')
 
class myThread(threading.Thread):
    """Thread that runs startDownloadTxt over one range of chapters.

    Parameters:
        name - thread name, shown in the start/exit messages (string)
        temdl - object handed through to startDownloadTxt
        startNum - start of the range handed to startDownloadTxt
        endNum - end of the range handed to startDownloadTxt
    """

    def __init__(self, name, temdl, startNum, endNum):
        super().__init__()
        self.name = name
        self.runlist = []
        self.temdl, self.startNum, self.endNum = temdl, startNum, endNum

    def run(self):
        """Thread body: announce start, download the range, announce exit."""
        print(f"Starting {self.name}")
        startDownloadTxt(self.temdl, self.startNum, self.endNum)
        print(f"Exiting {self.name}")
 
def mkdir(path):
    """Create directory *path* (with any missing parents) if it is absent.

    The directory is created directly and an "already exists" failure is
    treated as the normal case.  A separate existence check beforehand would
    race when several threads call this at the same time.

    Parameters:
        path - directory to create (string)
    Returns:
        True  - the directory was created by this call
        False - something already existed at *path*
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        return False
    return True
    
def subFile(path, dest_dir='c:/Users/lps/Desktop', book_name='一念永恒.txt'):
    """Merge the numbered chapter files in *path* into a single book file.

    The chapter files ("<number>.txt") are concatenated in numeric order
    into *book_name*, the result is moved to *dest_dir*, and the whole
    *path* directory is then deleted.

    Parameters:
        path - directory holding the downloaded chapter files (string)
        dest_dir - where the finished book is moved to (string); adjust
            the default to your own environment
        book_name - file name of the merged book (string)
    Returns:
        none
    """
    # Keep only "<number>.txt" entries.  Hidden system files and a merged
    # book left over from an interrupted run are skipped rather than
    # crashing the numeric sort.
    chapters = []
    for entry in os.listdir(path):
        stem, ext = os.path.splitext(entry)
        if ext.lower() == '.txt' and stem.isdecimal():
            chapters.append((int(stem), entry))
    chapters.sort()

    merged_path = os.path.join(path, book_name)
    with open(merged_path, 'w', encoding='utf-8') as merged:
        for _, entry in chapters:
            chapter_path = os.path.join(path, entry)
            with open(chapter_path, 'r', encoding='utf-8') as chapter:
                merged.write(chapter.read())

    # Name the destination file explicitly: moving *into* a directory fails
    # when a book from an earlier run is already there, whereas an explicit
    # file destination lets the new book replace the old one.
    target = dest_dir
    if os.path.isdir(dest_dir):
        target = os.path.join(dest_dir, book_name)
    shutil.move(merged_path, target)
    shutil.rmtree(path)
    
 
def startDownloadTxt(temdl, startNum, endNum):
    """Download a range of chapters into ./download/<index>.txt.

    Parameters:
        temdl - downloader object; its ``urls``, ``names``,
            ``get_contents`` and ``writer`` members are used
        startNum - first chapter index to download
        endNum - index one past the last chapter to download
    Returns:
        none
    """
    # exist_ok makes this safe when many worker threads get here at once.
    os.makedirs('download', exist_ok=True)
    for i in range(startNum, endNum):
        temPath = './download/' + str(i) + '.txt'
        # Fetch each chapter exactly once (it used to be requested twice).
        text = temdl.get_contents(temdl.urls[i])
        temdl.writer(temdl.names[i], temPath, text)
    # Report the last chapter of the range, but only if there was one: with
    # an empty range there is no chapter index to report.
    if startNum < endNum:
        print(temdl.names[endNum - 1] + 'done')
 
def main():
    """Download every chapter with worker threads, then merge the result."""
    dl = downloader()
    # Collect the chapter titles and their download links.
    dl.get_download_url()
    print('《一念永恒》開始下載下傳:')

    # Use at most 40 worker threads (too many makes the machine sluggish),
    # and never more threads than chapters so that every thread gets work.
    threadNum = max(1, min(40, dl.nums))
    stepNum = dl.nums // threadNum
    threads = []
    for p in range(threadNum):
        start = p * stepNum
        # The last thread also takes the remainder of the division.
        end = dl.nums if p == threadNum - 1 else start + stepNum
        threads.append(myThread('Thread' + str(p), dl, start, end))

    started = []
    try:
        for t in threads:
            t.start()
            started.append(t)
    except RuntimeError:
        # Thread.start() raises RuntimeError when no new thread can be made.
        print("Error: unable to start thread")

    # Always wait for whatever did start before touching the download folder.
    for t in started:
        t.join()

    if len(started) != len(threads):
        # Some chapter ranges were never downloaded; keep the partial files
        # instead of merging an incomplete book and deleting the folder.
        return

    subFile('./download')
    print('一念永恒下載下傳完成')


if __name__ == "__main__":
    main()