
A requests-based web-novel scraper with a power level of 50

This scraper can batch-download every chapter of a novel from certain pirate web-fiction sites. Three things boost its fighting strength (a minimal sketch of all three follows the list):

  • setting a request header
  • setting timeout and max_retries
  • skipping past the 404 and 502 errors thrown by the server's anti-scraping layer
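
Before the full script, here is a minimal, self-contained sketch of just these three tricks; the URL is a placeholder and the User-Agent is the same browser string used in the script below:

import requests
from requests.adapters import HTTPAdapter

# A browser-like User-Agent so the request does not look like a bare script
header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}

sess = requests.Session()
# Retry failed connection attempts up to 5 times, on both schemes
sess.mount('http://', HTTPAdapter(max_retries=5))
sess.mount('https://', HTTPAdapter(max_retries=5))

try:
    # timeout=5: fail the request if the server does not respond within 5 seconds
    r = sess.get("https://example.com/some/chapter.html", timeout=5, headers=header)  # placeholder URL
    if r.status_code in (404, 502):
        # The anti-scraping layer sometimes answers valid pages with 404/502,
        # so instead of giving up we would simply retry the same URL
        print("Got a %d, will retry the same page..." % r.status_code)
except requests.exceptions.RequestException as e:
    print("Request failed:", e)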

Now for the full code:

import requests
from bs4 import BeautifulSoup
import time
from requests.adapters import HTTPAdapter

def download(bookID):
    url = "https://www.xbiqugexsw.com"
    flag = True
    count = 0
    header={"Host": "www.xbiqugexsw.com","User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
    # 要添加header避免反爬虫,https://blog.csdn.net/qq_42787271/article/details/81571229
    sess = requests.Session()
    sess.mount('http://', HTTPAdapter(max_retries=5))
    sess.mount('https://', HTTPAdapter(max_retries=5))
    while flag:
        if count==0:
            page = "/book/"+str(bookID)+"/"+str(count)+".html"
        try:
            r = sess.get(url+page, timeout=5, headers=header) # reconnect if there is no response within 5 seconds
            s = BeautifulSoup(r.text, "html.parser")
            if s.h1.text.startswith("404"):
                print("The server threw back a 404, no problem, keep poking...")
                continue
            # write to file
            with open("test.txt","a",encoding="utf-8") as f:
                f.write("\n\n"+s.h1.text+"\n\n") # chapter title
                print("[Chapter downloaded]: %s"%(s.h1.text))
                for line in s.find(id="content").contents:
                    line = line.string
                    if line is None:
                        f.write("\n") # 段落空行
                    elif line.endswith("http://www.xbiqugexsw.com/") or line.startswith("本章未完"):
                        continue # 去除反转码水印
                    elif line.startswith("正在加载中"):
                        continue # 避免页面加载不完全
                    else:
                        line = line.replace("\r\n\t\t\t\t","")
                        f.write(line)
            # get the link to the next page or the next chapter
            b = s.select(".bottem2")[0].select("a")[2]
            if b.has_attr("rel"):
                page =  b["href"]
                count += 1
            else:
                print("【已下载至最新章节】")
                flag = False
        except requests.exceptions.RequestException as e:
            print(time.strftime('%Y-%m-%d %H:%M:%S'),e)
        # Sometimes the network connection is unstable and the request may need to wait;
        # a timeout can be handled by simply passing timeout to get(), as above.
        # Reference: https://www.cnblogs.com/gl1573/p/10129382.html

download(202343)
           

About the bookID:

For example, in https://www.xbiqugexsw.com/book/202343, the number at the end is the bookID.
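
If you want to pull that number out of a book URL automatically, a tiny helper like this works (this helper is not part of the original script; it just matches the /book/<id> pattern shown above):

import re

def extract_book_id(book_url):
    # grab the digits that follow "/book/" in URLs like https://www.xbiqugexsw.com/book/202343
    m = re.search(r"/book/(\d+)", book_url)
    return int(m.group(1)) if m else None

print(extract_book_id("https://www.xbiqugexsw.com/book/202343"))  # prints 202343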

If you switch to a better-behaved site, things get much simpler:

import requests
from bs4 import BeautifulSoup
import time
from requests.adapters import HTTPAdapter

def download(bookName,firstPageUrl):
    url = "http://www.biquge.se"
    flag = True
    count = 0
    sess = requests.Session()
    sess.mount('http://', HTTPAdapter(max_retries=5))
    sess.mount('https://', HTTPAdapter(max_retries=5))
    with open("%s.txt"%bookName,"w",encoding="utf-8") as f:
        while flag:
            if count==0:
                page = firstPageUrl
            try:
                r = sess.get(page, timeout=5) # reconnect if there is no response within 5 seconds
                s = BeautifulSoup(r.text, "html.parser")
                if s.h1.text.startswith("404"):
                    print(time.strftime('%Y-%m-%d %H:%M:%S'),"The server threw back a 404, no problem, keep poking...")
                    continue
                if s.h1.text.startswith("502"):
                    print(time.strftime('%Y-%m-%d %H:%M:%S'),"The server threw back a 502, no problem, keep poking...")
                    continue
                f.write("\n\n"+s.h1.text+"\n\n") # 标题
                print("【已下载章节】: %s"%(s.h1.text)+" "*10,end="\r",flush=True) # end='\r'可以让光标回到最前面,从而实现覆盖刷新
                content = s.find(id="content").prettify().replace('<div id="content">',"").replace('</div>',"").replace("<br/>","\n")
                f.write(content)
                # get the link to the next page or the next chapter
                b = s.select(".bottem2")[0].select("a")[3]
                page =  url+b["href"]
                count += 1
                if b["href"].endswith("/"):
                    print("【已下载至最新章节】")
                    flag = False
            except requests.exceptions.RequestException as e:
                print(time.strftime('%Y-%m-%d %H:%M:%S'),e)
            # Sometimes the network connection is unstable and the request may need to wait;
            # a timeout can be handled by simply passing timeout to get(), as above.
            # Reference: https://www.cnblogs.com/gl1573/p/10129382.html

t0 = time.time()
bookName = "*****"
firstPageUrl = "http://www.biquge.se/12190/34012714.html"  # URL of the novel's first chapter
download(bookName,firstPageUrl)
print("下载累计耗时%d秒"%(time.time()-t0))