一個百度貼吧下載指定單個文章所有回覆的工具(JavaScript)實作

2021-12-04 23:50:00

var http = require('http'),

fs = require('fs');

/**
 * Crawler state for a single Baidu Tieba thread.
 *
 * Builds the base URL for the thread; the page number is appended to it for
 * every request.
 *
 * @param {string|number} postId - Thread id, inserted after `/p/` in the URL.
 * @param {string|number} [seeLz=1] - Value of the `see_lz` query parameter.
 *   Defaults to 1 only when the argument is omitted (undefined or null), so
 *   an explicit 0 is passed through unchanged.
 *   NOTE(review): presumably 1 limits the page to the thread starter's posts
 *   and 0 shows everyone's - confirm against the site.
 */
function Spider(postId, seeLz) {
  // `seeLz || 1` would silently turn a deliberate 0 into 1, so test for
  // absence explicitly instead of relying on truthiness.
  var seeLzFlag = (seeLz === undefined || seeLz === null) ? 1 : seeLz;

  // Page that the next crawl() call will fetch.
  this.currentPageNum = 1;

  // Total number of pages in the thread; filled in once a page is parsed.
  this.numOfPagesToCrawl = 0;

  this.baseUrl = 'http://tieba.baidu.com/p/'
    + postId
    + '?see_lz='
    + seeLzFlag
    + '&pn=';

  // Buffer for the HTML of the page currently being downloaded.
  this.data = '';
}

Spider.prototype = {

  constructor: Spider,

  /**
   * Download one page of the thread, print and save each of its posts at a
   * 500 ms pace, then continue with the following page until the page count
   * reported by the downloaded page has been reached.
   *
   * The work is asynchronous, so nothing is returned: the downloaded HTML is
   * handed to a Processor object, which extracts the data and writes the file.
   *
   * @param {number} [pageNum] - Page to fetch; defaults to this.currentPageNum.
   */
  crawl: function(pageNum) {
    var self = this;

    // Keep the page cursor in sync with an explicitly requested page so the
    // "next page" computed below really is the one after it.
    if (pageNum) {
      self.currentPageNum = pageNum;
    }

    var url = self.baseUrl + self.currentPageNum;

    // Every page starts with an empty buffer.
    self.data = '';

    var req = http.request(url, function(res) {
      res.setEncoding("utf8");

      // The response arrives in chunks. Accumulate them on the spider via
      // `self`: inside these callbacks `this` is the response object.
      res.on('data', function(chunk) {
        self.data += chunk;
      });

      res.on('end', function() {
        var processor = new Processor(self.data);
        var title = processor.getTitle();

        // The page count is extracted as text; store a real number (0 when it
        // could not be found) so the comparison below is numeric.
        self.numOfPagesToCrawl = parseInt(processor.getPageCount(), 10) || 0;

        console.log(title);
        console.log('====================');
        console.log('');

        var posts = processor.getPosts();
        var index = 0;

        // Advance to the next page, or stop after the last one.
        function finishPage() {
          console.log("end of this page");
          self.data = '';
          self.currentPageNum++;

          // `<=` so that the final page is crawled as well.
          if (self.currentPageNum <= self.numOfPagesToCrawl) {
            console.log("get ready to the next page");
            self.crawl(self.currentPageNum);
          } else {
            console.log("that's all there's to it...");
          }
        }

        // Nothing to print or save on this page: do not write "undefined".
        if (posts.length === 0) {
          finishPage();
          return;
        }

        var interval = setInterval(function() {
          console.log(posts[index]);
          console.log("> posts left on the current page: " + (posts.length - index));
          console.log('~~~~~~~~~~~~~~~~~~~~');

          processor.writeFile(title, posts[index] + '\n\n');
          index++;

          if (index >= posts.length) {
            clearInterval(interval);
            finishPage();
          }
        }, 500);
      });
    });

    // Without a listener a network failure is thrown as an uncaught 'error' event.
    req.on('error', function(err) {
      console.error('request for ' + url + ' failed: ' + err.message);
    });

    req.end();
  }

};

/**
 * Extracts the title, page count and posts from the HTML of one thread page
 * and appends text to an output file.
 *
 * @param {string} data - HTML source of the page; a missing value is treated
 *   as an empty page.
 */
function Processor(data) {
  this.data = (data === undefined || data === null) ? '' : String(data);
}

Processor.prototype = {

  constructor: Processor,

  /**
   * Extract content that exists in a single location of the page.
   *
   * @param {RegExp} regex - Non-global pattern with one capture group.
   * @returns {string} The first capture group, or '' when nothing matches.
   */
  matchSingle: function(regex) {
    var matched = this.data.match(regex);
    return matched ? matched[1] : '';
  },

  /**
   * Extract content that exists in multiple locations of the page.
   *
   * @param {RegExp} regex - Global (`g`) pattern.
   * @returns {string[]} Every full match (with the `g` flag, match() returns
   *   whole matches rather than capture groups), or [] when nothing matches.
   */
  matchMulti: function(regex) {
    var matched = this.data.match(regex);
    return matched ? matched : [];
  },

  /**
   * Remove rubbish content: HTML tags and ALL whitespace.
   *
   * @param {string} str - Raw HTML fragment.
   * @returns {string} The fragment without tags and whitespace.
   */
  purify: function(str) {
    var htmlTags = /<.*?>/g;
    var spaces = /\s+/g;
    return str.replace(htmlTags, '').replace(spaces, '');
  },

  /**
   * Extract the total page count of the thread.
   *
   * @returns {string} The page count as text, or '' when it is not found.
   */
  getPageCount: function() {
    // NOTE(review): the pattern of this call was lost from the file (only the
    // opening "/" survived). This replacement assumes the counter is the
    // number in the second <span> after "l_reply_num" - confirm against the
    // live page markup.
    var pageCount = this.matchSingle(/l_reply_num[\s\S]*?<\/span>[\s\S]*?<span[^>]*>(\d+)<\/span>/);
    return pageCount;
  },

  /**
   * Extract the thread title.
   *
   * @returns {string} The title, or '' when it is not found.
   */
  getTitle: function() {
    var title = this.matchSingle(/core_title_txt.*?title="(.*?)"/);
    return title;
  },

  /**
   * Extract all posts of the page as plain text.
   *
   * @returns {string[]} One purified string per post, in page order.
   */
  getPosts: function() {
    // NOTE(review): the opening part of this pattern was stripped from the
    // file; restored on the assumption that each post body sits in a
    // <div id="post_content_..."> element - confirm against the live markup.
    var rawPosts = this.matchMulti(/<div id="post_content_.*?>(.*?)<\/div>/g);
    var posts = [];

    for (var i = 0; i < rawPosts.length; i++) {
      posts.push(this.purify(rawPosts[i]));
    }

    return posts;
  },

  /**
   * Append data to "<fileName>.txt" in the current working directory.
   *
   * @param {string} fileName - Base name of the file; 'output' when empty.
   * @param {string} data - Text to append.
   */
  writeFile: function(fileName, data) {
    // The name comes from the scraped page title, so replace characters that
    // are path separators or not allowed in file names.
    var safeName = String(fileName || 'output').replace(/[\\\/:*?"<>|]/g, '_');

    fs.appendFile(safeName + '.txt', data, function(err) {
      if (err) {
        throw err;
      }
    });
  }

};

// Script entry point: the thread id is read from the first command-line
// argument, with a built-in id used when none is supplied.
var postId = process.argv[2] || 3138733512;

var spider = new Spider(postId);

spider.crawl();

一個百度貼吧下載指定單個文章所有回覆的工具(JavaScript)實作

繼續閱讀

主流浏覽器四大綜合性能測試

JavaScript自學筆記【4】函數的聲明與調用目錄二、函數的聲明三、函數的調用

請求逾時VUE axios重新再次請求

nodejs微信開發---授權登入+擷取使用者資訊微信網頁授權

debian9更新4.9.0核心到4.19.2核心過程

Javascript建構Bingo卡片遊戲

JavaScript的那些坑之事件代理事件代理事件階段

javascript的for (var i in data)慎用javascript中的for (var i in data)謹慎用

tab滑鼠經過菜單切換

vue （vue2.0）使用總結(從大體結構總結)

vue搭建過程及出現問題

/\B(?=(?:\d{3})+$)/g 一條令人費解的正規表達式

适用于JavaScript的ECMAScript 2020規範向前發展

JS生成uuid的四種方法

layui多任務上傳添加進度條