天天看點

Node.js 使用 async 優化的爬取

/*
 使用request + cheerio來爬取zngirls網站上的資料
 */
const request = require('request');
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const url = require('url');
const util = require('util');
const path = require('path');
const process = require('process');
const events = require('events');
const EventEmitter = events.EventEmitter;
const async = require('async');

/**
 * Crawler for the galleries linked from one profile page on zngirls.com.
 *
 * Image downloads are pushed onto an `async.queue` so that at most two
 * downloads run at the same time.
 *
 * @constructor
 * @param {number|string} girlID - Id substituted into the profile URL
 *     (`http://www.zngirls.com/girl/<id>/`); also used as the name of the
 *     top-level download directory.
 */
function Crawl(girlID) {
    this.girlID = girlID;
    this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';
    var self = this;
    // Worker queue with concurrency 2: each task is {url, file} and is
    // considered finished when download2 invokes `callback`.
    this.queue = async.queue(function (task, callback) {
        console.log('url:' + task.url + ' file:' + task.file);
        self.download2(task.url, task.file, callback);
    }, 2);
}

Crawl.prototype = {
    /**
     * Fetches the profile page and, for every gallery link found on it,
     * creates the local download directory and starts crawling the gallery.
     *
     * The method returns immediately; all work happens in request callbacks.
     */
    start: function () {
        var self = this;
        request.get(this.U(this.getGirlUrl()), function (err, response, body) {
            if (err) {
                console.error('錯誤資訊:', err);
                return;
            }

            var $ = cheerio.load(body);
            // Gallery links are site-relative, so reuse the host that actually
            // served the profile page (it may differ after redirects).
            var hostname = url.parse(response.request.href).hostname;

            $('.igalleryli_link').each(function () {
                var href = $(this).attr('href');
                var idMatch = href ? href.match(/\/g\/(\d+)/) : null;
                if (!idMatch) {
                    // A link without a /g/<id> part cannot be mapped to a
                    // download directory; skip it instead of crashing.
                    console.error('skipping link without gallery id:', href);
                    return;
                }

                var downDir = path.join('' + self.girlID, idMatch[1]);
                var galleryUrl = url.format({
                    hostname: hostname,
                    pathname: href,
                    protocol: 'http',
                });

                // Create the download directory (if it does not exist yet).
                self.ensureDir(downDir);

                // Crawl the gallery.
                self.crawlGallery(galleryUrl, downDir);
                console.log('爬取影集執行完畢');
            });
        });

        console.log('個人全部影集執行完畢');
    },

    /**
     * Fetches one gallery page and queues a download task for each of its
     * images.
     *
     * The URL prefix of the images is taken from the first
     * `#hgallery > img` element; the number of images is read from the
     * leading digits of each `#dinfo > span` text.
     *
     * @param {string} galleryUrl - Absolute URL of the gallery page.
     * @param {string} downDir - Existing directory the images are saved to.
     */
    crawlGallery: function (galleryUrl, downDir) {
        var self = this;
        request.get(this.U(galleryUrl), function (err, response, body) {
            if (err) {
                // `response` is undefined when the request itself failed, so
                // report the URL that was requested rather than response.url.
                console.error('下載下傳錯誤:' + galleryUrl, err);
                process.exit(-1);
                return;
            }

            var $ = cheerio.load(body);
            // A cheerio selection is always truthy, so test the attribute
            // itself to find out whether the page really has an image.
            var src = $('#hgallery > img').first().attr('src');
            if (!src) {
                console.error('no gallery image found on ' + galleryUrl);
                return;
            }
            var preUrl = src.slice(0, src.lastIndexOf('/') + 1);

            // Queue every image of the gallery for asynchronous download.
            $('#dinfo > span').each(function () {
                var matched = $(this).text().match(/(\d+).*/);
                if (!matched) {
                    return;
                }
                // Number of images in this gallery.
                var count = parseInt(matched[1], 10);
                self.buildImageTasks(preUrl, count, downDir).forEach(function (task) {
                    self.queue.push(task);
                });
            });
        });
    },

    /**
     * Builds the download tasks for a gallery with `count` images.
     *
     * @param {string} preUrl - URL prefix of the images, ending in `/`.
     * @param {number} count - Number of images; a non-numeric or non-positive
     *     value yields an empty list.
     * @param {string} downDir - Directory the files are saved to.
     * @returns {Array<{url: string, file: string}>} One task per image, with
     *     file names produced by formatIndex (`0.jpg`, `001.jpg`, ...).
     */
    buildImageTasks: function (preUrl, count, downDir) {
        var tasks = [];
        for (var index = 0; index < count; ++index) {
            var jpgFile = this.formatIndex(index) + '.jpg';
            tasks.push({
                url: preUrl + jpgFile,
                file: path.join(downDir, jpgFile),
            });
        }
        return tasks;
    },

    /**
     * Synchronously creates `dir` and any missing parent directories.
     *
     * @param {string} dir - Relative or absolute directory path.
     */
    ensureDir: function (dir) {
        var current = '';
        path.normalize(dir).split(path.sep).forEach(function (segment, idx) {
            current = idx === 0 ? segment : current + path.sep + segment;
            // An empty prefix is the root of an absolute POSIX path; there is
            // nothing to create for it.
            if (current !== '' && !fs.existsSync(current)) {
                fs.mkdirSync(current);
            }
        });
    },

    /**
     * Converts an image index to the file-name stem used by the site:
     * index 0 stays `'0'`, every other index is left-padded with zeros to at
     * least three characters.
     *
     * @param {number} i - Zero-based image index.
     * @returns {string} File-name stem without extension.
     */
    formatIndex: function (i) {
        var si = i + '';
        if (i === 0) {
            return si;
        }

        while (si.length < 3) {
            si = '0' + si;
        }

        return si;
    },

    /**
     * @returns {string} URL of the profile page for this crawler's girlID.
     */
    getGirlUrl: function () {
        return util.format(this.girlUrlFmt, this.girlID);
    },

    /**
     * Wraps a URL in the options object passed to `request`, adding the
     * referer, keep-alive and browser User-Agent headers.
     *
     * @param {string} _url - URL to request.
     * @returns {{url: string, headers: Object}} Options for `request.get`.
     */
    U: function (_url) {
        return {
            url: _url,
            headers: {
                referer: 'http://www.baidu.com',
                connection: 'keep-alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            }
        };
    },

    /**
     * Streams `_url` into `filename` using `request`.
     *
     * A failed request terminates the process; a failed file write is logged.
     *
     * @param {string} _url - Image URL.
     * @param {string} filename - Destination file path.
     */
    download: function (_url, filename) {
        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
        var opt = this.U(_url);
        request.get(opt).on('error', function (err) {
            console.error('下載下傳錯誤:', err);
            process.exit(-1000);
        }).pipe(fs.createWriteStream(filename)).on('error', function (err) {
            // Without a handler a write failure would be an uncaught
            // 'error' event on the file stream.
            console.error('下載下傳錯誤:', err);
        }).on('close', function () {
            console.log('完成圖檔下載下傳:' + filename);
        });
    },

    /**
     * Downloads `jpgUrl` over plain HTTP with the core `http` module, buffers
     * the body in memory and writes it to `jpgDownFile`.
     *
     * `callback` is invoked exactly once: with no argument after the file has
     * been written, or with an Error when the URL cannot be parsed, the
     * request or response fails, the status is not 2xx, or the write fails.
     *
     * @param {string} jpgUrl - Absolute `http://` URL of the image.
     * @param {string} jpgDownFile - Destination file path.
     * @param {function(Error=)} [callback] - Completion callback.
     */
    download2: function (jpgUrl, jpgDownFile, callback) {
        var finished = false;

        // Single exit point so the queue worker is released exactly once.
        function finish(err) {
            if (finished) {
                return;
            }
            finished = true;

            if (err) {
                console.error('下載下傳錯誤:' + jpgUrl, err);
            } else {
                console.log('完成圖檔下載下傳:' + jpgDownFile);
            }

            if (callback) {
                if (err) {
                    callback(err);
                } else {
                    callback();
                }
            }
        }

        var jpgUrlP = url.parse(jpgUrl);
        if (!jpgUrlP.hostname) {
            // Without a hostname http.get would silently target localhost.
            // Report asynchronously so the callback timing is always the same.
            process.nextTick(finish, new Error('invalid image url: ' + jpgUrl));
            return;
        }

        http.get({
            // `hostname` excludes the port; the port comes from the URL and
            // only defaults to 80 when the URL does not name one.
            hostname: jpgUrlP.hostname,
            port: jpgUrlP.port || 80,
            headers: {
                referer: 'http://www.baidu.com',
            },
            path: jpgUrlP.path,
        }, function (res) {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Drain the response so the socket is released, and do not
                // save an error page under a .jpg name.
                res.resume();
                finish(new Error('unexpected HTTP status ' + res.statusCode));
                return;
            }

            var buffers = [];
            res.on('data', function (data) {
                buffers.push(data);
            });
            res.on('error', finish);
            res.on('end', function () {
                try {
                    fs.writeFileSync(jpgDownFile, Buffer.concat(buffers));
                } catch (writeErr) {
                    finish(writeErr);
                    return;
                }
                finish();
            });
        }).on('error', finish);
    }
};

// Script entry point: crawl every gallery linked from the profile with this id.
const girlID = 19705;
const crawl = new Crawl(girlID);
crawl.start();