/*
使用request + cheerio來爬取zngirls網站上的資料
*/
const request = require('request');
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const url = require('url');
const util = require('util');
const path = require('path');
const process = require('process');
const events = require('events');
const EventEmitter = events.EventEmitter;
const async = require('async');
/**
 * Crawler for the photo galleries of a single profile on zngirls.com.
 *
 * Image downloads are funnelled through an `async.queue` so that at most two
 * downloads are in flight at any time.
 *
 * @param {number|string} girlID - ID of the profile page to crawl; it is
 *   substituted into `girlUrlFmt` and also used as the top-level download
 *   directory name.
 */
function Crawl(girlID) {
  this.girlID = girlID;
  this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';
  const self = this;
  // Worker: each task is {url, file}; concurrency is limited to 2.
  this.queue = async.queue(function (task, callback) {
    console.log('url:' + task.url + ' file:' + task.file);
    self.download2(task.url, task.file, callback);
  }, 2);
}

Crawl.prototype = {
  /**
   * Fetch the profile page and crawl every gallery linked from it.
   *
   * For each `.igalleryli_link` anchor whose href contains `/g/<id>`, a
   * download directory `<girlID>/<id>` is created and the gallery page is
   * crawled. Links without a usable href are skipped.
   *
   * Note: the two "執行完畢" log lines are emitted as soon as the requests have
   * been issued, not when the downloads have actually finished.
   */
  start: function () {
    const self = this;
    request.get(this.U(this.getGirlUrl()), function (err, response, body) {
      if (err) {
        console.error('錯誤資訊:', err);
        return;
      }
      const $ = cheerio.load(body);
      // Gallery hrefs are combined with the host that served the profile
      // page; it is the same for every link, so parse it only once.
      const hostname = url.parse(response.request.href).hostname;
      $('.igalleryli_link').each(function (index, element) {
        const href = $(element).attr('href');
        const idMatch = href ? href.match(/\/g\/(\d+)/) : null;
        if (!idMatch) {
          // Not a gallery link (missing href or no /g/<id> part): skip it
          // instead of throwing and aborting the whole crawl.
          return;
        }
        const downDir = path.join('' + self.girlID, idMatch[1]);
        const galleryUrl = url.format({
          hostname: hostname,
          pathname: href,
          protocol: 'http',
        });
        self.ensureDir(downDir);
        self.crawlGallery(galleryUrl, downDir);
        console.log('爬取影集執行完畢');
      });
    });
    console.log('個人全部影集執行完畢');
  },

  /**
   * Create `dir` and any missing parent directories (synchronously).
   *
   * @param {string} dir - Directory path using the platform separator.
   */
  ensureDir: function (dir) {
    let current = '';
    dir.split(path.sep).forEach(function (segment, index) {
      current = index === 0 ? segment : current + path.sep + segment;
      // An empty first segment means `dir` was absolute; nothing to create.
      if (current !== '' && !fs.existsSync(current)) {
        fs.mkdirSync(current);
      }
    });
  },

  /**
   * Fetch one gallery page and queue a download for each of its images.
   *
   * The URL prefix is taken from the first `#hgallery > img` element; the
   * image count is the first run of digits found in each `#dinfo > span`.
   * Image file names are produced by `formatIndex`.
   *
   * @param {string} galleryUrl - URL of the gallery page.
   * @param {string} downDir - Existing directory the images are saved into.
   */
  crawlGallery: function (galleryUrl, downDir) {
    const self = this;
    request.get(this.U(galleryUrl), function (err, response, body) {
      if (err) {
        // `response` is undefined when the request itself failed, so report
        // the URL that was requested rather than reading it off `response`.
        console.error('下載下傳錯誤:' + galleryUrl, err);
        process.exit(-1);
        return;
      }
      const $ = cheerio.load(body);
      const images = $('#hgallery > img');
      const src = images.length > 0 ? $(images[0]).attr('src') : undefined;
      if (!src) {
        // No preview image means there is no URL prefix to build from.
        return;
      }
      const preUrl = src.slice(0, src.lastIndexOf('/') + 1);
      $('#dinfo > span').each(function (index, element) {
        const matched = $(element).text().match(/(\d+).*/);
        if (!matched) {
          return;
        }
        const count = parseInt(matched[1], 10);
        for (let n = 0; n < count; ++n) {
          const jpgFile = self.formatIndex(n) + '.jpg';
          self.queue.push({
            url: preUrl + jpgFile,
            file: path.join(downDir, jpgFile),
          });
        }
      });
    });
  },

  /**
   * Turn an image index into the base file name used by this crawler.
   *
   * Index 0 is returned as '0'; every other index is left-padded with zeros
   * to at least three characters (7 -> '007', 123 -> '123').
   *
   * @param {number} i - Zero-based image index.
   * @returns {string} File name without extension.
   */
  formatIndex: function (i) {
    let text = String(i);
    if (i === 0) {
      return text;
    }
    while (text.length < 3) {
      text = '0' + text;
    }
    return text;
  },

  /**
   * @returns {string} Profile page URL built from `girlUrlFmt` and `girlID`.
   */
  getGirlUrl: function () {
    return util.format(this.girlUrlFmt, this.girlID);
  },

  /**
   * Build the options object passed to `request.get` for a page fetch.
   *
   * @param {string} _url - URL to fetch.
   * @returns {{url: string, headers: Object}} Options with fixed referer,
   *   keep-alive and browser User-Agent headers.
   */
  U: function (_url) {
    return {
      url: _url,
      headers: {
        referer: 'http://www.baidu.com',
        connection: 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
      }
    };
  },

  /**
   * Stream an image to disk using `request` (not routed through the queue).
   *
   * A request error terminates the process.
   *
   * @param {string} _url - Image URL.
   * @param {string} filename - Destination file path.
   */
  download: function (_url, filename) {
    console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
    const opt = this.U(_url);
    request.get(opt).on('error', function (err) {
      console.error('下載下傳錯誤:', err);
      process.exit(-1000);
    }).pipe(fs.createWriteStream(filename)).on('close', function () {
      console.log('完成圖檔下載下傳:' + filename);
    });
  },

  /**
   * Build the `http.get` options for an image URL.
   *
   * Uses the parsed hostname (without port), the URL's own port when it has
   * one (80 otherwise) and the origin-form path including any query string.
   *
   * @param {string} jpgUrl - Absolute http URL of the image.
   * @returns {{hostname: string, port: number, path: string, headers: Object}}
   */
  buildImageRequest: function (jpgUrl) {
    const parsed = url.parse(jpgUrl);
    return {
      hostname: parsed.hostname,
      port: parsed.port ? Number(parsed.port) : 80,
      path: parsed.path,
      headers: {
        referer: 'http://www.baidu.com',
      },
    };
  },

  /**
   * Download one image with `http.get`, buffer it, and write it to disk.
   *
   * `callback` is invoked exactly once: with no error after the file has
   * been written, or with an Error on a network failure, a non-200 status
   * or a write failure. Failures are logged and never thrown, so a queue
   * worker slot is always released.
   *
   * @param {string} jpgUrl - Absolute http URL of the image.
   * @param {string} jpgDownFile - Destination file path.
   * @param {function(?Error=)} [callback] - Completion callback.
   */
  download2: function (jpgUrl, jpgDownFile, callback) {
    let finished = false;
    // Single exit point so the callback can never fire twice.
    function finish(err) {
      if (finished) {
        return;
      }
      finished = true;
      if (err) {
        console.error('下載下傳錯誤:' + jpgUrl, err);
      } else {
        console.log('完成圖檔下載下傳:' + jpgDownFile);
      }
      if (callback) {
        callback(err);
      }
    }

    http.get(this.buildImageRequest(jpgUrl), function (res) {
      if (res.statusCode !== 200) {
        // Drain the response so the socket is released, and do not save an
        // error page under a .jpg name.
        res.resume();
        finish(new Error('Unexpected HTTP status ' + res.statusCode));
        return;
      }
      const buffers = [];
      res.on('data', function (chunk) {
        buffers.push(chunk);
      });
      res.on('error', finish);
      res.on('end', function () {
        fs.writeFile(jpgDownFile, Buffer.concat(buffers), finish);
      });
    }).on('error', finish);
  }
};
// Script entry point: crawl every gallery of one hard-coded profile ID.
const girlID = 19705;
const crawl = new Crawl(girlID);
crawl.start();