用 Node.js 寫了個爬取電影《心靈奇旅》熱評的功能。
// Scrape Douban short comments for the movie 心靈奇旅, including each commenter's profile URL and avatar image.
let request = require('request')
let fs = require('fs')
const path = require('path');
// Offset of the next comment page to request; also the base from which comment ids are numbered.
let startNum = 0;
/**
 * Sends a request for `url` and exposes the callback result as a Promise.
 *
 * @param {string} url - Address handed straight to `request`.
 * @returns {Promise<{response: *, body: *}>} Resolves with the response object
 *   and body passed to the callback; rejects with the callback's error.
 */
function reqData(url) {
  return new Promise((resolve, reject) => {
    request(url, (err, res, payload) => {
      // Guard clause: a transport-level error short-circuits the happy path.
      if (err) {
        reject(err);
        return;
      }
      resolve({ response: res, body: payload });
    });
  });
}
/**
 * Extracts comment records from the HTML of one comments page.
 *
 * The i-th `<span class="short">` match is paired with the i-th user-anchor
 * match, in document order; extraction stops as soon as either pattern runs
 * out of matches.
 *
 * @param {string} html - Raw page HTML.
 * @param {number} firstId - Ids are assigned sequentially starting at `firstId + 1`.
 * @returns {Array<{id: number, userName: string, userHome: string, userIMage: string, comment: string}>}
 */
function parseComments(html, firstId) {
  // Fresh regex literals per call: /g patterns keep state in `lastIndex`.
  const commentPattern = /<span class="short">(.*?)<\/span>/igs;
  // Captures: 1 = title attribute, 2 = href attribute, 3 = src of the nested <img>.
  const userPattern = /<a title="(.*?)" href="(.*?)".*?<img src="(.*?)".*?<\/a>/igs;
  const users = [];
  let id = firstId;
  let commentMatch;
  let userMatch;
  while ((commentMatch = commentPattern.exec(html)) && (userMatch = userPattern.exec(html))) {
    id += 1;
    users.push({
      id,
      userName: userMatch[1],
      userHome: userMatch[2],
      userIMage: userMatch[3],
      comment: commentMatch[1],
    });
  }
  return users;
}

/**
 * Fetches one comments page and parses it into comment records.
 *
 * Ids continue from the module-level `startNum` offset at the time of the call.
 *
 * @param {string} url - Comments page URL, passed to `reqData`.
 * @returns {Promise<Array<object>>} The parsed records (never empty).
 * @throws {Error} When the page yields no records; also propagates any
 *   rejection from `reqData`.
 */
async function req(url) {
  const { body } = await reqData(url);
  const users = parseComments(body, startNum);
  if (users.length === 0) {
    // Reject with a real Error (same message as before) so the failure carries a stack trace.
    throw new Error("爬取結果為空!");
  }
  return users;
}
/**
 * Crawls the comment pages (20 comments per page, 200 in total) and saves
 * them as a single JSON array.
 *
 * Pages are fetched sequentially because `req` numbers ids from the shared
 * `startNum` offset, which this function advances after each page.
 *
 * @returns {Promise<void>} Resolves after all pages were fetched and handed to
 *   `writeData`; rejects with the first error raised by `req`.
 */
async function scrapyComments() {
  const pageSize = 20;
  const pageCount = 10;
  const collected = [];
  try {
    while (startNum < pageSize * pageCount) {
      const goalUrl = `https://movie.douban.com/subject/24733428/comments?start=${startNum}&limit=${pageSize}&status=P&sort=new_score`;
      const users = await req(goalUrl);
      collected.push(...users);
      startNum += pageSize;
    }
  } finally {
    // Write once so the file holds one JSON array rather than one array per
    // page back to back; doing it in `finally` still saves the pages fetched
    // before a failure.
    if (collected.length > 0) {
      writeData("/心靈奇旅/comments.json", JSON.stringify(collected));
    }
  }
  console.log("爬取成功!");
}
// Start the crawl; report a failure and mark the exit code instead of leaving the promise floating.
scrapyComments().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Appends `data` to a file located relative to this script's directory,
 * creating any missing parent directories first.
 *
 * The work is done synchronously so the directory is guaranteed to exist
 * before the write and the data is on disk when the function returns.
 * Failures are logged with `console.error` rather than thrown.
 *
 * @param {string} fileName - Path joined onto `__dirname`.
 * @param {string} data - Content to append.
 * @returns {void}
 */
function writeData(fileName, data) {
  const filePath = path.join(__dirname, fileName);
  const dirPath = path.dirname(filePath);
  try {
    // `recursive: true` creates nested directories and is a no-op when the directory already exists.
    fs.mkdirSync(dirPath, { recursive: true });
    // 'a+' keeps the append semantics and creates the file when it is missing.
    fs.writeFileSync(filePath, data, { flag: 'a+' });
  } catch (err) {
    console.error(err);
  }
}
注:代碼小白,寫得不好,請多指教。