以https://fxhblog.top/为爬取对象,实现一个简单的网页抓取器
- 在node环境下安装superagent和cheerio
cnpm install superagent --save
cnpm install cheerio --save
// Fetch the index page; superagent delivers (err, res) to the callback.
superAgent.get(aimUrl).end(function (err, res) {
    if (err) {
        console.log("访问出错");
        // Bail out: on error `res` is undefined and cheerio.load(res.text)
        // would throw a TypeError (the original fell through here).
        return;
    }
    let $ = cheerio.load(res.text);
});
let $ = cheerio.load(res.text);// cheerio exposes a jQuery-like API over the parsed HTML
- 将获取到的response分析后,提取有用的信息,比如文章的URL等等
// Collect every article link on the index page into articleList.
let allLink = $('.post-type-normal .post-block link');
allLink.each(function (index, current) {
    let _this = $(current);
    const href = _this["0"].attribs.href; // raw href from the matched element
    let currentArticle = {};
    currentArticle.articleUrl = encodeURI(href);
    currentArticle.articleTitle = getTitleByUrl(href);
    currentArticle.No = index;
    articleList.push(currentArticle);
});
// Build the spider promises ONCE, after the full list is collected.
// (The original rebuilt them inside every iteration and discarded the
// result, since the `let` binding was scoped to the each-callback.)
let promiseSpiders = constructPromiseSpider(articleList);
/**
 * Derives an article's title slug from its URL: the last non-empty path
 * segment, e.g. "https://host/2020/01/my-post/" -> "my-post".
 * Works whether or not the URL ends with a slash (the original indexed
 * `length - 2`, which returned the wrong segment without one).
 * @param {string} _url - full article URL
 * @returns {string|undefined} title segment, or undefined for an empty URL
 */
const getTitleByUrl = (_url) => {
    const segments = _url.split("/").filter(function (s) { return s !== ""; });
    return segments[segments.length - 1];
};
- 将每一个文章URL单独包裹在一个promise中用于爬取
/**
 * Wraps each article fetch in its own Promise so the whole batch can be
 * awaited together via Promise.all.
 * @param {Array<{articleUrl: string, No: number}>} data - article descriptors
 * @returns {Promise[]} one pending fetch per article; each resolves with the
 *   superagent response tagged with `No` and `Url`, or rejects on fetch error
 */
const constructPromiseSpider = (data) => {
    return data.map(function (current) {
        return new Promise(function (resolve, reject) {
            superAgent.get(current.articleUrl).end(function (err, res) {
                if (err) {
                    console.log("爬取文章详情出错");
                    // The original fell through and crashed on `res.No`
                    // (`res` is undefined on error); reject instead so
                    // Promise.all can surface the failure.
                    reject(err);
                    return;
                }
                // Tag the response so the analysis pass can match it
                // back to its source article.
                res.No = current.No;
                res.Url = current.articleUrl;
                resolve(res);
            });
        });
    });
};
- 当所有页面爬取完成后,调用 Promise.all(promise对象数组) 对结果做第二次分析,提取有价值的信息,比如文章发表时间、文章字数、文章分类等
/**
 * Second-pass analysis: parses each fetched article page and collects
 * structured metadata (title, publish date, category, word count, read
 * count, body text) into articleDetailList.
 * @param {Array} resArray - superagent responses tagged with No/Url
 * @returns {Array} articleDetailList, one record per article
 */
const statisticArticle = (resArray) => {
    resArray.forEach(function (currentRes) {
        let $ = cheerio.load(currentRes.text);
        // Word-count/read-count text, queried ONCE (the original evaluated
        // this identical chain twice). After stripping newlines the tokens
        // are assumed to be " <count> ... <reads>" — TODO confirm against
        // the live theme's markup.
        const wordcountTokens = $(".post-header .post-meta .post-wordcount .post-meta-item-text").next().text().split("\n").join("").split(" ");
        const readToken = wordcountTokens[2];
        let currentArticle = {
            "文章描述信息": {}
        };
        currentArticle["编号"] = currentRes.No;
        currentArticle["文章地址"] = currentRes.Url;
        currentArticle["标题"] = $(".post-header .post-title").text();
        currentArticle["文章描述信息"]["发表时间"] = $(".post-header .post-meta time").text().split("\n").join("").trim();
        currentArticle["文章描述信息"]["文章分类"] = $(".post-header .post-meta span a").attr("href");
        currentArticle["文章描述信息"]["文章字数"] = wordcountTokens[1];
        // Guard: the original called .trim() unconditionally and threw a
        // TypeError whenever token 2 was missing from the page.
        currentArticle["文章描述信息"]["阅读次数"] = readToken === undefined ? undefined : readToken.trim();
        currentArticle["文章内容"] = $(".post-body").text();
        articleDetailList.push(currentArticle);
    });
    return articleDetailList;
};
- 将构造好的数据存入本地JSON文件中,完成网页爬取
/**
 * Keys each article record by its 编号 (number), serializes the resulting
 * map to JSON, and writes it to ./FxhBlog.json.
 * @param {Array} data - records produced by statisticArticle
 */
const createJSON = (data) => {
    let _data = {};
    data.forEach(function (current) {
        _data[`编号${current["编号"]}`] = current;
    });
    const dataToJson = JSON.stringify(_data);
    fs.writeFile("./FxhBlog.json", dataToJson, function (err) {
        if (err) {
            console.log("写入失败", err);
            // The original fell through and logged "Success!" even when
            // the write failed; stop here instead.
            return;
        }
        console.log("Success!");
    });
};