使用Node.js中的superagent+cheerio实现一个简单爬虫

以https://fxhblog.top/为爬取对象，实现一个简单的网页抓取器

在node环境下安装superagent和cheerio

cnpm install superagent --save

cnpm install cheerio --save

构造HTTP请求

// Request the list page and parse the returned HTML with cheerio.
superAgent.get(aimUrl).end(function (err, res) {
    if (err) {
        console.log("访问出错");
        // Stop here: after a failed request `res` may be undefined, so reading
        // res.text below would throw instead of reporting the real error.
        return;
    }
    let $ = cheerio.load(res.text);
});

let $ = cheerio.load(res.text);// cheerio provides a jQuery-like syntax for querying the loaded HTML

将获取到的response分析后，提取有用的信息，比如文章的URL等等

// Collect the URL, title, and position of every article linked from the list page.
let allLink = $('.post-type-normal .post-block link');
allLink.each(function (index, current) {
    // `current` is the matched element itself, so its href can be read directly.
    const href = current.attribs.href;
    let currentArticle = {};
    currentArticle.articleUrl = encodeURI(href);
    currentArticle.articleTitle = getTitleByUrl(href);
    currentArticle.No = index;
    articleList.push(currentArticle);
});
// Build the request promises once, after the list is complete. Doing this inside the
// loop would issue a new request for every article collected so far on each iteration.
let promiseSpiders = constructPromiseSpider(articleList);

/**
 * Derive an article's title from its URL.
 *
 * @param {string} _url - Article URL; it is split on "/".
 * @returns {string|undefined} The second-to-last "/"-separated segment, or
 *     undefined when the URL has fewer than two segments.
 */
const getTitleByUrl = (_url) => {
    // slice(-2, -1) isolates the second-to-last segment (empty array if absent).
    const [title] = _url.split("/").slice(-2, -1);
    return title;
};

将每一个文章URL单独包裹在一个promise中用于爬取

/**
 * Wrap the request for each article in its own Promise.
 *
 * @param {Array<{articleUrl: string, No: number}>} data - Articles to fetch.
 * @returns {Promise[]} One promise per article, in input order. Each resolves
 *     with the response object, tagged with `No` and `Url` from the article,
 *     or rejects with the request error.
 */
const constructPromiseSpider = (data) => {
    return data.map(function (current) {
        return new Promise(function (resolve, reject) {
            superAgent.get(current.articleUrl).end(function (err, res) {
                if (err) {
                    console.log("爬取文章详情出错");
                    // Settle the promise with the failure; continuing would
                    // dereference a possibly undefined `res` and hide the error.
                    reject(err);
                    return;
                }
                // Tag the response so later processing knows which article it belongs to.
                res.No = current.No;
                res.Url = current.articleUrl;
                resolve(res);
            });
        });
    });
};

当所有页面爬取完成后，调用Promise.all（传入一个promise对象数组）进行第二次数据分析，提取有价值的信息，比如文章发表时间、文章字数、文章分类等等

/**
 * Extract the details of each crawled article page and append them to
 * `articleDetailList`.
 *
 * @param {Array<{text: string, No: number, Url: string}>} resArray - Article
 *     responses, each carrying the page HTML in `text` plus the `No`/`Url` tags.
 * @returns {Array<Object>} `articleDetailList` after one entry per response has
 *     been pushed onto it.
 */
const statisticArticle = (resArray) => {
    // Run of spaces that separates the counters once newlines are removed.
    const counterSeparator = "                  ";
    resArray.forEach(function (currentRes) {
        let $ = cheerio.load(currentRes.text);
        // Query and split the counter text once; both counters are read from it.
        const counterParts = $(".post-header .post-meta .post-wordcount .post-meta-item-text")
            .next()
            .text()
            .split("\n")
            .join("")
            .split(counterSeparator);
        const readCount = counterParts[2];
        let currentArticle = {
            "文章描述信息": {}
        };
        currentArticle["编号"] = currentRes.No;
        currentArticle["文章地址"] = currentRes.Url;
        currentArticle["标题"] = $(".post-header .post-title").text();
        currentArticle["文章描述信息"]["发表时间"] = $(".post-header .post-meta time").text().split("\n").join("").trim();
        currentArticle["文章描述信息"]["文章分类"] = $(".post-header .post-meta span a").attr("href");
        currentArticle["文章描述信息"]["文章字数"] = counterParts[1];
        // A page without the read counter yields undefined here instead of
        // throwing, so one unusual page does not abort the whole batch.
        currentArticle["文章描述信息"]["阅读次数"] = readCount === undefined ? undefined : readCount.trim();
        currentArticle["文章内容"] = $(".post-body").text();
        articleDetailList.push(currentArticle);
    });
    return articleDetailList;
};

将构造好的数据存入本地JSON文件中，完成网页爬取

/**
 * Write the article details to ./FxhBlog.json, keyed by article number.
 *
 * @param {Array<Object>} data - Article detail objects; each is stored under
 *     the key "编号" followed by its own "编号" value.
 * @returns {void} The write is asynchronous; the outcome is reported via console.log.
 */
const createJSON = (data) => {
    let _data = {};
    data.forEach(function (current) {
        _data[`编号${current["编号"]}`] = current;
    });
    const dataToJson = JSON.stringify(_data);
    fs.writeFile("./FxhBlog.json", dataToJson, function (err) {
        if (err) {
            console.log("写入失败", err);
            // Do not fall through to the success message after a failed write.
            return;
        }
        console.log("Success!");
    });
};

以https://fxhblog.top/为爬取对象，实现一个简单的网页抓取器

发表回复 取消回复

发表回复取消回复