aimer1124/JianshuSpider

View on GitHub
util/syncData.js

Summary

Maintainability
B
6 hrs
Test Coverage
var schedule = require('node-schedule');

var myPageHref = '/users/552f687b314b';
var moment = require('moment');
var today = moment(new Date()).format("YYYY-MM-DD");
var cheerio = require('cheerio');
var async = require('async');

var getURL = require('../proxy/getURL');
var articleProxy = require('../proxy/article');
var userProxy = require('../proxy/user');
var myInfoProxy = require('../proxy/myInfo');
var collectionsProxy = require('../proxy/collections');

function articleInfo() {

    var articleTitles = [];
    // console.log('获取数据开始');
    getURL.getPageContent('/', function (err, gres, next) {

        if (err) {
            console.log("访问首页失败");
        } else {
            var $ = cheerio.load(gres.text);
            $('.article-list li').each(function (idx, article) {
                var $article = $(article);
                articleTitles.push({
                    articleTitle: $article.find('.title a').text(),
                    author: $article.find('.author-name').text(),
                    authorHref: $article.find('.author-name').attr('href'),
                    articleHref: $article.find('.title a').attr('href')
                });
            });

            var conCurrencyCount = 0;
            var fetchUrl = function (article, callback) {
                var delay = parseInt((Math.random() * 10000000) % 2000, 10);
                conCurrencyCount++;
                // console.log('并发数:' + conCurrencyCount + ',访问的页面是:' + article.authorHref + ',控制的延迟:' + delay);
                getURL.getPageContent(article.authorHref, function (err, res) {
                    if (err) {
                        console.log('访问页面:' + article.authorHref + '失败');
                    } else {
                        var $ = cheerio.load(res.text);
                        var favorite = $('.clearfix').find('b').eq(4).text();
                        var follower = $('.clearfix').find('b').eq(1).text();


                        articleProxy.findByHref(article.articleHref, function (err, findArticle) {
                            if (findArticle.length == 0) {
                                articleProxy.saveArticle(article);
                            }
                        });

                        userProxy.getUserById(article.authorHref, function (err, findAuthor) {
                            if (findAuthor.length == 0) {
                                userProxy.saveUser(article, favorite, follower, function (err) {
                                    if (err) console.log(err);
                                });
                            } else {
                                userProxy.updateUser(article, favorite, follower, function (err) {
                                    if (err) console.log(err);
                                })
                            }
                        });
                    }
                });
                setTimeout(function () {
                    conCurrencyCount--;
                    callback(null, article + ' html content');
                }, delay);
            };

            async.mapLimit(articleTitles, 5, function (article, callback) {
                fetchUrl(article, callback);
            }, function (err) {
                if (err) return next(err);
                // console.log('获取数据结束');
            });
        }
    });
}

function getMyInfo() {
    myInfoProxy.getToday(today, function (err, result) {
        getURL.getPageContent(myPageHref, function (err, res) {
            if (err) {
                console.log('访问页面:' + myPageHref + '失败');
            } else {
                var $ = cheerio.load(res.text);
                var favorite = $('.clearfix').find('b').eq(4).text();
                var follower = $('.clearfix').find('b').eq(1).text();

                if (result.length == 0) {
                    myInfoProxy.saveInfo(today, favorite, follower, function (err) {
                        if (err) return next(err);
                    });
                } else {
                    myInfoProxy.updateInfo(today, favorite, follower, function (err) {
                        if (err) return next(err);
                    })
                }
            }
        });
    });
}

function getCollections() {
    var now = moment().format('x');
    var collectionsList = [];
    for (var i = 0; i <= 50; i++){
        collectionsList.push({
            url: "/collections?page=" + i + "&_=" + now
        })
    }

    var conCurrencyCount = 0;
    var fetchCollectionUrl = function (collectionUrl, callback) {
        var delay = parseInt((Math.random() * 10000000) % 2000, 10);
        conCurrencyCount++;

        getURL.getPageContent(collectionUrl.url, function (err, res) {
            if (err) {
                console.log('访问页面:' + collectionUrl.url + '失败');
            } else {
                var $ = cheerio.load(res.text);
                if ($('div').find('h1').text() == "您要找的页面不存在") {
                    console.log('页面不存在');
                    count = 50;
                } else {
                    $('#all-collections li .collections-info').each(function (idx, collectionEle) {
                        var href = $(collectionEle).find('h5 a').attr('href');
                        var articleCount = $(collectionEle).find('.blue-link').text();
                        var follower = getCollectionFollower($(collectionEle).find('p').last().text());
                        var collection = [];
                        collection.push({
                            id: href.split('/')[href.split('/').length - 1].toString(),
                            title: $(collectionEle).find('h5 a').text(),
                            articleCount: articleCount.split('篇')[0],
                            follower: follower,
                            description: $(collectionEle).find('.description').text()
                        });
                        collectionsProxy.saveAndUpdateCollections(collection[0],function (err) {
                            if (err)
                                console.log('保存失败'+ err);
                        });
                    });
                }
            }
        });
        setTimeout(function () {
            conCurrencyCount--;
            callback(null, collectionUrl + ' html content');
        }, delay);

    };



    async.mapLimit(collectionsList, 5, function (collectionUrl, callback) {
        fetchCollectionUrl(collectionUrl, callback);
    }, function (err) {
        if (err) return next(err);
    });

}


function getCollectionFollower(content) {
    var splitContent = content.split('·')[content.split('·').length - 1];
    var follower = splitContent.split('人')[0];
    if (follower.indexOf('K') > -1) {
        return follower.split('K')[0] * 1000;
    }else {
        return follower;
    }
}

function syncMyInfoAndArticle() {


    //every 5 Minutes
    schedule.scheduleJob("*/5 * * * *", function () {
        console.log('Sync myInfo and article...');
        articleInfo();
        getMyInfo();
    });

}

function syncCollections() {

    var rule = new schedule.RecurrenceRule();
    rule.minute = 17;
    //every 4 Hours
    schedule.scheduleJob(rule, function () {
        console.log('Sync collections...');

        getCollections();
    });
}

function syncData() {

    syncMyInfoAndArticle();

    syncCollections();
}

exports.syncData = syncData;