// wbio/reviews-collector-android
//
// View on GitHub
// src/index.js
//
// Summary
//
// Maintainability
// D
// 2 days
// Test Coverage
'use strict';

const Crawler = require('node-webcrawler');
const cheerio = require('cheerio');
const _ = require('lodash');
const EventEmitter = require('events').EventEmitter;
// Number of the first reviews page to request for every app (page numbers are 0-indexed)
const firstPage = 0;


class Collector {

    /**
     * Initialize a new instance of Collector
     * @param {string|string[]} apps - A single app ID, or an array of app IDs, to collect reviews for
     * @param {Object} [options] - Configuration options for the review collection
     * @param {number} [options.maxPages=5] - Maximum number of pages to collect per app (0 = no limit).
     *     Ignored when 'checkBeforeContinue' is set
     * @param {string} [options.userAgent] - The User-Agent header sent with every request
     * @param {number} [options.delay=5000] - Milliseconds to wait before each page request
     * @param {number} [options.maxRetries=3] - Number of consecutive failed attempts after which
     *     an app is abandoned
     * @param {boolean} [options.checkBeforeContinue] - When truthy, each 'page complete' event carries
     *     continue() and stop() callbacks and the collector waits for the listener to call one of them
     * @throws {Error} If 'apps' is neither a string nor an array, or if an array entry is not a string
     */
    constructor(apps, options) {
        if (options && options.maxPages && options.checkBeforeContinue) {
            console.error('Warning: The \'maxPages\' option will be ignored when \'checkBeforeContinue\' is present');
        }
        const defaults = {
            maxPages: 5,
            userAgent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
            delay: 5000,
            maxRetries: 3,
        };
        // User-supplied options override the defaults
        this.options = _.assign(defaults, options);
        // Per-app bookkeeping, keyed by app ID
        this.apps = {};
        if (_.isArray(apps)) {
            _.forEach(apps, (appId) => {
                if (typeof appId !== 'string') {
                    throw new Error('App IDs must be strings');
                }
                this.apps[appId] = {
                    appId: appId,
                    retries: 0,
                    pageNum: firstPage,
                };
            });
        } else if (_.isString(apps)) {
            // 'apps' is a single app ID string
            this.apps[apps] = {
                appId: apps,
                retries: 0,
                pageNum: firstPage,
            };
        } else {
            throw new Error('You must provide either a string or an array for the \'apps\' argument');
        }
        this.emitter = new EventEmitter();
    }

    /**
     * Collect reviews for the Collector's apps using the options provided in the constructor.
     *
     * Apps are processed one at a time, one page at a time. Progress is reported through events
     * (subscribe with #on): 'review' for each review, 'page complete' for each parsed page,
     * 'done collecting' when an app is finished (or abandoned) and 'done with apps' when no apps
     * are left. Every emitted object has an 'os' property set to 'Android'.
     */
    collect() {
        // Preserve our reference to 'this'
        const self = this;
        // Get a list of app IDs (apps are removed from this list as they are started)
        const appIds = _.keys(self.apps);
        // Keep track of what we're processing
        let currentApp;
        let currentPage;
        let nextStepDecided;    // Whether or not 'continue()' or 'stop()' has been called

        // Setup the Crawler instance
        const c = new Crawler({
            maxConnections: 1,
            userAgent: self.options.userAgent,
            followRedirect: true,
            followAllRedirects: true,
            callback: function processRequest(error, result) {
                if (error) {
                    console.error(`Could not complete the request: ${error}`);
                    requeue();
                } else {
                    parse(result);
                }
            },
        });

        // Queue the first app (the function declarations below are hoisted)
        processNextApp();

        /**
         * Collect reviews for the next app in the list (if one exists),
         * otherwise announce that every app has been handled
         */
        function processNextApp() {
            if (appIds.length > 0) {
                currentApp = appIds.shift();
                currentPage = firstPage;
                queuePage();
            } else {
                emit('done with apps');
            }
        }

        /**
         * Add the current page of the current app to the Crawler queue,
         * after waiting for the configured delay
         */
        function queuePage() {
            // Delay the request for the specified # of milliseconds
            setTimeout(() => {
                const url = `https://play.google.com/store/getreviews?id=${currentApp}&reviewSortOrder=0&reviewType=1&pageNum=${currentPage}`;
                const postData = {
                    xhr: '1',
                };
                // Add the url to the Crawler queue
                c.queue({
                    uri: url,
                    method: 'POST',
                    headers: {
                        'User-Agent': self.options.userAgent,
                        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                        'Content-Length': formToString(postData).length,
                    },
                    form: postData,
                });
            }, self.options.delay);
        }

        /**
         * Parse the response for a reviews page, emit the resulting events and decide what to do next
         * @param {Object} result - The response object handed to the Crawler callback
         */
        function parse(result) {
            const html = responseToHtml(result);
            if (typeof html === 'undefined') {
                // We got an invalid response
                requeue();
            } else if (html === null) {
                // There were no more reviews for this app
                emit('done collecting', {
                    appId: currentApp,
                    pageNum: currentPage,
                    appsRemaining: appIds.length,
                });
                // Move on to the next app; without this the remaining apps would never be
                // collected and 'done with apps' would never be emitted
                processNextApp();
            } else if (typeof html === 'string') {
                // We got a valid response, proceed
                const converted = htmlToReviews(html, currentApp, currentPage, emit);
                if (converted.error) {
                    console.error(`Could not turn response into reviews: ${converted.error}`);
                    requeue();
                } else {
                    const numReviewsFound = converted.reviews.length;
                    // Reset retries
                    self.apps[currentApp].retries = 0;
                    // Let our listener(s) know we finished a page
                    const objToEmit = {
                        appId: currentApp,
                        pageNum: currentPage,
                        reviews: converted.reviews,
                    };
                    // Set the firstReviewTime and lastReviewTime: the last review on the page is
                    // reported as the first in time (presumably pages are ordered newest first)
                    if (numReviewsFound > 0) {
                        objToEmit.firstReviewTime = converted.reviews[numReviewsFound - 1].date;
                        objToEmit.lastReviewTime = converted.reviews[0].date;
                    }
                    // Reset nextStepDecided
                    nextStepDecided = false;
                    if (self.options.checkBeforeContinue) {
                        // stop() should always call stopProcessingApp()
                        objToEmit.stop = stopProcessingApp;
                        // If we had reviews, user can continue, if not, calling continue should move to next app
                        if (numReviewsFound > 0) {
                            objToEmit.continue = continueProcessingApp;
                        } else {
                            objToEmit.continue = stopProcessingApp;
                        }
                    }
                    // Emit the object
                    emit('page complete', objToEmit);
                    // If we don't have to wait for the user to tell us to continue, we can do it ourselves
                    if (!self.options.checkBeforeContinue) {
                        if (numReviewsFound > 0 &&
                            (
                                self.options.maxPages === 0 ||
                                currentPage + 1 < self.options.maxPages + firstPage
                            )
                        ) {
                            continueProcessingApp();
                        } else {
                            stopProcessingApp();
                        }
                    }
                }
            }
        }

        /**
         * Requeue the current page if we aren't over the retries limit,
         * otherwise give up on the current app and move on to the next one
         */
        function requeue() {
            self.apps[currentApp].retries++;
            if (self.apps[currentApp].retries < self.options.maxRetries) {
                queuePage();
            } else {
                emit('done collecting', {
                    appId: currentApp,
                    pageNum: currentPage,
                    appsRemaining: appIds.length,
                    error: new Error('Retry limit reached'),
                });
                // Move on to the next app
                processNextApp();
            }
        }

        /**
         * Process the next page of the current app
         */
        function continueProcessingApp() {
            // Make sure that the user doesn't call both stop() and continue() for the same page
            if (!nextStepDecided) {
                // Set nextStepDecided to true
                nextStepDecided = true;
                // Increment currentPage and queue it
                currentPage++;
                queuePage();
            }
        }

        /**
         * Stop processing the current app and go on to the next app
         */
        function stopProcessingApp() {
            // Make sure that the user doesn't call both stop() and continue() for the same page
            if (!nextStepDecided) {
                // Set nextStepDecided to true
                nextStepDecided = true;
                // Emit the 'done collecting' event
                emit('done collecting', {
                    appId: currentApp,
                    pageNum: currentPage,
                    appsRemaining: appIds.length,
                });
                // Move on to the next app
                processNextApp();
            }
        }

        /**
         * Emit a message with the event emitter
         * @param {string} event - The event to emit
         * @param {Object} [obj] - The object to emit with the event (an empty object is used if omitted)
         */
        function emit(event, obj) {
            const toEmit = obj || {};
            // Add the OS to the emit message
            toEmit.os = 'Android';
            self.emitter.emit(event, toEmit);
        }
    }

    /**
     * Attach event handlers to the Collector's event emitter
     * @param {string} event - The name of the event to listen for
     * @param {Function} action - The function to be executed each time this event is emitted
     */
    on(event, action) {
        this.emitter.on(event, action);
    }

}
// The Collector class is this module's only export
module.exports = Collector;

/**
 * Convert HTML extracted from the reviews JSON object into an array of reviews.
 *
 * A 'review' event is emitted (through the supplied emit function) for every review, but only
 * once the whole page has been parsed successfully. A page that fails half-way is reported as
 * an error and may be retried by the caller, so emitting while parsing would deliver the same
 * reviews more than once.
 *
 * @param {string} html - The HTML extracted via #responseToHtml
 * @param {string} appId - The app ID of the app that the given HTML is from
 * @param {number} pageNum - The number of the page that the given HTML is from
 * @param {Function} emit - The collector's emit() function, called as emit(event, obj)
 * @return {{reviews: Object[]}|{error: Error}} The parsed reviews (each with id, date, rating,
 *     title and text), or the error that prevented the page from being parsed
 */
function htmlToReviews(html, appId, pageNum, emit) {
    // The rating is only available as the percentage width of the rating bar
    const widthRegex = /width: ([0-9]{2,3})%/;
    try {
        const $ = cheerio.load(html);
        const reviewObjs = $('.single-review');
        const reviews = [];
        // Parse all of the reviews first
        _.forEach(reviewObjs, (reviewObj) => {
            const reviewInfo = $(reviewObj).find('.review-info');
            const reviewBody = $(reviewObj).children('.review-body');
            // Review ID
            const id = $(reviewObj).children('.review-header').attr('data-reviewid');
            // Review Date
            const dateStr = $(reviewInfo).children('.review-date').text();
            // Review Rating
            const ratingStr = $(reviewInfo).find('.current-rating').attr('style');
            const ratingMatch = widthRegex.exec(ratingStr);
            if (ratingMatch === null) {
                throw new Error(`Could not parse the rating of review '${id}'`);
            }
            // Add it to our reviews array
            reviews.push({
                id: id,
                date: new Date(dateStr),
                // Each 20% of width corresponds to one rating point (100% -> 5)
                rating: Number(ratingMatch[1]) / 20,
                // Review Title
                title: $(reviewBody).children('.review-title').text().trim(),
                // Review Text: the body's own text with all child elements stripped out
                text: $(reviewBody)
                    .clone()
                    .children()
                    .remove()
                    .end()
                    .text()
                    .trim(),
            });
        });
        // The whole page parsed cleanly, so now we can let our listener(s) know
        reviews.forEach((review) => {
            emit('review', {
                appId: appId,
                pageNum: pageNum,
                review: review,
            });
        });
        // Return our reviews
        return { reviews: reviews };
    } catch (err) {
        return { error: err };
    }
}

/**
 * Extract the reviews HTML from the HTTP request's response.
 *
 * The body is expected to be JSON (possibly preceded by extraneous characters) whose first
 * element is a 4-element array; the reviews HTML is the third item of that array.
 *
 * @param {Object} response - The response returned from the HTTP request; its 'headers' and
 *     'body' properties are read
 * @return {string|null|undefined} String if response was valid, null if no reviews, undefined if invalid response
 */
function responseToHtml(response) {
    const headers = (response && response.headers) || {};
    // Media types are case-insensitive and may carry optional parameters (e.g. the charset) with
    // arbitrary spacing, so only the type itself is compared rather than the whole header value
    const mediaType = String(headers['content-type'] || '')
        .split(';')[0]
        .trim()
        .toLowerCase();
    if (mediaType !== 'application/json') {
        console.error('Unexpected response - was not in JSON format');
        return undefined;
    }
    try {
        const decoded = decodeUnicode(decodeUTF8(response.body));
        const body = JSON.parse(removeLeadingChars(decoded));
        if (_.isArray(body) && body.length > 0) {
            const arr = body[0];
            if (_.isArray(arr) && arr.length === 4) {
                return arr[2];
            }
            // Any other shape is treated as the end of the app's reviews
            console.log('No more reviews for this app');
            return null;
        }
        console.error('Unexpected response - JSON was not in the format we expected');
        return undefined;
    } catch (err) {
        // Covers both an unparsable body and a body that could not be decoded at all
        console.error('Unexpected response - JSON was invalid');
        return undefined;
    }
}

/**
 * Strip everything that precedes the first '[' of the response, so that the remainder
 * starts at the opening bracket. A string without any '[' is returned unchanged.
 * @param {string} str - The raw response string
 * @return {string} The response starting at its first '['
 */
function removeLeadingChars(str) {
    // indexOf yields -1 when there is no bracket; clamp it so the whole string is kept
    const begin = Math.max(0, str.indexOf('['));
    return str.substring(begin);
}

/**
 * Serialize a form object as a URL-encoded string ("key=value" pairs joined with '&').
 * Keys and values are both passed through encodeURIComponent.
 * @param {Object} form - The object whose own enumerable properties are serialized
 * @return {string} The URL-encoded string (empty when the form has no properties)
 */
function formToString(form) {
    return Object.keys(form)
        .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(form[key])}`)
        .join('&');
}

/**
 * Helper function to replace literal \uXXXX escape sequences with the characters they denote
 * @param {string} str - The string to be decoded
 * @return {string|undefined} The decoded string, or undefined when the input is empty or missing
 */
function decodeUnicode(str) {
    if (!str) {
        return undefined;
    }
    // Only exactly four hex digits form an escape. Accepting any word character would turn
    // text such as "\users" into a NUL character, because parseInt yields NaN for it
    const patt = /\\u([0-9a-f]{4})/gi;
    return str.replace(patt, (match, grp) => String.fromCharCode(parseInt(grp, 16)));
}

/**
 * Helper function that attempts to re-read a string as UTF-8 by percent-escaping it with
 * escape() and decoding the result with decodeURIComponent()
 * @param {string} str - The string to be decoded
 * @return {string} The decoded string, or the input as given when it cannot be decoded
 */
function decodeUTF8(str) {
    let decoded;
    try {
        decoded = decodeURIComponent(escape(str));
    } catch (err) {
        // The escaped form was not valid for decodeURIComponent; fall back to the input
        decoded = str;
    }
    return decoded;
}