fossasia/loklak_webclient

View on GitHub
iframely/lib/utils.js

Summary

Maintainability
F
6 days
Test Coverage
var events = require('events');
var request = require('request');
var zlib = require('zlib');
var iconv = require('iconv-lite');
var async = require('async');
var imagesize = require('imagesize');
var moment = require('moment');
var _ = require('underscore');

var cache = require('./cache');
var htmlUtils = require('./html-utils');

// Load the application configuration once and publish it as the global `CONFIG`
// that the rest of this module reads as a bare identifier.
// `global` is used rather than the `GLOBAL` alias: the alias was deprecated by
// Node.js (DEP0016) and later removed, while `global` works on every version.
if (!global.CONFIG) {
    global.CONFIG = require('../config');
}

/**
 * @public
 * Apply the matching CONFIG.PROXY rule, if any, to a set of `request` options.
 *
 * Every CONFIG.PROXY entry is read as `{re: [patterns], proxy_server, user_agent}`.
 * The first entry with a pattern matching the target url is used:
 *  - `proxy_server` is written to `request_options.proxy`;
 *  - `user_agent` replaces the 'User-Agent' header, but only when `request_options.headers` exists.
 *
 * @param {Object} request_options Options for `request`. The target is read from `uri` or `url`
 *  and may be a string or a parsed url object. The object is modified in place.
 * @returns {Object} The same `request_options` object.
 */
var prepareRequestOptions = exports.prepareRequestOptions = function(request_options) {

    if (!CONFIG.PROXY) {
        return request_options;
    }

    var url = request_options.uri || request_options.url;

    // The target may be a parsed url object; the patterns need its string form.
    if (url && typeof url !== 'string') {
        url = url.href || String(url);
    }

    // Nothing to match against: leave the options untouched instead of throwing.
    if (!url) {
        return request_options;
    }

    // TODO: make 'for'.
    var proxy = _.find(CONFIG.PROXY, function (p) {
        return _.find(p.re, function (re) {
            return url.match(re);
        });
    });

    if (proxy) {
        if (proxy.proxy_server) {
            request_options.proxy = proxy.proxy_server;
        }
        if (proxy.user_agent && request_options.headers) {
            request_options.headers['User-Agent'] = proxy.user_agent;
        }
    }

    return request_options;
};

/**
 * @public
 * Do HTTP GET request and handle redirects.
 * The request is started on the next tick; listeners attached synchronously after this
 * function returns are registered by then.
 * @param url Request uri (parsed object or string)
 * @param {Object} [options]
 * @param {Number} [options.maxRedirects] Passed to `request`. Default: 3.
 * @param {Number} [options.timeout] Passed to `request`. Default: CONFIG.RESPONSE_TIMEOUT.
 * @param {Boolean} [options.followRedirect] Passed to `request` unchanged.
 * @param {Object} [options.jar] Cookie jar to use. Default: a new `request.jar()`.
 * @param {Boolean} [options.asBuffer] If not set, the emitted response stream gets "binary" encoding.
 * @returns {events.EventEmitter} The emitter object which emits:
 *  - 'request' with the value returned by `request(...)`;
 *  - 'response' with the response, or with an unzip stream when the response
 *    content-encoding is gzip or deflate;
 *  - 'error' with the error.
 */
var getUrl = exports.getUrl = function(url, options) {

    var req = new events.EventEmitter();

    var options = options || {};

    // Store cookies between redirects and requests.
    var jar = options.jar;
    if (!jar) {
        jar = request.jar();
    }

    process.nextTick(function() {
        try {

            // Compressed responses are requested on every Node version except v0.8.x.
            var supportGzip = !process.version.match(/^v0\.8/);

            var r = request(prepareRequestOptions({
                uri: url,
                method: 'GET',
                headers: {
                    'User-Agent': CONFIG.USER_AGENT,
                    'Connection': 'close',
                    'Accept-Encoding': supportGzip ? 'gzip' : ''
                },
                maxRedirects: options.maxRedirects || 3,
                timeout: options.timeout || CONFIG.RESPONSE_TIMEOUT,
                followRedirect: options.followRedirect,
                jar: jar
            }))
                .on('error', function(error) {
                    req.emit('error', error);
                })
                .on('response', function(res) {

                    if (supportGzip && ['gzip', 'deflate'].indexOf(res.headers['content-encoding']) > -1) {

                        // Hand out the unzip stream instead of the raw response and copy
                        // the response's request, status code and headers onto it.
                        var gunzip = zlib.createUnzip();
                        gunzip.request = res.request;
                        gunzip.statusCode = res.statusCode;
                        gunzip.headers = res.headers;

                        if (!options.asBuffer) {
                            gunzip.setEncoding("binary");
                        }

                        // The stream is emitted before piping starts.
                        req.emit('response', gunzip);

                        res.pipe(gunzip);

                    } else {

                        if (!options.asBuffer) {
                            res.setEncoding("binary");
                        }

                        req.emit('response', res);
                    }
                });

            req.emit('request', r);

        } catch (ex) {
            // Errors thrown synchronously while building the request are logged and reported as 'error'.
            console.error('Error on getUrl for', url, '.\n Error:' + ex);
            req.emit('error', ex);
        }
    });
    return req;
};

/**
 * Do HTTP HEAD request and handle redirects.
 * The request is started on the next tick; listeners attached synchronously after this
 * function returns are registered by then.
 * @param url Request uri (parsed object or string)
 * @param {Object} [options]
 * @param {Number} [options.maxRedirects] Passed to `request`. Default: 3.
 * @param {Number} [options.timeout] Passed to `request`. Default: CONFIG.RESPONSE_TIMEOUT.
 * @param {Boolean} [options.followRedirect] Passed to `request` unchanged.
 * @param {Object} [options.jar] Cookie jar to use. Default: a new `request.jar()`.
 * @returns {events.EventEmitter} The emitter object which emits 'request' (with the value
 *  returned by `request(...)`), 'response' (with the response) or 'error'.
 */
var getHead = function(url, options) {

    var req = new events.EventEmitter();

    options = options || {};

    // Store cookies between redirects and requests.
    var jar = options.jar || request.jar();

    process.nextTick(function() {
        try {

            // Options go through prepareRequestOptions so that CONFIG.PROXY rules
            // apply to HEAD requests the same way they apply to GET requests.
            var r = request(prepareRequestOptions({
                uri: url,
                method: 'HEAD',
                headers: {
                    'User-Agent': CONFIG.USER_AGENT,
                    'Connection': 'close'
                },
                maxRedirects: options.maxRedirects || 3,
                timeout: options.timeout || CONFIG.RESPONSE_TIMEOUT,
                followRedirect: options.followRedirect,
                jar: jar
            }))
                .on('error', function(error) {
                    req.emit('error', error);
                })
                .on('response', function(res) {
                    req.emit('response', res);
                });

            req.emit('request', r);

        } catch (ex) {
            // Errors thrown synchronously while building the request are logged and reported as 'error'.
            console.error('Error on getHead for', url, '.\n Error:' + ex);
            req.emit('error', ex);
        }
    });
    return req;
};

exports.getCharset = function(string, doNotParse) {
    var charset;

    if (doNotParse) {
        charset = string.toUpperCase();
    } else if (string) {
        var m = string && string.match(/charset\s*=\s*([\w_-]+)/i);
        charset = m && m[1].toUpperCase();
    }

    return charset;
};

exports.encodeText = function(charset, text) {
    try {
        var b = iconv.encode(text, "ISO8859-1");
        return iconv.decode(b, charset || "UTF-8");
    } catch(e) {
        return text;
    }
};

/**
 * @public
 * Get image size and type.
 * @param {String} uri Image uri.
 * @param {Object} [options] Options.
 * @param {Boolean} [options.cache] False to disable cache. Default: true.
 * @param {Function} callback Completion callback function. The callback gets two arguments (error, result) where result has:
 *  - result.format
 *  - result.width
 *  - result.height
 *
 *  error == 404 if not found.
 * */
exports.getImageMetadata = function(uri, options, callback){

    if (typeof options === 'function') {
        callback = options;
        options = {};
    }

    options = options || {};

    cache.withCache("image-meta:" + uri, function(callback) {

        var loadImageHead, imageResponseStarted, totalTime, timeout, contentLength;
        var requestInstance = null;

        function finish(error, data) {

            if (timeout) {
                clearTimeout(timeout);
                timeout = null;
            } else {
                return;
            }

            // We don't need more data. Abort causes error. timeout === null here so error will be skipped.
            requestInstance && requestInstance.abort();

            if (!error && !data) {
                error = 404;
            }

            data = data || {};

            if (options.debug) {
                data._time = {
                    imageResponseStarted: imageResponseStarted || totalTime(),
                    loadImageHead: loadImageHead && loadImageHead() || 0,
                    total: totalTime()
                };
            }

            if (error && error.message) {
                error = error.message;
            }

            if ((typeof error === 'string' && error.indexOf('ENOTFOUND') > -1) ||
                error === 500) {
                error = 404;
            }

            if (error) {
                data.error = error;
            }

            callback(null, data);
        }

        timeout = setTimeout(function() {
            finish("timeout");
        }, options.timeout || CONFIG.RESPONSE_TIMEOUT);

        if (options.debug) {
            totalTime = createTimer();
        }

        async.waterfall([
            function(cb){

                getUrl(uri, {
                    timeout: options.timeout || CONFIG.RESPONSE_TIMEOUT,
                    maxRedirects: 5,
                    asBuffer: true
                })
                    .on('request', function(req) {
                        requestInstance = req;
                    })
                    .on('response', function(res) {

                        var content_type = res.headers['content-type'];

                        if (content_type && content_type !== 'application/octet-stream' && content_type !== 'binary/octet-stream') {

                            if (content_type.indexOf('image') === -1) {
                                cb('invalid content type: ' + res.headers['content-type']);
                            }

                        } else {

                            if (!uri.match(/\.(jpg|png|gif)(\?.*)?$/i)) {
                                cb('invalid content type: no content-type header and file extension');
                            }
                        }

                        if (res.statusCode == 200) {
                            if (options.debug) {
                                imageResponseStarted = totalTime();
                            }
                            contentLength = parseInt(res.headers['content-length'] || '0', 10);
                            imagesize(res, cb);
                        } else {
                            cb(res.statusCode);
                        }
                    })
                    .on('error', function(error) {
                        cb(error);
                    });
            },
            function(data, cb){

                if (options.debug) {
                    loadImageHead = createTimer();
                }

                if (contentLength) {
                    data.content_length = contentLength;
                }

                cb(null, data);
            }
        ], finish);

    }, {
        disableCache: options.disableCache,
        ttl: CONFIG.IMAGE_META_CACHE_TTL || CONFIG.CACHE_TTL
    }, callback);
};

exports.getUriStatus = function(uri, options, callback) {

    if (typeof options === 'function') {
        callback = options;
        options = {};
    }

    options = options || {};

    cache.withCache("status:" + uri, function(cb) {

        var time, timeout;

        function finish(error, data) {

            if (timeout) {
                clearTimeout(timeout);
                timeout = null;
            } else {
                return;
            }

            data = data || {};

            if (error) {
                data.error = error;
            }

            if (options.debug) {
                data._time = time();
            }

            cb(null, data);
        }

        timeout = setTimeout(function() {
            finish("timeout");
        }, options.timeout || CONFIG.RESPONSE_TIMEOUT);

        if (options.debug) {
            time = createTimer();
        }

        getUriStatus(uri, options, finish);

    }, {
        disableCache: options.disableCache,
        ttl: CONFIG.IMAGE_META_CACHE_TTL || CONFIG.CACHE_TTL
    }, callback);
};

exports.getContentType = function(uriForCache, uriOriginal, options, cb) {

    cache.withCache("content-type:" + uriForCache, function(cb) {

        var timeout, requestInstance, totalTime;

        function finish(error, content_type) {

            if (timeout) {
                clearTimeout(timeout);
                timeout = null;
            } else {
                return;
            }

            // We don't need more data. Abort causes error. timeout === null here so error will be skipped.
            requestInstance && requestInstance.abort();

            var data = {};

            if (options.debug) {
                data._time = totalTime();
            }

            if (error) {
                data.error = error;
            }

            if (!error && !data) {
                data.error = 404;
            }

            data.type = content_type;

            cb(null, data);
        }

        timeout = setTimeout(function() {
            finish("timeout");
        }, options.timeout || CONFIG.RESPONSE_TIMEOUT);

        if (options.debug) {
            totalTime = createTimer();
        }

        getHead(uriOriginal, {
            timeout: options.timeout || CONFIG.RESPONSE_TIMEOUT,
            maxRedirects: 5
        })
            .on('request', function(req) {
                requestInstance = req;
            })
            .on('response', function(res) {

                var content_type = res.headers['content-type'];

                if (content_type) {
                    content_type = content_type.split(';')[0];
                }

                finish(null, content_type);
            })
            .on('error', function(error) {
                finish(error);
            });

    }, {disableCache: options.disableCache}, cb);
};

var NOW = new Date().getTime();

exports.unifyDate = function(date) {

    if (typeof date === "string" && date.match(/^\d+$/)) {
        date = parseInt(date);
    }

    if (typeof date === "number") {

        if (date === 0) {
            return null;
        }

        // Check if date in seconds, not miliseconds.
        if (NOW / date > 100) {
            date = date * 1000;
        }

        return new Date(date).toISOString();
    }

    // TODO: time in format 'Mon, 29 October 2012 18:15:00' parsed as local timezone anyway.
    var timestamp = Date.parse(date);

    if (isNaN(timestamp)) {
        // allow Z at the end which Node doesn't seem to parse into timestamp otherwise        
        timestamp = Date.parse(date.replace(/Z$/, ''));
    }

    if (!isNaN(timestamp)) {
        return new Date(timestamp).toISOString();
    }

    // Bad date to parse.
    return null;
};


/**
 * @public
 * Rename all keys of an object to lower case, in place, descending into every value
 * of type "object" (nested objects and arrays).
 * If a lower-cased key already exists, its value is overwritten.
 * @param {Object} obj Object to modify.
 */
var lowerCaseKeys = exports.lowerCaseKeys = function(obj) {
    for (var key in obj) {
        var lowered = key.toLowerCase();
        var value = obj[key];

        if (lowered != key) {
            // Move the value under the lower-cased name.
            obj[lowered] = value;
            delete obj[key];
        }

        if (typeof value == "object") {
            lowerCaseKeys(value);
        }
    }
};

exports.sendLogToWhitelist = function(uri, meta, oembed, whitelistRecord) {

    if (!CONFIG.WHITELIST_LOG_URL) {
        return
    }

    var data = getWhitelistLogData(meta, oembed);

    if (data) {

        if (whitelistRecord && !whitelistRecord.isDefault) {
            // Check if all detected rels present in whitelist record.
            var hasNew = false;
            for(var key in data) {
                var bits = key.split('_');
                var source = bits[0];
                var rel = bits[1];
                // Skip existing source-rel.
                if (!whitelistRecord[source] || !whitelistRecord[source][rel]) {
                    hasNew = true
                }
            }
            if (!hasNew) {
                return hasNew;
            }
        }

        data.uri = uri;

        request({
            uri: CONFIG.WHITELIST_LOG_URL,
            method: 'GET',
            qs: data
        })
            .on('error', function(error) {
                console.error('Error logging url:', uri, error);
            })
            .on('response', function(res) {
                if (res.statusCode !== 200) {
                    console.error('Error logging url:', uri, res.statusCode);
                }
            });
    }
};

exports.filterLinks = function(data, options) {

    var links = data.links;

    for(var i = 0; i < links.length;) {

        var link = links[i];

        // SSL.

        var isImage = link.type.indexOf('image') === 0;
        var isHTML5Video = link.type.indexOf('video/') === 0;

        if (options.filterNonSSL) {

            var sslProtocol = link.href && link.href.match(/^(https:)?\/\//i);
            var hasSSL = link.rel.indexOf('ssl') > -1;

            if (sslProtocol || hasSSL || isImage || isHTML5Video) {
                // Good: has ssl.
            } else {
                // Filter non ssl if required.
                link.error = true;
            }
        }

        // HTML5.

        if (options.filterNonHTML5) {
            var hasHTML5 = link.rel.indexOf('html5') > -1;
            var isReader = link.rel.indexOf('reader') > -1;

            if (hasHTML5 || isImage || isHTML5Video || isReader) {
                // Good: is HTML5.
            } else {
                // Filter non HTML5 if required.
                link.error = true;
            }
        }

        // Max-width.

        if (options.maxWidth) {

            var isImage = link.type.indexOf('image') === 0;
            // TODO: force make html5 video responsive?
            var isHTML5Video = link.type.indexOf('video/') === 0;

            var m = link.media;

            if (m && !isImage && !isHTML5Video) {
                if (m.width && m.width > options.maxWidth) {
                    link.error = true;
                } else if (m['min-width'] && m['min-width'] > options.maxWidth) {
                    link.error = true;
                }
            }
        }

        if (link.error) {
            links.splice(i, 1);
        } else {
            i++;
        }
    }
};

/**
 * Call `func` (via `Array.prototype.forEach`) for every link of a flat or grouped link collection.
 * @param {Array|Object} links Array of links, or an object whose array values are groups of
 *  links; non-array values of the object are skipped. Anything else is ignored.
 * @param {Function} func Iterator, called the way `forEach` calls its callback.
 */
function iterateLinks(links, func) {

    if (links instanceof Array) {
        return links.forEach(func);
    }

    if (typeof links !== 'object') {
        return;
    }

    for (var groupId in links) {
        var group = links[groupId];
        if (group instanceof Array) {
            group.forEach(func);
        }
    }
}

exports.generateLinksHtml = function(data, options) {

    // Links may be grouped.

    var links = data.links;

    iterateLinks(links, function(link) {
        if (!link.html && !link.type.match(/^image/)) {

            // Force make mp4 video to be autoplay in autoplayMode.
            if (options.autoplayMode && link.type.indexOf('video/') === 0 && link.rel.indexOf('autoplay') === -1) {
                link.rel.push('autoplay');
            }

            var html = htmlUtils.generateLinkElementHtml(link, {
                iframelyData: data
            });
            if (html) {
                link.html = html;
            }
        }
    });

    if (!data.html) {

        var links_list = [];
        iterateLinks(links, function(link) {
            links_list.push(link);
        });
        var plain_data = _.extend({}, data, {links:links_list});

        // Prevent override main html field.
        var mainLink = htmlUtils.findMainLink(plain_data, options);

        if (mainLink) {

            var html = mainLink.html;

            if (!html && mainLink.type.match(/^image/)) {
                html = htmlUtils.generateLinkElementHtml(mainLink, {
                    iframelyData: data
                });
            }

            if (html) {
                data.rel = mainLink.rel;
                data.html = html;
            }
        }
    }

    data.rel = data.rel || [];
};

//====================================================================================
// Private
//====================================================================================

/**
 * Request a uri with GET and report its status as soon as the response headers arrive;
 * the request is aborted at that point, before the callback is invoked.
 * @param uri Uri to request.
 * @param {Object} options Options.
 * @param {Number} [options.timeout] Passed to `request`. Default: CONFIG.RESPONSE_TIMEOUT.
 * @param {Function} cb Callback function (error, result) where result has:
 *  - result.code: response status code
 *  - result.content_type: Content-Type header
 *  - result.content_length: Content-Length header parsed as a number, or null if absent
 */
var getUriStatus = function(uri, options, cb) {

    var requestOptions = {
        uri: uri,
        method: 'GET',
        headers: {
            'User-Agent': CONFIG.USER_AGENT
        },
        maxRedirects: 5,
        timeout: options.timeout || CONFIG.RESPONSE_TIMEOUT,
        jar: request.jar()   //Enable cookies, uses new jar
    };

    function onResponse(res) {

        // Only the headers are needed.
        r.abort();

        var lengthHeader = res.headers['content-length'];

        cb(null, {
            code: res.statusCode,
            content_type: res.headers['content-type'],
            content_length: lengthHeader ? parseInt(lengthHeader, 10) : null
        });
    }

    var r = request(requestOptions)
        .on('error', cb)
        .on('response', onResponse);
};

/**
 * @public
 * Start a timer.
 * @returns {Function} Function which returns the number of milliseconds passed since
 *  the timer was created; it may be called any number of times.
 */
var createTimer = exports.createTimer = function() {

    var startedAt = Date.now();

    return function() {
        return Date.now() - startedAt;
    };
};

// Substrings looked for in the href of an oEmbed discovery link to flag it as a Shopify link.
var SHOPIFY_OEMBED_URLS = ['shopify.com', '/collections/', '/products/'];

/**
 * Check whether any `og:video` of the meta points to YouTube, i.e. has "youtube" in the
 * video itself (when it is a plain string), in its `url` or in its `secure_url`.
 * @param {Object} meta Page meta; `meta.og.video` may be a single video or an array of videos.
 * @returns {Boolean} True if a YouTube video is found.
 */
function isYoutube(meta) {

    var videos = meta.og && meta.og.video;

    if (!videos) {
        return false;
    }

    if (!(videos instanceof Array)) {
        videos = [videos];
    }

    // Values without an indexOf method (e.g. plain objects) never match.
    function mentionsYoutube(value) {
        return !!(value && value.indexOf) && value.indexOf('youtube') > -1;
    }

    for (var idx = 0; idx < videos.length; idx++) {

        var entry = videos[idx];

        if (mentionsYoutube(entry.url || entry) || mentionsYoutube(entry.secure_url)) {
            return true;
        }
    }

    return false;
}

/**
 * Collect the embed sources detected in page meta and oEmbed data as "<source>_<rel>" flags.
 * @param {Object} [meta] Page meta (twitter, og, generator, alternate, video_src, sm4).
 * @param {Object} [oembed] oEmbed data; adds the "oembed_<type>" flag unless the type is
 *  "link", or "rich" on a page whose generator mentions WordPress.
 * @returns {Object|Boolean} Object holding only the detected flags (each `true`),
 *  or `false` if nothing was detected.
 */
function getWhitelistLogData(meta, oembed) {

    var flags = {};
    var isWordpress;

    if (meta) {

        var twitter = meta.twitter;
        var og = meta.og;

        var isJetpack = twitter && twitter.card === 'jetpack';
        isWordpress = meta.generator && /wordpress/i.test(meta.generator);

        // NOTE(review): `isShopify` is computed here but not used anywhere in this function.
        var isShopify = false;
        if (meta.alternate) {

            var alternates = meta.alternate instanceof Array ? meta.alternate : [meta.alternate];

            // First alternate link which declares an oembed type and has a href.
            var oembedLink;
            for (var a = 0; a < alternates.length; a++) {
                var candidate = alternates[a];
                if (candidate.type && candidate.href && candidate.type.indexOf('oembed') > -1) {
                    oembedLink = candidate;
                    break;
                }
            }

            if (oembedLink) {
                isShopify = SHOPIFY_OEMBED_URLS.some(function(fragment) {
                    return oembedLink.href.indexOf(fragment) > -1;
                });
            }
        }

        var isPhotoCard = twitter && twitter.card === 'photo';
        var isNotArticle = og && og.type !== "article";
        var isNotTumblr = twitter && twitter.site !== 'tumblr';
        var hasImage = (twitter && !!twitter.image) || (og && !!og.image);

        flags.twitter_photo =
            isPhotoCard && isNotArticle && !isJetpack && !isWordpress && isNotTumblr && hasImage;

        flags.twitter_player = twitter && !!twitter.player;

        flags.twitter_stream = twitter && twitter.player && !!twitter.player.stream;

        flags.og_video = (og && !!og.video) && !isYoutube(meta);

        flags.video_src = !!meta.video_src;

        flags.sm4_video = !!(meta.sm4 && meta.sm4.video && meta.sm4.video.embed);
    }

    if (oembed && oembed.type !== 'link' && (oembed.type != 'rich' || !isWordpress)) {
        flags['oembed_' + oembed.type] = true;
    }

    // Keep only the detected flags, in the order they were set.
    var detected = {};
    var hasDetected = false;

    Object.keys(flags).forEach(function(name) {
        if (flags[name]) {
            detected[name] = flags[name];
            hasDetected = true;
        }
    });

    return hasDetected && detected;
}