// fabiocicerchia/salmonjs
//
// View on GitHub
// src/parser/phantom.js
//
// Summary
//
// Maintainability
// F
// 1 wk
// Test Coverage
/**
 *               __                         _____ _______
 * .-----.---.-.|  |.--------.-----.-----._|     |     __|
 * |__ --|  _  ||  ||        |  _  |     |       |__     |
 * |_____|___._||__||__|__|__|_____|__|__|_______|_______|
 *
 * salmonJS v0.5.0
 *
 * Copyright (C) 2014 Fabio Cicerchia <info@fabiocicerchia.it>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// Module-level wiring: dependencies, command-line arguments and the WebPage
// instance handed to the parser that is created at the bottom of this file.
var Parser     = require('../parser'),
    fs         = require('fs'),
    spawn      = require('child_process').spawn,
    utils      = new (require('../utils'))(),
    // Use the `system` object published on GLOBAL when there is one (presumably
    // injected by the test harness -- TODO confirm), else load the 'system' module.
    system     = typeof GLOBAL !== 'undefined' && typeof GLOBAL.system !== 'undefined' ? GLOBAL.system : require('system'),
    // Command-line arguments, read from `args` or, failing that, from `argv`.
    args       = system.args || system.argv,
    // True when the command line mentions jasmine-node or grunt.
    testing    = args.join(' ').indexOf('jasmine-node') !== -1 || args.join(' ').indexOf('grunt') !== -1,
    // Outside of a test run the second argument must be the JSON-encoded settings.
    settings   = !testing ? JSON.parse(args[1]) : {},
    // Reuse a `webpage` global if one exists, otherwise load the 'webpage' module.
    webpageObj = typeof webpage !== 'undefined' ? webpage : require('webpage'),
    page       = webpageObj.create();

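/**
 * PhantomParser Class.
 *
 * Use PhantomJS in order to process the page and get all the links and
 * relevant information for the crawler.
 *
 * @class PhantomParser
 * @extends Parser
 * @constructor
 * @param {Object}   utils    Helper object (query-string/URL normalisation, event firing).
 * @param {Function} spawn    Function used to start child processes.
 * @param {Object}   page     The WebPage instance driven by the parser.
 * @param {Object}   settings The settings of the current run.
 */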
var PhantomParser = function (utils, spawn, page, settings) {
    /**
     * The WebPage element.
     *
     * @property page
     * @type {Object}
     * @default the `page` constructor argument
     */
    this.page = page;

    /**
     * Sanitised HTML content.
     *
     * @property sanitisedHtml
     * @type {String}
     * @default ""
     */
    this.sanitisedHtml = '';

    /**
     * Current instance.
     *
     * This is a local variable (not a property): the callbacks defined below
     * are handed to the WebPage and to child processes, so they reach the
     * parser through this closure alias rather than through `this`.
     *
     * @type {Object}
     * @default this
     */
    var currentParser = this;

    /**
     * Clear the per-request state: the custom HTTP headers and cookies of the
     * page, the stack of pending pages and the report/links containers.
     *
     * @method reset
     * @return undefined
     */
    this.reset = function () {
        var parser = this;

        parser.resetHttpHeaders(parser.page);
        parser.resetCookie(parser.page);
        parser.resetStack();
        parser.resetReport();
    };

    /**
     * Wire a WebPage instance to this parser: resource timeout, event
     * callbacks, viewport size, user agent and the default HTTP headers.
     *
     * @method setUpPage
     * @param {Object} page The WebPage object to configure.
     * @return undefined
     */
    this.setUpPage = function (page) {
        var parser    = this,
            // Callbacks copied from the parser onto the page, in this order.
            callbacks = [
                'onResourceTimeout',
                'onError',
                'onInitialized',
                'onResourceReceived',
                'onAlert',
                'onConfirm',
                'onPrompt',
                'onConsoleMessage',
                'onNavigationRequested'
            ];

        page.settings.resourceTimeout = settings.config.parser.timeout;

        callbacks.forEach(function (name) {
            page[name] = parser[name];
        });

        page.viewportSize       = { width: 1024, height: 800 };
        page.settings.userAgent = 'salmonJS/0.5.0 (+http://www.salmonjs.org)';

        parser.setHttpHeaders(page);
    };

    /**
     * Set some HTTP headers for the request.
     *
     * Always asks for a keep-alive connection. A Basic `Authorization` header
     * is added only when both `settings.username` and `settings.password` are
     * actually provided (neither `null` nor `undefined`), so that missing
     * credentials never end up being sent as the literal "undefined:undefined".
     *
     * @method setHttpHeaders
     * @param {Object} page The WebPage object.
     * @return undefined
     */
    this.setHttpHeaders = function (page) {
        var customHeaders  = {},
            hasCredentials = settings.username !== null && settings.username !== undefined &&
                             settings.password !== null && settings.password !== undefined;

        //customHeaders['Accept-Encoding'] = 'gzip,deflate';
        customHeaders.Connection = 'keep-alive';

        if (hasCredentials) {
            customHeaders.Authorization = 'Basic ' + btoa(settings.username + ':' + settings.password);
        }

        page.customHeaders = customHeaders;
    };

    /**
     * Drop every custom HTTP header previously set on the page.
     *
     * @method resetHttpHeaders
     * @param {Object} page The WebPage object.
     * @return undefined
     */
    this.resetHttpHeaders = function (page) {
        var noHeaders = {};

        page.customHeaders = noHeaders;
    };

    /**
     * Handle the query string for the URL request.
     *
     * The query string already present in `this.url` is parsed, the entries of
     * `this.data.GET` are applied on top of it (a GET entry overrides a
     * parameter with the same name) and the URL is rebuilt.
     *
     * The fragment (`#...`), if any, is split off before the query string is
     * read and re-appended at the very end, so it is neither mistaken for part
     * of the last parameter nor left in front of the generated query string.
     *
     * @method handleQueryString
     * @return undefined
     */
    this.handleQueryString = function () {
        var hashPos    = this.url.indexOf('#'),
            fragment   = hashPos === -1 ? '' : this.url.substr(hashPos),
            urlNoHash  = hashPos === -1 ? this.url : this.url.substr(0, hashPos),
            queryPos   = urlNoHash.indexOf('?'),
            baseUrl    = queryPos === -1 ? urlNoHash : urlNoHash.substr(0, queryPos),
            currentQS  = queryPos === -1 ? '' : urlNoHash.substr(queryPos + 1),
            get        = this.data.GET || {},
            getParams,
            p;

        currentQS = utils.normaliseData(currentQS);
        for (p in get) {
            if (get.hasOwnProperty(p)) {
                currentQS[p] = get[p];
            }
        }

        // Same serialise -> normalise -> serialise round trip as before; the
        // exact canonical form is whatever the utils helpers produce.
        getParams = utils.arrayToQuery(utils.normaliseData(utils.arrayToQuery(currentQS)));
        this.url = baseUrl + (getParams !== '' ? ('?' + getParams) : '') + fragment;
    };

    /**
     * Register a cookie for the current request.
     *
     * The cookie is offered to the page first; `phantom.addCookie` is only
     * tried when the page reports a falsy result.
     *
     * @method addCookie
     * @param {Object} data The cookie settings.
     * @return {Boolean} The result of whichever call was made last.
     */
    this.addCookie = function(data) {
        return this.page.addCookie(data) || phantom.addCookie(data);
    };

    /**
     * Empty the cookie containers: first the one of the `phantom` object,
     * then the one of the given page.
     *
     * @method resetCookie
     * @param {Object} page The WebPage object.
     * @return undefined
     */
    this.resetCookie = function (page) {
        var jars = [phantom, page];

        jars.forEach(function (jar) {
            jar.clearCookies();
        });
    };

    /**
     * Parse the URL as GET request.
     *
     * Merges the GET data into the URL, configures the page, registers the
     * request cookies, layers the request headers on top of the page defaults
     * and finally opens the URL.
     *
     * @method parseGet
     * @return undefined
     */
    this.parseGet = function () {
        var parser  = this,
            domain  = '',
            headers = {},
            name;

        // Copy the own properties of `source` (if any) into `headers`.
        function copyOwn(source) {
            var key;

            for (key in source) {
                if (source.hasOwnProperty(key)) {
                    headers[key] = source[key];
                }
            }
        }

        parser.handleQueryString();

        // InitPhantomJs
        parser.setUpPage(parser.page);

        // The cookie domain only depends on the URL, so it is derived once:
        // empty for file:// URLs, the authority part of the URL otherwise.
        if (parser.url.substr(0, 7) !== 'file://') {
            domain = parser.url.replace(/^http(s)?:\/\/([^\/]+)\/?.*$/, '$2');
        }

        for (name in parser.data.COOKIE) {
            if (parser.data.COOKIE.hasOwnProperty(name)) {
                parser.addCookie({
                    'name'   : name,
                    'value'  : parser.data.COOKIE[name],
                    'domain' : domain
                });
            }
        }

        // TODO: Add a method
        // Page defaults first, request headers second: the latter win.
        copyOwn(parser.page.customHeaders);
        copyOwn(parser.data.HEADERS);
        parser.page.customHeaders = headers;

        // RestoreFreezedInstance: set all the variables need for ReExecuteJsEvents

        // Open
        parser.page.open(parser.url, parser.onOpen);
        parser.page.onLoadFinished = parser.onLoadFinished;
    };

    /**
     * Parse the URL as POST request.
     *
     * Sets up the page, the cookies and the headers, then either submits
     * `this.data.POST` through the page itself or -- when at least one POST
     * value has the form "@<path>" and <path> exists on disk -- hands the
     * whole submission over to spawnAndUseNodeJs so the file(s) get uploaded.
     *
     * The upload helper receives the complete POST data, therefore it is
     * started exactly once no matter how many file fields are present.
     *
     * @method parsePost
     * @return undefined
     */
    this.parsePost = function () {
        var domain      = '',
            headers     = {},
            needsUpload = false,
            post,
            cookie,
            header,
            field,
            value;

        this.handleQueryString();

        this.setUpPage(this.page);

        // The cookie domain only depends on the URL: empty for file:// URLs,
        // the authority part of the URL otherwise.
        if (this.url.substr(0, 7) !== 'file://') {
            domain = this.url.replace(/^http(s)?:\/\/([^\/]+)\/?.*$/, '$2');
        }

        for (cookie in this.data.COOKIE) {
            if (this.data.COOKIE.hasOwnProperty(cookie)) {
                this.addCookie({
                    'name'   : cookie,
                    'value'  : this.data.COOKIE[cookie],
                    'domain' : domain
                });
            }
        }

        // Page defaults first, request headers second: the latter win.
        for (header in this.page.customHeaders) {
            if (this.page.customHeaders.hasOwnProperty(header)) {
                headers[header] = this.page.customHeaders[header];
            }
        }

        for (header in this.data.HEADERS) {
            if (this.data.HEADERS.hasOwnProperty(header)) {
                headers[header] = this.data.HEADERS[header];
            }
        }
        this.page.customHeaders = headers;

        // Look for the first value that references an existing file ("@path").
        post = this.data.POST;
        for (field in post) {
            if (post.hasOwnProperty(field) && post[field] !== null && post[field] !== undefined) {
                value = post[field].toString();
                if (value.substr(0, 1) === '@' && fs.exists(value.substr(1))) {
                    needsUpload = true;
                    break;
                }
            }
        }

        if (needsUpload) {
            this.spawnAndUseNodeJs(this.url, post);
        } else {
            this.page.open(this.url, 'post', post, this.onOpen);
            this.page.onLoadFinished = this.onLoadFinished;
        }
    };

    /**
     * In case an upload is required, open another process (upload.js) in order
     * to upload the file(s) defined in the data object.
     * The output of that process is collected and, once the process has
     * exited, set as page content for the current request; onLoadFinished is
     * then triggered to process it.
     *
     * The output is buffered because it may be delivered in several "data"
     * chunks: acting on each chunk would process partial content repeatedly.
     * Anything written to stderr aborts the parser and the buffered output is
     * discarded.
     *
     * @method spawnAndUseNodeJs
     * @param {String} url  The URL where upload the data
     * @param {Object} data The data to be uploaded
     * @return undefined
     */
    this.spawnAndUseNodeJs = function (url, data) {
        var args   = [ fs.workingDirectory + '/src/upload.js', url, JSON.stringify(data) ],
            child  = spawn('node', args),
            output = '',
            failed = false;

        child.stdout.on('data', function (chunk) {
            output += chunk;
        });

        child.stderr.on('data', function () {
            // Report the failure only once, even if stderr delivers more chunks.
            if (failed) {
                return undefined;
            }

            failed = true;
            return currentParser.exit();
        });

        child.on('exit', function () {
            if (failed) {
                return;
            }

            currentParser.page.setContent(output, currentParser.url);
            //currentParser.page.onInitialized(); // TODO: It crashes phantomjs
            currentParser.onLoadFinished();
        });
    };

    /**
     * React to a request to change the current URL.
     *
     * The parser is stopped when redirects are not to be followed and the
     * target is neither `about:blank`, nor a file:// URL, nor a URL containing
     * the one currently being parsed. In every other case nothing happens.
     *
     * @method onNavigationRequested
     * @param {String} url The target URL of this navigation event
     * @return undefined
     */
    this.onNavigationRequested = function (url) {
        // TODO: What to do with reloads?
        // TODO: There is a random bug here.
        if (url === 'about:blank') {
            return undefined;
        }

        if (url.match(/^file:\/\//)) {
            return undefined;
        }

        if (settings.followRedirects) {
            return undefined;
        }

        if (url.indexOf(currentParser.url) !== -1) {
            return undefined;
        }

        return currentParser.exit();
    };

    /**
     * Push the WebPage in the stack of the unprocessed ones after firing the
     * requested JS event.
     *
     * Each event/XPath pair is handled once only: pairs already recorded in
     * `stepHashes` are skipped before any WebPage is cloned, so no clone is
     * created just to be thrown away.
     *
     * @method putPageInStack
     * @param {Object} page  The WebPage instance to be cloned.
     * @param {String} evt   The event to fire.
     * @param {String} xPath The XPath expression to identify the element to fire.
     * @return undefined
     */
    this.putPageInStack = function (page, evt, xPath) {
        // TODO: Do I want to get a "snapshot" freezing the current instance? (page, url, content, type, event & xPath)
        var id = evt + xPath,
            pageFork,
            fireEvent;

        if (currentParser.stepHashes.indexOf(id) !== -1) {
            return;
        }

        pageFork = currentParser.cloneWebPage(page);

        currentParser.stepHashes.push(id);
        console.log('FIRE(' + evt + ', ' + xPath + ')');

        // Expressions not starting with "/" go through fireEventObject, the
        // others through fireEventDOM.
        fireEvent = xPath.substr(0, 1) !== '/' ? utils.fireEventObject : utils.fireEventDOM;
        pageFork.evaluate(fireEvent, {event: evt, xPath: xPath});

        currentParser.stackPages.push(pageFork);
    };

    /**
     * Replace the stackPages container with a new, empty array.
     *
     * @method resetStack
     * @return undefined
     */
    this.resetStack = function () {
        var emptyStack = [];

        currentParser.stackPages = emptyStack;
    };

    /**
     * Stop the execution of the current parser and return the data scraped.
     *
     * The crawler id, the links and the report are printed on stdout as a
     * JSON document prefixed by "###". PhantomJS is then asked to exit, unless
     * the module is running under the test tooling (the module-level `testing`
     * flag, i.e. jasmine-node or grunt), the same condition the rest of the
     * module uses to decide whether it is being tested.
     *
     * @method exit
     * @return undefined
     */
    this.exit = function () {
        var response = {
            idCrawler: settings.idCrawler,
            links:     currentParser.links,
            report:    currentParser.report
        };

        console.log('###' + JSON.stringify(response));
        if (!testing) {
            phantom.exit(); // TODO: This will crash PhantomJS
        }
    };

    /**
     * Callback fired once the page has been opened: records the end time of
     * the request and the total elapsed time in the report.
     *
     * @method onOpen
     * @param {String} status The page return status
     * @return undefined
     */
    this.onOpen = function (status) {
        var timing = currentParser.report.time;

        timing.end   = Date.now();
        timing.total = timing.end - timing.start;

        if (status === 'success') {
            //currentParser.page.navigationLocked = true;
        }
    };

    /**
     * Callback fired when a resource request times out: the parser is stopped
     * and whatever has been collected so far is returned.
     *
     * @method onResourceTimeout
     * @return {Boolean} Always `true`.
     */
    this.onResourceTimeout = function () {
        var handled = true;

        currentParser.exit();

        return handled;
    };

    /**
     * Callback fired when the page has thrown an error: the message and, when
     * available, one line per stack frame are stored in the report as a single
     * newline-separated entry.
     *
     * @method onError
     * @param {String} msg   The error message
     * @param {Array}  trace The stack trace
     * @return undefined
     */
    this.onError = function (msg, trace) {
        var lines = ['ERROR: ' + msg];

        // Render one stack frame as " -> file: line (in function "name")".
        function describe(frame) {
            var where = frame.function ? ' (in function "' + frame.function + '")' : '';

            return ' -> ' + frame.file + ': ' + frame.line + where;
        }

        if (trace && trace.length) {
            lines.push('TRACE:');
            trace.forEach(function (frame) {
                lines.push(describe(frame));
            });
        }

        currentParser.report.errors.push(lines.join('\n'));
    };

    /**
     * Callback invoked after the web page is created but before a URL is
     * loaded: injects the helper scripts (sha1.js and events.js) in the page.
     *
     * @method onInitialized
     * @param {Object} page The WebPage instance; the page of the parser is
     *                      used when the argument is missing.
     * @return undefined
     */
    this.onInitialized = function (page) {
        var target  = page || currentParser.page,
            scripts = ['../sha1.js', '../events.js'];

        scripts.forEach(function (script) {
            target.injectJs(script);
        });
    };

    /**
     * Callback invoked when a resource requested by the page is received.
     * Only responses whose stage is "end" are recorded: their headers are
     * stored in the report, indexed by the URL of the resource.
     *
     * @method onResourceReceived
     * @param {Object} response The response metadata object
     * @return undefined
     */
    this.onResourceReceived = function (response) {
        if (response.stage !== 'end') {
            return;
        }

        currentParser.report.resources[response.url] = {
            headers: response.headers
        };
    };

    /**
     * Callback invoked when there is a JavaScript alert on the web page: the
     * message is appended to the alerts of the report.
     *
     * @method onAlert
     * @param {String} msg The string for the message
     * @return undefined
     */
    this.onAlert = function (msg) {
        var alerts = currentParser.report.alerts;

        alerts.push(msg);
    };

    /**
     * Callback invoked when there is a JavaScript confirm on the web page.
     *
     * The message is recorded in the report. The answer is the one configured
     * in `settings.data.CONFIRM` for that exact message, or `true` when none
     * is configured.
     *
     * @method onConfirm
     * @param {String} msg The string for the message
     * @return {Boolean} `true` === pressing the "OK" button, `false` ===
     *                   pressing the "Cancel" button
     */
    this.onConfirm = function (msg) {
        var data;

        // possible entrypoints: parseGet, putPageInStack.
        currentParser.report.confirms.push(msg);

        data = settings.data;
        if (data === undefined || data.CONFIRM === undefined || data.CONFIRM[msg] === undefined) {
            return true;
        }

        return data.CONFIRM[msg];
    };

    /**
     * Callback invoked when there is a JavaScript prompt on the web page.
     *
     * The message and its default value are recorded in the report. The
     * answer is the one configured in `settings.data.PROMPT` for that exact
     * message, or the default value when none is configured.
     *
     * @method onPrompt
     * @param {String} msg        The string for the message
     * @param {String} defaultVal The default value for the prompt answer
     * @return {String} The answer given to the prompt.
     */
    this.onPrompt = function (msg, defaultVal) {
        var data;

        currentParser.report.prompts.push({msg: msg, defaultVal: defaultVal});

        // possible entrypoints: parseGet, putPageInStack.
        data = settings.data;
        if (data === undefined || data.PROMPT === undefined || data.PROMPT[msg] === undefined) {
            return defaultVal;
        }

        return data.PROMPT[msg];
    };

    /**
     * Callback invoked when there is a JavaScript console message on the web
     * page: the message, its line number and its source identifier are
     * appended to the console entries of the report.
     *
     * @method onConsoleMessage
     * @param {String}  msg      The string for the message
     * @param {Integer} lineNum  The line number
     * @param {String}  sourceId The source identifier
     * @return undefined
     */
    this.onConsoleMessage = function (msg, lineNum, sourceId) {
        var entry = {msg: msg, lineNum: lineNum, sourceId: sourceId};

        currentParser.report.console.push(entry);
    };

    /**
     * Start over with a blank report and with empty link lists.
     *
     * @method resetReport
     * @return undefined
     */
    this.resetReport = function () {
        var blankReport = {
                errors:     [],
                alerts:     [],
                confirms:   [],
                prompts:    [],
                console:    [],
                failure:    false,
                resources:  {},
                time:       { start: 0, end: 0, total: 0 },
                content:    '',
                httpMethod: '',
                event:      '',
                xPath:      '',
                data:       {}
            },
            linkKinds   = ['a', 'link', 'script', 'meta', 'form', 'events'],
            blankLinks  = {};

        // One empty list per kind of link, created in the order listed above.
        linkKinds.forEach(function (kind) {
            blankLinks[kind] = [];
        });

        this.report = blankReport;
        this.links  = blankLinks;
    };

    /**
     * Callback fired when the page has been loaded properly.
     *
     * When `settings.sanitise` is "true" the page content is written to a
     * temporary file and run through src/tidy.js in a child process; its
     * output replaces the page content before the page is evaluated and
     * parsed. Otherwise the page is evaluated and parsed straight away.
     *
     * Details of the sanitising path:
     * - the buffer of sanitised HTML is emptied before every run;
     * - the temporary file is removed when the child process fails or exits;
     * - anything on stderr stops the parser, and the later "exit" event does
     *   not parse the page a second time;
     * - when the child process produces no output the page content is left
     *   untouched instead of being replaced by an empty document.
     *
     * @method onLoadFinished
     * @return undefined
     */
    this.onLoadFinished = function () {
        var tmp_fn,
            tidyArgs,
            child,
            originalHtml,
            removeTmpFile,
            failed = false;

        if (settings.sanitise === undefined || settings.sanitise === null || settings.sanitise.toString() !== 'true') {
            currentParser.evaluateAndParse();
            return;
        }

        originalHtml = currentParser.page.content;
        tmp_fn       = fs.workingDirectory + '/file_' + ((new Date()).getTime()) + '.html';
        tidyArgs     = [ fs.workingDirectory + '/src/tidy.js', tmp_fn ];

        // Best effort: the file may already have been removed by an earlier
        // handler, and a leftover temporary file must not stop the parser.
        removeTmpFile = function () {
            try { fs.remove(tmp_fn); } catch (ignore) {}
        };

        currentParser.sanitisedHtml = '';

        fs.write(tmp_fn, originalHtml, 'w');
        child = spawn('node', tidyArgs);

        child.stdout.on('data', function (data) {
            currentParser.sanitisedHtml += data.toString();
        });

        child.stderr.on('data', function (data) {
            removeTmpFile();
            console.log(data.toString());

            // Report the failure only once, even if stderr delivers more chunks.
            if (failed) {
                return undefined;
            }

            failed = true;
            return currentParser.exit();
        });

        child.on('exit', function () {
            removeTmpFile();

            if (failed) {
                return;
            }

            if (currentParser.sanitisedHtml !== '') {
                if (originalHtml !== currentParser.sanitisedHtml) {
                    console.log('HTML sanitised');
                }

                currentParser.page.setContent(currentParser.sanitisedHtml, currentParser.url);
            }

            currentParser.evaluateAndParse();
        });
    };

    /**
     * Replay the recorded steps on the current page, then parse it.
     *
     * @method evaluateAndParse
     * @return undefined
     */
    this.evaluateAndParse = function () {
        var steps = currentParser.stepStack,
            key,
            step,
            fire;

        // ReExecuteJsEvents
        // Every recorded step with both an event and an XPath is fired again,
        // so that the page reaches the state that has to be processed.
        for (key in steps) {
            step = steps[key];

            if (step.event !== '' && step.xPath !== '') {
                fire = step.xPath[0] !== '/' ? utils.fireEventObject : utils.fireEventDOM;
                currentParser.page.evaluate(fire, step);
            }
        }

        // Process
        currentParser.parsePage(currentParser.page);
    };

    /**
     * Build a new WebPage configured like the parser's own pages and loaded
     * with the content and URL of the given one.
     *
     * @method cloneWebPage
     * @param  {Object} page The WebPage instance to be cloned.
     * @return {Object} The new WebPage instance.
     */
    this.cloneWebPage = function (page) {
        var copy = webpageObj.create();

        currentParser.setUpPage(copy);
        copy.setContent(page.content, page.url);
        // The helper scripts are injected explicitly on the copy.
        copy.onInitialized(copy);

        return copy;
    };

    /**
     * Parse the page to get the information needed.
     *
     * For HTML content the page is optionally rendered to a PNG, its links,
     * forms and registered events are collected, every event is fired on a
     * clone of the page and the clones are parsed in turn. In every case the
     * raw content is also scanned for URL-looking strings (`mixed_full` and
     * `mixed_rel`). New links are merged with the ones collected so far and
     * de-duplicated; a kind of link seen for the first time starts from an
     * empty list, so no `undefined` entry is ever merged in.
     *
     * NOTE(review): the method ends by calling `exit()`, and it is also
     * invoked recursively for the stacked pages, so `exit()` runs once per
     * parsed page -- confirm this is intended.
     *
     * @method parsePage
     * @param {Object} page The WebPage instance
     * @return undefined
     */
    this.parsePage = function (page) {
        var url, links, events, tag;

        // Links collected so far for `key`, or an empty list when there are none.
        function collected(key) {
            return currentParser.links[key] || [];
        }

        // Normalise `item` against the URL of the page being parsed.
        function normalise(item) {
            return utils.normaliseUrl(item, url);
        }

        currentParser.report.content = page.content;

        url = page.evaluate(function () {
            return document.location.href;
        });

        if (page.content.indexOf('<html') !== -1) {
            if (settings.storeDetails && settings.storeDetails !== 'undefined') {
                fs.makeDirectory(fs.absolute(settings.storeDetails) + '/' + settings.execId + '/');
                page.render(fs.absolute(settings.storeDetails) + '/' + settings.execId + '/' + settings.idRequest + '.png');
            }

            // Fall back to an empty result when the evaluation yields nothing.
            links = page.evaluate(currentParser.onEvaluate, currentParser.tags) || {};

            // TODO: What if an event will change url? will it be processed in here or just returned back to the crawler?
            events = page.evaluate(function () {
                return window.eventContainer !== undefined ? window.eventContainer.getEvents() : [];
            });

            utils.loopEach(events, function(type, type_value) {
                utils.loopEach(type_value, function(act, act_value) {
                    utils.loopEach(act_value, function(evt, evt_value) {
                        currentParser.putPageInStack(page, type, evt_value);
                    });
                });
            });

            for (tag in links) {
                if (links.hasOwnProperty(tag) && tag !== 'form') {
                    currentParser.links[tag] = [].map.call(links[tag], normalise)
                        .concat(collected(tag))
                        .filter(utils.onlyUnique);
                }
            }

            currentParser.links.form = [].map.call(links.form || [], function (item) {
                item.action = item.action || url;
                item.action = utils.normaliseUrl(item.action, url);

                if (item.action === undefined) {
                    return undefined;
                }

                return item;
            }).concat(collected('form')).filter(utils.onlyUnique);

            while (currentParser.stackPages.length !== 0) {
                currentParser.parsePage(currentParser.stackPages.pop());
            }
        }

        links = page.evaluate(currentParser.onEvaluateNonHtml, page.content) || {};

        if (links.hasOwnProperty('mixed_full')) {
            currentParser.links.mixed_full = [].map.call(links.mixed_full, normalise)
                .concat(collected('mixed_full'))
                .filter(utils.onlyUnique);
        }
        if (links.hasOwnProperty('mixed_rel')) {
            currentParser.links.mixed_rel = [].map.call(links.mixed_rel, function (item) {
                // Relative matches still carry their surrounding quotes.
                item = item.replace(/^['"](.+)['"]$/, '$1');
                return utils.normaliseUrl(item, url);
            }).concat(collected('mixed_rel')).filter(utils.onlyUnique);
        }

        currentParser.exit();
    };

    /**
     * Callback fired to evaluate the page content.
     *
     * The function is handed to `page.evaluate`, so it must stay
     * self-contained: it can only rely on its own argument, on its inner
     * helpers and on the document it runs against.
     *
     * The first argument maps a selector to either a property name or an
     * object whose own keys are property names; for each selector the values
     * of those properties are collected from every matching element.
     * `meta` refresh targets and forms (action, method and field names) are
     * collected as well.
     *
     * @method onEvaluate
     * @return {Object} A list of links (anchors, links, scripts and forms).
     */
    this.onEvaluate = function () {
        var urls = {},
            currentUrl = document.location.href,
            tags = arguments[0],
            attribute,
            tag,
            attr,
            elements;

        // Value of `property` for every node of the list.
        function pluck(nodes, property) {
            return [].map.call(nodes, function (node) {
                return node[property];
            });
        }

        // `name` attribute of every <tagName> element found inside the form.
        function fieldNames(form, tagName) {
            return [].map.call(form.getElementsByTagName(tagName), function (field) {
                return field.getAttribute('name');
            });
        }

        for (tag in tags) {
            if (tags.hasOwnProperty(tag)) {
                attribute = tags[tag];
                elements  = document.querySelectorAll(tag);
                urls[tag] = [];

                if (typeof attribute === 'object') {
                    for (attr in attribute) {
                        if (attribute.hasOwnProperty(attr)) {
                            urls[tag] = urls[tag].concat(pluck(elements, attr));
                        }
                    }
                } else {
                    urls[tag] = pluck(elements, attribute);
                }
            }
        }

        elements = document.querySelectorAll('meta');
        if (elements.length > 0) {
            urls.meta = [].map.call(elements, function (item) {
                var httpEquiv = item.getAttribute('http-equiv'),
                    content   = item.getAttribute('content'),
                    separator;

                // Only refresh directives carrying a content are of interest;
                // the keyword is matched regardless of its case.
                if (httpEquiv === null || content === null || httpEquiv.toLowerCase() !== 'refresh') {
                    return undefined;
                }

                // The target is everything after the FIRST "=", so that URLs
                // with a query string ("0; url=/page?a=1") are kept whole.
                separator = content.indexOf('=');

                return separator === -1 ? undefined : content.substr(separator + 1);
            });
        }

        elements = document.querySelectorAll('form');
        if (elements.length > 0) {
            urls.form = [].map.call(elements, function (item) {
                return {
                    action: item.getAttribute('action') || currentUrl,
                    type:   item.getAttribute('method') || 'get',
                    fields: fieldNames(item, 'input')
                        .concat(fieldNames(item, 'select'))
                        .concat(fieldNames(item, 'textarea'))
                };
            });
        }

        return urls;
    };

    /**
     * Retrieve all the links in the page.
     * Generally used when the page is not HTML.
     *
     * The content (first argument) is scanned with two regular expressions:
     * one for full URLs (optional protocol, "//", host or IP, optional port,
     * path, optional query string and fragment) and one for quoted relative
     * URLs starting with "/" or with "../". Relative matches are returned with
     * their surrounding quotes.
     *
     * The function is handed to `page.evaluate`, so it must stay
     * self-contained.
     *
     * @method onEvaluateNonHtml
     * @return {Object} `mixed_full` and `mixed_rel`, two (possibly empty)
     *                  lists of matches.
     */
    this.onEvaluateNonHtml = function () {
        var content       = arguments[0],
            protocol      = '((https?|ftp):)?',
            credentials   = '([\\w]+:\\w+@)?',
            hostname      = '(([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])\\.)*([a-z0-9]|[a-z0-9][a-z0-9\\-]*[a-z0-9])',
            ip            = '(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])',
            port          = '(:([0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?',
            path_char     = '(([a-z0-9\\-_.!~*\'\\(\\):@&=+$,])|(%[0-9a-f][0-9a-f]))',
            path          = path_char + '*(?:;' + path_char + '*)*(?:/' + path_char + '*(?:;' + path_char + '*)*)*',
            querystring   = '(\\?' + path_char + '*)?',
            hash          = '(#' + path_char + '*)?',
            regex_fullurl = new RegExp(
                '(' + protocol + '//' + credentials + '(' + hostname + '|' + ip + ')' + port + '/' + path + querystring + hash + ')',
                'igm'
            ),
            // The dots of "../" are escaped: left bare they would match ANY
            // two characters followed by a slash.
            regex_relurl = new RegExp(
                //'\'('+path+')\'|"('+path+')"',
                '\'(/' + path + ')\'|"(/' + path + ')"|\'(\\.\\./' + path + ')\'|"(\\.\\./' + path + ')"',
                'igm'
            );

        // Nothing to scan (missing or non-textual content).
        if (typeof content !== 'string') {
            return { mixed_full: [], mixed_rel: [] };
        }

        // With the "g" flag match() yields every full match, or null if none.
        return {
            mixed_full: content.match(regex_fullurl) || [],
            mixed_rel:  content.match(regex_relurl) || []
        };
    };
};

// PhantomParser inherits from Parser.
PhantomParser.prototype = new Parser();

// Under the test tooling (jasmine-node or grunt on the command line) the class
// is only exported; otherwise a parser is created and started right away with
// the settings received on the command line.
if (testing) {
    module.exports = PhantomParser;
} else {
    new PhantomParser(utils, spawn, page, settings).parse(settings.url, settings.type, settings.data, settings.evt, settings.xPath);
}