// src/index.js
'use strict'; // eslint-disable-line
const JSONStream = require('JSONStream');
const crawl = require('./crawl');
// Private property keys. Using Symbols keeps the internal state of a CrawlKit
// instance off its public, string-keyed surface. `runnerKey` and `finderKey`
// are additionally handed to the crawl module (see CrawlKit#crawl).
const concurrencyKey = Symbol('concurrency');
const urlKey = Symbol('url');
const finderKey = Symbol('finder');
const timeoutKey = Symbol('timeout');
const runnerKey = Symbol('runner');
const phantomParamsKey = Symbol('phantomParams');
const phantomPageSettingsKey = Symbol('phantomPageSettings');
const followRedirectsKey = Symbol('followRedirects');
const browserCookiesKey = Symbol('browserCookies');
const triesKey = Symbol('tries');
const redirectFilterKey = Symbol('redirectFilter');

/**
 * Resolves a stored integer setting to its effective value.
 *
 * The setters store the raw result of `parseInt`, so the stored value is
 * either `undefined` (never set), `NaN` (unparsable input) or an integer.
 * Only the first two fall back to the default; an explicit `0` is kept
 * (a plain `stored || fallback` would silently discard it).
 *
 * @private
 * @param {*} stored The value currently stored on the instance.
 * @param {number} fallback The default used when nothing usable is stored.
 * @param {number} minimum The lower bound the result is clamped to.
 * @return {number} The effective setting, never lower than `minimum`.
 */
function resolveInteger(stored, fallback, minimum) {
  const isUsable = typeof stored === 'number' && !Number.isNaN(stored);
  return Math.max(minimum, isUsable ? stored : fallback);
}

/**
 * The CrawlKit base class. This is where the magic happens.
 */
class CrawlKit {
  /**
   * Create a CrawlKit instance
   * @constructor
   * @param {String} [url] The start URL. Sets the {@link CrawlKit#url}.
   * @param {String} [name] The instance name of the crawler. Used for logging purposes.
   * @return {CrawlKit} a new CrawlKit instance
   */
  constructor(url, name) {
    // Falsy values (undefined, empty string, ...) leave the property unset.
    if (url) {
      this.url = url;
    }
    if (name) {
      this.name = name;
    }
    this[runnerKey] = new Map();
    this[finderKey] = {};
    this[browserCookiesKey] = [];
  }

  /**
   * Getter/setter for overall timeout for one website processing
   * (opening page, evaluating runners and finder functions).
   * The timeout starts fresh for each website.
   *
   * The assigned value is parsed with `parseInt(value, 10)`.
   * Values under zero are set to zero; an explicit zero is kept as zero.
   * Values that cannot be parsed fall back to the default.
   *
   * @type {!integer}
   * @default 30000 (30 seconds)
   */
  set timeout(num) {
    this[timeoutKey] = parseInt(num, 10);
  }

  /**
   * @ignore
   */
  get timeout() {
    return resolveInteger(this[timeoutKey], 30000, 0);
  }

  /**
   * Getter/setter for the concurrency of the crawler.
   * This controls the amount of PhantomJS instances that will be spawned
   * and used to work on found websites. Adapt this to the power of your machine.
   *
   * The assigned value is parsed with `parseInt(value, 10)`.
   * Values under one are set to one. Values that cannot be parsed fall back
   * to the default.
   *
   * @type {!integer}
   * @default 1 (No concurrency)
   */
  set concurrency(num) {
    this[concurrencyKey] = parseInt(num, 10);
  }

  /**
   * @ignore
   */
  get concurrency() {
    return resolveInteger(this[concurrencyKey], 1, 1);
  }

  /**
   * Getter/setter for the start URL of the crawler.
   * This is the URL that will be used as an initial endpoint for the crawler.
   * The value is stored and returned exactly as given.
   *
   * NOTE(review): earlier documentation states that a URL without protocol
   * (e.g. starting with //) is rewritten to http://. That rewriting does not
   * happen in this accessor; presumably it is done by the crawl module - verify.
   *
   * @type {String}
   */
  set url(str) {
    this[urlKey] = str;
  }

  /**
   * @ignore
   */
  get url() {
    return this[urlKey];
  }

  /**
   * With this method a {@link Finder} instance can be set for the crawler.
   * A finder is used for link discovery on a website. It is run directly after page load
   * and is optional (e.g. if you want to only work on a single page).
   *
   * @param {!Finder} finder The finder instance to use for discovery.
   *                         Must expose a `getRunnable` function.
   * @param {...*} [runnableParams] These parameters are passed
   *                                to the function returned by {@link Finder#getRunnable}
   *                                at evaluation time.
   * @throws {Error} If `finder` is missing or has no `getRunnable` function.
   */
  setFinder(finder /* parameters... */) {
    if (!finder || typeof finder.getRunnable !== 'function') {
      throw new Error('Not a valid finder instance');
    }
    this[finderKey].finder = finder;
    // Everything after the finder itself is kept as its runnable parameters.
    this[finderKey].parameters = Array.prototype.slice.call(arguments, 1);
  }

  /**
   * Getter/setter for the number of tries when a PhantomJS instance crashes on a page
   * or {@link CrawlKit#timeout} is hit.
   * When a PhantomJS instance crashes whilst crawling a webpage, this instance is shutdown
   * and replaced by a new one. By default the webpage that failed in such a way will be
   * re-queued.
   * If the finders and runners did not respond within the defined timeout,
   * it will be tried to run them again as well.
   * This member controls how often that re-queueing happens.
   *
   * The assigned value is parsed with `parseInt(value, 10)`.
   * Values under zero are set to zero; an explicit zero is kept as zero.
   * Values that cannot be parsed fall back to the default.
   *
   * @type {!integer}
   * @default 3 (read: try two more times after the first failure, three times in total)
   */
  set tries(n) {
    this[triesKey] = parseInt(n, 10);
  }

  /**
   * @ignore
   */
  get tries() {
    return resolveInteger(this[triesKey], 3, 0);
  }

  /**
   * Allows you to add a runner that is executed on each crawled page.
   * The returned value of the runner is added to the overall result.
   * Runners run sequentially on each webpage in the order they were added.
   * If a runner is crashing PhantomJS more than {@link CrawlKit#tries} times,
   * subsequent {@link Runner}s are not executed.
   *
   * Adding a runner under a key that is already in use replaces the
   * previously registered runner for that key.
   *
   * @see For examples see `examples/simple.js` and `examples/advanced.js`.
   * @param {!String} key The runner identificator. This is also used in the result stream/object.
   * @param {!Runner} runner The runner instance to use for discovery. Must expose
   *                         `getCompanionFiles` and `getRunnable` functions.
   * @param {...*} [runnableParams] These parameters are passed to the function returned
   *                                by {@link Runner#getRunnable} at evaluation time.
   * @throws {Error} If `key` is falsy or `runner` lacks one of the required functions.
   */
  addRunner(key, runner /* args ... */) {
    if (!key) {
      throw new Error('Not a valid runner key');
    }
    const isRunner = Boolean(runner) &&
      typeof runner.getCompanionFiles === 'function' &&
      typeof runner.getRunnable === 'function';
    if (!isRunner) {
      throw new Error('Not a valid runner instance');
    }
    // Everything after key and runner is kept as the runnable parameters.
    const parameters = Array.prototype.slice.call(arguments, 2);
    this[runnerKey].set(key, {
      runner,
      parameters,
    });
  }

  /**
   * Getter/setter for the map of parameters to pass to PhantomJS.
   * You can use this for example to ignore SSL errors.
   * For a list of parameters, please refer to the
   * [PhantomJS documentation]{@link http://phantomjs.org/api/command-line.html}.
   *
   * When nothing (or a falsy value) was set, the getter returns a fresh empty object.
   *
   * @type {!Object.<String,String>}
   */
  set phantomParameters(params) {
    this[phantomParamsKey] = params;
  }

  /**
   * @ignore
   */
  get phantomParameters() {
    return this[phantomParamsKey] || {};
  }

  /**
   * Getter/setter for the map of settings to pass to an opened page.
   * You can use this for example for Basic Authentication.
   * For a list of options, please refer to the
   * [PhantomJS documentation]{@link http://phantomjs.org/api/webpage/property/settings.html}.
   * Nested settings can just be provided in dot notation as the key, e.g. 'settings.userAgent'.
   *
   * When nothing (or a falsy value) was set, the getter returns a fresh empty object.
   *
   * @type {!Object.<String,*>}
   */
  set phantomPageSettings(settings) {
    this[phantomPageSettingsKey] = settings;
  }

  /**
   * @ignore
   */
  get phantomPageSettings() {
    return this[phantomPageSettingsKey] || {};
  }

  /**
   * Getter/setter for whether to follow redirects or not.
   * When following redirects, the original page is not processed.
   * Any assigned value is coerced to a boolean.
   *
   * @type {!boolean}
   * @default false
   */
  set followRedirects(value) {
    this[followRedirectsKey] = !!value;
  }

  /**
   * @ignore
   */
  get followRedirects() {
    return this[followRedirectsKey] || false;
  }

  /**
   * Getter/setter for the cookies to set within PhantomJS.
   * Each entry is supposed to be an object following the
   * [PhantomJS spec]{@link http://phantomjs.org/api/webpage/method/add-cookie.html}.
   *
   * @type {!Array.<Object>}
   * @throws {Error} If the assigned value is not an array.
   */
  set browserCookies(cookies) {
    // Array.isArray also accepts arrays created in another realm (e.g. a vm
    // context), which an `instanceof Array` check would wrongly reject.
    if (!Array.isArray(cookies)) {
      throw new Error('Not properly munchable');
    }
    this[browserCookiesKey] = cookies;
  }

  /**
   * @ignore
   */
  get browserCookies() {
    return this[browserCookiesKey];
  }

  /**
   * Getter/setter for the filter that is applied to redirected URLs.
   * With this filter you can prevent the redirect or rewrite it.
   * The filter callback gets two arguments. The first one is the target URL,
   * the second one the source URL.
   * Return false for preventing the redirect. Return a String (URL) to follow the redirect.
   *
   * When no filter was set, the getter returns a filter that passes the
   * target URL through unchanged.
   *
   * @type {Function}
   * @throws {Error} If the assigned value is not a function.
   */
  set redirectFilter(filter) {
    if (typeof filter !== 'function') {
      throw new Error('Filter must be valid function');
    }
    this[redirectFilterKey] = filter;
  }

  /**
   * @ignore
   */
  get redirectFilter() {
    return this[redirectFilterKey] || (targetUrl => targetUrl);
  }

  /**
   * This method starts the crawling/scraping process.
   *
   * @param {boolean} [shouldStream=false] Whether to stream the results or use a Promise
   * @return {(Stream|Promise.<Object>)} By default a Promise object is returned that resolves
   *                                     to an object whose `results` property maps each
   *                                     crawled URL to its result. If streaming is enabled
   *                                     it returns a JSON stream to which one
   *                                     `[url, result]` entry is written per crawled page.
   */
  crawl(shouldStream) {
    // NOTE(review): both completion callbacks below ignore their arguments, so
    // neither the stream nor the Promise reports a failure from here - confirm
    // whether the crawl module signals errors through that callback.
    if (shouldStream) {
      const stream = JSONStream.stringifyObject();
      crawl(this, (scope) => {
        stream.write([scope.url, scope.result]);
      }, runnerKey, finderKey)(() => stream.end());
      return stream;
    }
    return new Promise((resolve) => {
      const results = {};
      crawl(this, (scope) => {
        results[scope.url] = scope.result;
      }, runnerKey, finderKey)(() => resolve({
        results,
      }));
    });
  }
}
// The CrawlKit class is the module's sole export (CommonJS).
module.exports = CrawlKit;