lib/scraper.js
// # Scraper
//
// > Scraper class in the Node.js Thresher package.
// >
// > author: [Richard Smith-Unna](http://blahah.net)
// > email: <richard@contentmine.org>
// > copyright: Shuttleworth Foundation (2014)
// > license: [MIT](https://github.com/ContentMine/thresher/blob/master/LICENSE-MIT)
//
// ---
//
// ## Description
//
// Scrapers can scrape DOMs (or URLs from which DOMs can be rendered). They are
// created from ScraperJSON definitions, and return scraped data as structured
// JSON. Scraping a provided DOM is synchronous, while scraping a URL is
// asynchronous and can be monitored by subscribing to events.
//
// Scrapers emit the following events:
// * `error`: on any error. If not intercepted, these events will throw.
// * `elementCaptured` ***(data)***: when an element is successfully captured.
// * `elementCaptureFailed` ***(element)***: when element capture fails.
// * `downloadComplete`: when a download finishes.
// * `done` ***(results)***: when the entire scraping process is finished.
//
// ## Usage
//
// A Scraper is created from a ScraperJSON definition:
//
// var scraper = new Scraper(definition);
//
// The scraper is then executed on a DOM:
//
// scraper.scrapeDoc(doc);
//
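// An asynchronous scrape of a URL can be monitored by subscribing to
// events before calling `scrapeUrl`. An illustrative sketch, assuming a
// valid ScraperJSON `definition` and a reachable URL:
//
// var scraper = new Scraper(definition);
// scraper.on('elementCaptured', function(key, value) {
//   console.log('captured', key, value);
// });
// scraper.on('elementCaptureFailed', function(key) {
//   console.log('failed to capture', key);
// });
// scraper.scrapeUrl('http://example.com/article/1');
//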
var EventEmitter2 = require('eventemitter2').EventEmitter2
, util = require('util')
, file = require('./file.js')
, Downloader = require('./download.js')
, url = require('./url.js')
, dom = require('./dom.js')
, Ticker = require('./ticker.js')
, request = require('request')
, HeadlessRenderer = require('./renderer/headless.js')
, BasicRenderer = require('./renderer/basic.js')
, ElementTree = require('./elementTree.js').ElementTree;
var Scraper = function(definition, headless) {
var scraper = this;
EventEmitter2.call(this, {
wildcard: true,
maxListeners: 0
});
if (scraper.validate(definition)) {
scraper.valid = true;
    // The definition is loaded into the properties
// of the scraper. Optional properties are set to
// null if they are missing.
scraper.url = definition.url;
scraper.doi = definition.doi || null;
scraper.name = definition.name;
scraper.elements = definition.elements;
scraper.followables = definition.followables || [];
scraper.actions = definition.actions || null;
// The renderer is chosen. Basic by default (see BasicRenderer),
// but if the user specifies headless rendering, or if there are
// any interactions to perform on the page, the renderer is Headless
// (see HeadlessRenderer).
if (headless || definition.headless || scraper.actions) {
scraper.rendererClass = HeadlessRenderer;
scraper.emit('info', 'using headless renderer');
} else {
scraper.rendererClass = BasicRenderer;
scraper.emit('info', 'using basic renderer');
}
    // Elements are flattened into an array and their dependencies
    // resolved into a tree, because some elements depend on following
    // URLs captured by other elements. The scraping proceeds by starting
    // at the root of the tree and scraping all the child elements. Any
    // with dependents are then rendered and their children scraped, and
    // so on.
scraper.loadElements();
scraper.tree = new ElementTree(scraper.elementsArray);
scraper.follow_urls = {};
scraper.results = {};
    // In order to resolve follows efficiently, rendered documents are
    // cached in an object keyed by the URL they were rendered from.
scraper.docs = {};
} else {
scraper.valid = false;
}
}
// Scraper inherits from EventEmitter2
util.inherits(Scraper, EventEmitter2);
// Create a new renderer and re-emit its events on the scraper
Scraper.prototype.newRenderer = function() {
var scraper = this;
var renderer = new scraper.rendererClass();
renderer.on('renderer.*', function(var1, var2) {
    scraper.emit(this.event, var1, var2);
});
return renderer;
}
// Validate a scraperJSON definition
//
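// A definition must have a `url` and at least one element, and every leaf
// element must have a `selector`. A minimal illustrative sketch (the
// selector syntax is whatever the dom module's `select` accepts):
//
// {
//   "url": "example\\.com",
//   "elements": {
//     "title": { "selector": "//title" }
//   }
// }
//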
Scraper.prototype.validate = function(def){
var problems = [];
// url key must exist
if (!def.url) {
problems.push('must have "url" key');
}
// elements key must exist
if(!def.elements) {
problems.push('must have "elements" key');
} else {
// there must be at least 1 element
    if (Object.keys(def.elements).length === 0) {
problems.push('no elements were defined');
} else {
// each terminal element (leaf) must have a selector
var keywords = ['selector', 'attribute', 'download',
'regex', 'follow', 'name'];
      var checkLeaves = function(elements) {
        for (var k in elements) {
          if (keywords.indexOf(k) > -1) {
            continue;
          }
          var e = elements[k];
          if (typeof e !== 'object') {
            continue;
          }
          var isLeaf = true;
          for (var j in e) {
            if (keywords.indexOf(j) === -1) {
              // a non-keyword key means this element has child[ren]
              isLeaf = false;
            }
          }
          if (isLeaf) {
            // this element is a leaf, so it must have a selector
            if (!e.selector) {
              problems.push('element ' + k + ' has no selector');
            }
          } else {
            // recurse into the children of this element
            checkLeaves(e);
          }
        }
      }
checkLeaves(def.elements);
}
}
if (problems.length > 0) {
this.emit('definitionError', problems);
return false;
}
return true;
}
// Check if this scraper applies to a given URL
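// e.g. for a definition whose `url` is "example\\.com" (illustrative):
//
// scraper.matchesURL('http://www.example.com/article/1'); // => true
// scraper.matchesURL('http://other.org/article/1'); // => false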
Scraper.prototype.matchesURL = function(theUrl) {
var regex = new RegExp(this.url);
return regex.test(theUrl);
}
// Flatten the nested element definitions (including followables)
// into an array of elements for rapid iteration
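// e.g. elements { title: {...}, author: {...} } are flattened
// (illustrative) to [ { name: 'title', ... }, { name: 'author', ... } ]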
Scraper.prototype.loadElements = function() {
this.elementsArray = getChildElements(this);
}
// Flatten an element tree by recursion, adding the key of each
// element to the element as its name. Followables are included.
function getChildElements(obj) {
var elementsArray = [];
// process followables first, they
// will be excluded from results later
if (obj.followables) {
for (var key in obj.followables) {
var element = obj.followables[key];
element.name = key;
elementsArray.push(element);
      elementsArray = elementsArray.concat(getChildElements(element));
}
}
if (obj.elements) {
for (var key in obj.elements) {
var element = obj.elements[key];
element.name = key;
elementsArray.push(element);
      elementsArray = elementsArray.concat(getChildElements(element));
}
}
return elementsArray;
}
// Restructure the flat scraping results to mirror the
// element structure of the input definition
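// e.g. flat results { title: ['Some title'] } for an element tree
// { title: { selector: '//title' } } are restructured (illustrative) to
// { title: { value: ['Some title'] } }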
Scraper.prototype.structureResults = function() {
var scraper = this;
var cleanResults = {};
fillChildResults(scraper, scraper.elements, cleanResults);
return cleanResults;
}
// Recursively populate a results object with scraping results,
// following the structure of the scraper element tree by
// depth-first recursion
function fillChildResults(scraper, obj, newRes) {
var baseKeys = ['selector', 'attribute', 'download', 'regex', 'follow', 'name'];
for (var key in obj) {
if (baseKeys.indexOf(key) >= 0) {
// ignore base keys
continue;
}
newRes[key] = {};
// add any result value to the element
if (scraper.results.hasOwnProperty(key)) {
newRes[key].value = scraper.results[key];
}
// continue structuring child results
var element = obj[key];
fillChildResults(scraper, element, newRes[key]);
}
}
// Scrape the provided URL.
// Start at the root node: render the URL and save the document in the
// docs object, then iterate through the node's child elements, scraping
// each one. For any child element that has children of its own, recurse
// into the URLs it captured.
Scraper.prototype.scrapeUrl = function(theUrl, node) {
var scraper = this;
scraper.startTicker();
scraper.results = scraper.results || {};
node = node || scraper.tree.root;
var children = node.children;
// render the base url and load the HTML into a DOM
var renderer = scraper.newRenderer();
renderer.render(theUrl, scraper.actions);
scraper.ticker.elongate();
renderer.on('renderer.urlRendered', function(theUrl, html) {
scraper.emit('urlRendered', theUrl);
// the children of the root node have no dependencies, so we scrape
// all the elements in it from the base URL
var doc = dom.render(html);
scraper.docs[theUrl] = doc;
    var followed = [];
for (var i = 0; i < children.length; i++) {
var child = children[i];
var hasChildren = child.children.length > 0;
scraper.scrapeElement(doc, child.element,
theUrl, child.element.name,
hasChildren);
if (hasChildren) {
followed.push(child);
}
}
// climb down the tree for any children with children of their own
followed.forEach(function(child) {
// scrape every url for the followed element
var nextUrls = scraper.follow_urls[child.element.name];
for (var i = 0; i < nextUrls.length; i++) {
scraper.scrapeUrl(nextUrls[i], child);
}
})
scraper.ticker.tick();
});
}
// Scrape a specific element
Scraper.prototype.scrapeElement = function(doc, element, scrapeUrl, key, follow_url) {
var scraper = this;
follow_url = typeof follow_url !== 'undefined' ? follow_url : false;
// extract element
key = key || element.name;
if (follow_url) {
    scraper.follow_urls[key] = [];
}
var selector = element.selector;
var attribute = element.attribute;
var matches = dom.select(selector, doc);
if (!scraper.results.hasOwnProperty(key)) {
scraper.results[key] = [];
}
if (matches.length === 0) {
scraper.emit('elementCaptureFailed', key);
scraper.emit('selectorFailed', selector, key);
}
for (var i = 0; i < matches.length; i++) {
var res = matches[i];
if (res === undefined || res === null) {
scraper.emit('elementCaptureFailed', key);
scraper.emit('selectorFailed', selector, key);
} else {
res = dom.getAttribute(res, attribute);
if (res === undefined || res === null) {
scraper.emit('elementCaptureFailed', key);
scraper.emit('attributeFailed', attribute, selector, key);
continue;
}
// run regex if applicable
if (element.regex) {
        res = scraper.runRegex(res, element.regex);
}
// if the result is a URL, trim and clean it
if (follow_url || element.download) {
res = res.trim();
res = url.cleanResourcePath(res, scrapeUrl);
}
// if the element has followers, save the url
if (follow_url) {
scraper.follow_urls[key].push(res);
}
// process downloads
if (element.download) {
scraper.downloadElement(element, res, scrapeUrl);
}
// save the result
scraper.results[key].push(res);
scraper.emit('elementCaptured', key, res);
}
}
scraper.emit('elementResults', key, scraper.results[key]);
}
Scraper.prototype.startTicker = function() {
var scraper = this;
if (!scraper.ticker) {
scraper.ticker = new Ticker(0, function() {
var results = scraper.structureResults();
scraper.emit('end', scraper.results, results);
});
}
}
// Download the resource specified by an element
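// An element's `download` key can be `true` or an object naming a
// `rename` target for the saved file, e.g. (illustrative):
//
// "download": true
// "download": { "rename": "fulltext.pdf" }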
Scraper.prototype.downloadElement = function(element, res, scrapeUrl) {
var down = new Downloader();
var scraper = this;
// rename downloaded file?
var rename = null;
if (typeof element.download === 'object') {
if (element.download.rename) {
rename = element.download.rename;
}
}
// set download running
down.downloadResource(res, scrapeUrl, rename);
// add it to the task ticker
scraper.ticker.elongate();
down.once('downloadStarted', function(url) {
scraper.emit(this.event, url);
});
down.once('downloadSaved', function(path) {
scraper.emit(this.event, path);
scraper.ticker.tick();
down.removeAllListeners();
});
down.once('*Error', function(err) {
scraper.emit(this.event, err);
scraper.ticker.tick();
});
}
// Run regular expression on a captured element
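// The element's `regex` can be a plain pattern string or an object with a
// `source` and an optional array of `flags`, e.g. (illustrative):
//
// "regex": "doi:\\s*(\\S+)"
// "regex": { "source": "doi:\\s*(\\S+)", "flags": ["i"] }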
Scraper.prototype.runRegex = function(string, regex) {
var re;
if (regex instanceof Object) {
if (regex.flags) {
var flags = regex.flags.join('');
re = new RegExp(regex.source, flags);
} else {
re = new RegExp(regex.source);
}
} else {
re = new RegExp(regex);
}
var match = re.exec(string);
var matches = [];
while (match != null) {
var captures = match.slice(1);
if (re.global) {
matches = matches.concat(captures);
} else {
matches = captures;
break;
}
match = re.exec(string);
}
return matches;
}
module.exports = Scraper;