website-scraper/node-website-scraper

View on GitHub
lib/scraper.js

Summary

Maintainability
B
4 hrs
Test Coverage
A
100%
import PromiseQueue from 'p-queue';
import logger from './logger.js';
import defaults from './config/defaults.js';
import recursiveSources from './config/recursive-sources.js';
import Resource from './resource.js';
import request from './request.js';
import ResourceHandler from './resource-handler/index.js';
import {
    SaveResourceToFileSystemPlugin,
    GenerateFilenameBySiteStructurePlugin,
    GenerateFilenameByTypePlugin,
    GetResourceReferencePlugin
} from './plugins/index.js';

import * as utils from './utils/index.js';
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
import NormalizedUrlMap from './utils/normalized-url-map.js';

// Every action name a plugin may register a handler for.
// addAction() throws for any name that is not in this list.
const actionNames = [
    // run once per scrape() call
    'beforeStart',
    'afterFinish',
    'error',
    // wrapped around each HTTP request
    'beforeRequest',
    'afterResponse',
    // fire-and-forget notifications about a single resource
    'onResourceSaved',
    'onResourceError',
    // per-resource processing steps
    'generateFilename',
    'getReference',
    'saveResource',
];

// Actions that must end up with at least one handler. For each entry,
// applyPlugins() instantiates `pluginClass` as a fallback when no
// user-supplied plugin registered a handler for `name`.
const mandatoryActions = [
    {
        name: 'saveResource',
        pluginClass: SaveResourceToFileSystemPlugin,
    },
    {
        name: 'generateFilename',
        pluginClass: GenerateFilenameByTypePlugin,
    },
    {
        name: 'getReference',
        pluginClass: GetResourceReferencePlugin,
    },
];

// Built-in filename generator plugins, keyed by the value accepted in the
// `filenameGenerator` option. When the option matches one of these keys,
// normalizeOptions() puts an instance of the mapped class at the front of
// the plugin list.
const filenameGeneratorPlugins = {
    byType: GenerateFilenameByTypePlugin,
    bySiteStructure: GenerateFilenameBySiteStructurePlugin
};

/**
 * Drives a scraping run: normalizes options, wires plugins to named actions,
 * requests resources through a concurrency-limited queue and saves every
 * loaded resource through the registered actions.
 */
class Scraper {
    /**
     * @param {Object} options - user options; merged over the defaults by normalizeOptions().
     */
    constructor (options) {
        this.normalizeOptions(options);
        logger.info('init with options', this.options);

        this.applyPlugins(this.options.plugins);

        this.resourceHandler = new ResourceHandler(this.options, {
            requestResource: this.requestResource.bind(this),
            getReference: this.runActions.bind(this, 'getReference')
        });
        this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename));

        this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
        this.loadedResources = new NormalizedUrlMap(); // Map url -> resource
        this.requestQueue = new PromiseQueue({concurrency: this.options.requestConcurrency});
    }

    /**
     * Builds this.options from the defaults and the user options.
     *
     * Normalizes `urls` to an array of `{url, filename}` objects, lower-cases
     * subdirectory extensions, adds the recursive sources when `recursive` is
     * set and prepends the built-in filename generator plugin selected by
     * `filenameGenerator`. Arrays and objects received from the caller are
     * copied before being changed, so one options object can safely be used
     * to construct several scrapers.
     *
     * @param {Object} options - user options
     */
    normalizeOptions (options) {
        this.options = extend(defaults, options);
        this.options.request = extend(defaults.request, options.request);

        const urls = Array.isArray(options.urls) ? options.urls : [options.urls];

        this.options.urls = urls.map((urlItem) => {
            if (typeof urlItem === 'string') {
                return { url: urlItem, filename: this.options.defaultFilename };
            }
            return { url: urlItem.url, filename: urlItem.filename || this.options.defaultFilename };
        });

        if (this.options.subdirectories) {
            // Create new entries rather than assigning into objects that still
            // belong to the caller (or to the shared defaults).
            this.options.subdirectories = this.options.subdirectories.map((element) => ({
                ...element,
                extensions: element.extensions.map((ext) => ext.toLowerCase())
            }));
        }

        this.options.recursiveSources = recursiveSources;
        if (this.options.recursive) {
            this.options.sources = union(this.options.sources, this.options.recursiveSources);
        }

        // Copy before unshift: the merged options may still reference the
        // caller's own plugins array (assuming extend() is a shallow merge),
        // and modifying it would stack up generator plugins on every reuse.
        this.options.plugins = [...(this.options.plugins || [])];

        if (Object.keys(filenameGeneratorPlugins).includes(this.options.filenameGenerator)) {
            this.options.plugins.unshift(new filenameGeneratorPlugins[this.options.filenameGenerator]());
        }
    }

    /**
     * Resets the action registry, lets every plugin register its handlers and
     * then applies the default plugin for each mandatory action that is still
     * without a handler.
     *
     * @param {Array} [plugins=[]] - objects exposing `apply(registerAction)`
     */
    applyPlugins (plugins = []) {
        this.actions = {};
        actionNames.forEach(actionName => this.actions[actionName] = []);
        plugins.forEach(plugin => {
            logger.debug(`[plugin] apply plugin ${plugin.constructor.name}`);
            plugin.apply(this.addAction.bind(this));
        });

        mandatoryActions.forEach(mandatoryAction => {
            if (this.actions[mandatoryAction.name].length === 0) {
                const plugin = new mandatoryAction.pluginClass();
                logger.debug(`[plugin] apply default plugin ${plugin.constructor.name} for action ${mandatoryAction.name}`);
                plugin.apply(this.addAction.bind(this));
            }
        });
    }

    /**
     * Registers a handler for a known action.
     *
     * @param {string} name - one of actionNames
     * @param {Function} handler - called by runActions()
     * @throws {Error} when `name` is not a known action
     */
    addAction (name, handler) {
        if (!actionNames.includes(name)) {
            throw new Error(`Unknown action "${name}"`);
        }
        logger.debug(`add action ${name}`);
        this.actions[name].push(handler);
    }

    /**
     * Remembers a resource as loaded, keyed by its url. The first resource
     * stored for a url is kept; later ones for the same url are ignored.
     *
     * @param {Resource} resource
     */
    loadResource (resource) {
        const url = resource.getUrl();

        if (this.loadedResources.has(url)) {
            logger.debug('found loaded resource for ' + resource);
        } else {
            logger.debug('add loaded resource ' + resource);
            this.loadedResources.set(url, resource);
        }
    }

    /**
     * Marks the resource as saved, lets the resource handler process it and
     * runs the 'saveResource' action. Failures are routed through handleError().
     *
     * @param {Resource} resource
     * @returns {Promise<void>}
     */
    async saveResource (resource) {
        // Flag first so waitForLoad() does not pick this resource up again.
        resource.setSaved();

        try {
            await this.resourceHandler.handleResource(resource);
            logger.info('saving resource ' + resource + ' to fs');
            await this.runActions('saveResource', {resource});
            // not awaited: just notifying external code about resource saved
            this.notifyActions('onResourceSaved', {resource});
        } catch (err) {
            logger.warn('failed to save resource ' + resource);
            await this.handleError(err, resource);
        }
    }

    /**
     * Starts a request for the resource and registers its promise under the
     * resource url (and under the final url when the response url differs).
     *
     * @param {Resource} resource
     * @returns {Promise<Resource|null>} the populated resource, the result of
     *   an already registered request for the final url, or null when there
     *   was no response or the error was ignored
     */
    createNewRequest (resource) {
        const url = resource.getUrl();

        const requestPromise = Promise.resolve()
            .then(() => {
                const referer = resource.parent ? resource.parent.getUrl() : null;
                return this.requestQueue.add(async () => {
                    const {requestOptions} = await this.runActions('beforeRequest', {resource, requestOptions: this.options.request});
                    return request.get({
                        url,
                        referer,
                        options: requestOptions,
                        afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
                    });
                });
            })
            .then(async (responseData) => {
                if (!responseData) {
                    logger.debug('no response returned for url ' + url);
                    return null;
                }

                if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects
                    logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);

                    // The final url is already being handled by another request: reuse it.
                    if (this.requestedResourcePromises.has(responseData.url)) {
                        return this.requestedResourcePromises.get(responseData.url);
                    }

                    resource.setUrl(responseData.url);
                    this.requestedResourcePromises.set(responseData.url, requestPromise);
                }

                resource.setType(getTypeByMime(responseData.mimeType));

                const { filename } = await this.runActions('generateFilename', { resource, responseData });
                resource.setFilename(filename);

                // if type was not determined by mime we can try to get it from filename after it was generated
                if (!resource.getType()) {
                    resource.setType(getTypeByFilename(filename));
                }

                if (responseData.metadata) {
                    resource.setMetadata(responseData.metadata);
                }

                resource.setEncoding(responseData.encoding);
                resource.setText(responseData.body);

                this.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
                return resource;
            })
            .catch((err) => {
                logger.error('failed to request resource ' + resource);
                return this.handleError(err, resource);
            });

        this.requestedResourcePromises.set(url, requestPromise);
        return requestPromise;
    }

    /**
     * Requests a resource unless it is rejected by `urlFilter` (only applied
     * when depth > 0) or exceeds `maxDepth`. A url that was already requested
     * yields the promise of that earlier request.
     *
     * @param {Resource} resource
     * @returns {Promise<Resource|null>} null when the resource was filtered out
     */
    async requestResource (resource) {
        const url = resource.getUrl();
        const depth = resource.getDepth();

        if (this.options.urlFilter && depth > 0 && !this.options.urlFilter(url)) {
            logger.debug('filtering out ' + resource + ' by url filter');
            return null;
        }

        if (this.options.maxDepth && depth > this.options.maxDepth) {
            logger.debug('filtering out ' + resource + ' by depth');
            return null;
        }

        if (this.requestedResourcePromises.has(url)) {
            logger.debug('found requested resource for ' + resource);
            return this.requestedResourcePromises.get(url);
        }

        return this.createNewRequest(resource);
    }

    /**
     * Runs the handlers of an action one after another. Each handler receives
     * the original params merged with the previous handler's result.
     *
     * @param {string} actionName
     * @param {Object} [params]
     * @returns {Promise<*>} result of the last handler, or a copy of params
     *   when the action has no handlers
     */
    async runActions (actionName, params) {
        logger.debug(`run ${this.actions[actionName].length} actions ${actionName}`);

        let result = extend(params);
        for (const action of this.actions[actionName]) {
            if (typeof action === 'function') {
                result = await action(extend(params, result));
            }
        }
        return result;
    }

    /**
     * Runs a notification-only action without waiting for it. A failure is
     * logged instead of being left as an unhandled promise rejection.
     *
     * @param {string} actionName
     * @param {Object} [params]
     */
    notifyActions (actionName, params) {
        Promise.resolve(this.runActions(actionName, params)).catch((err) => {
            logger.warn(actionName + ' action failed: ' + String(err));
        });
    }

    /**
     * Requests every initial resource, then saves everything that was loaded.
     *
     * @returns {Promise<Resource[]>} the initial resources
     */
    async load () {
        // series() receives functions, not promises, so it controls when each request starts.
        const loadResourcePromises = this.resources.map(
            resource => this.requestResource.bind(this, resource)
        );
        await series(loadResourcePromises);
        return this.waitForLoad();
    }

    // Returns a promise which gets resolved when all resources are loaded.
    // 1. Get all not saved resources and save them
    // 2. Recursion if any new not saved resource were added during this time. If not, loading is done.
    async waitForLoad () {
        const resourcesToSave = Array.from(this.loadedResources.values()).filter((r) => !r.isSaved());
        const loadingIsFinished = resourcesToSave.length === 0;

        if (!loadingIsFinished) {
            const saveResourcePromises = resourcesToSave.map(
                resource => this.saveResource.bind(this, resource)
            );
            await series(saveResourcePromises);
            return this.waitForLoad();
        }
        logger.info('downloading is finished successfully');
        return this.resources;
    }

    /**
     * Notifies 'onResourceError' handlers, then either swallows the error
     * (option `ignoreErrors`) or rethrows it.
     *
     * @param {Error} err
     * @param {Resource} resource
     * @returns {Promise<null>} null when the error is ignored
     * @throws {Error} the original error when `ignoreErrors` is falsy
     */
    async handleError (err, resource) {
        // not awaited: just notifying external code about resource error
        this.notifyActions('onResourceError', {resource, error: err});

        if (this.options.ignoreErrors) {
            logger.warn('ignoring error: ' + err.message);
            return null;
        }

        logger.error('error occurred: ' + err.message);
        throw err;
    }

    /**
     * Entry point: runs 'beforeStart', loads and saves all resources, runs
     * 'error' when something fails and always runs 'afterFinish'.
     *
     * @returns {Promise<Resource[]>} the initial resources
     * @throws {Error} the error that aborted the run
     */
    async scrape () {
        try {
            await this.runActions('beforeStart', {options: this.options, utils});
            return await this.load();
        } catch (error) {
            logger.error('finishing with error: ' + error.message);
            await this.runActions('error', {error});
            throw error;
        } finally {
            await this.runActions('afterFinish');
        }
    }
}

export default Scraper;