masnagam/njtool

View on GitHub
lib/journal.js

Summary

Maintainability
A
2 hrs
Test Coverage
A
100%
// This file is distributed under the MIT license.
// See LICENSE file in the project root for details.

'use strict';

const moment = require('moment');
const puppeteer = require('puppeteer');

// Collects the title and link of every article listed under `#content`.
//
// This function is handed to `page.evaluate()`, so it runs inside the page
// and must stay self-contained: it may only touch browser globals such as
// `document`, never anything defined in this module.
//
// Returns an array of `{ title, url }` objects, one per `<article>` element,
// taken from the first `<a>` found inside each of them.
// istanbul ignore next
function collectArticles2017() {
  const nodes = document.querySelectorAll('#content article');
  return Array.from(nodes, (node) => {
    const link = node.querySelector('a');
    return { title: link.innerText, url: link.href };
  });
}

// Collects the details of every `<article>` element on the page.
//
// This function is handed to `page.evaluate()`, so it runs inside the page
// and must stay self-contained: it may only touch browser globals such as
// `document`, never anything defined in this module.
//
// Returns an array of
// `{ title, type, date, description, authors, url }` objects.  `title` and
// `url` come from the first `<a>` of the article.  `type`, `date` and
// `description` are `null` when the corresponding element is missing, and
// `authors` is an empty array when no author name is found.
// istanbul ignore next
function collectArticles2018() {
  const textOrNull = (node) => (node ? node.innerText : null);
  const collected = [];
  for (const entry of Array.from(document.querySelectorAll('article'))) {
    const link = entry.querySelector('a');
    const time = entry.querySelector('time');
    const names = entry.querySelectorAll(
      '[data-test="author-list"] [itemprop="name"]');
    collected.push({
      title: link.innerText,
      type: textOrNull(entry.querySelector('[data-test="article.type"]')),
      // The date is read from the `dateTime` property, not the visible text.
      date: time ? time.dateTime : null,
      description:
        textOrNull(entry.querySelector('[itemprop="description"] p')),
      authors: Array.from(names, (name) => name.innerText),
      url: link.href
    });
  }
  return collected;
}

/**
 * One issue of a journal on www.nature.com, identified by an ID of the form
 * `name:volume:issue`, together with the result of the last scrape.
 */
class Journal {
  /**
   * @param {string} id - Journal ID such as `nature:553:7686`.
   * @throws {Error} `Invalid journal ID: ...` when a component is missing or
   *   the volume/issue is not a number; `Not supported at this moment: ...`
   *   when the journal name is anything other than `nature`.
   */
  constructor(id) {
    const [name, volume, issue] = id.split(':');
    if (name === undefined || volume === undefined || issue === undefined) {
      throw new Error(`Invalid journal ID: ${id}`);
    }
    if (name !== 'nature') {
      throw new Error(`Not supported at this moment: ${name}`);
    }

    // Reject non-numeric parts up front; otherwise NaN would silently end up
    // in the ID and in the URL that is requested later.
    const volumeNumber = Number.parseInt(volume, 10);
    const issueNumber = Number.parseInt(issue, 10);
    if (Number.isNaN(volumeNumber) || Number.isNaN(issueNumber)) {
      throw new Error(`Invalid journal ID: ${id}`);
    }

    this.name = name;
    this.volume = volumeNumber;
    this.issue = issueNumber;
    // Outcome of the last scrape(): `content` on success, `error` on failure.
    this.content = null;
    this.error = null;
  }

  /**
   * Builds Journal objects from one ID or from an array of IDs.
   *
   * @param {string|string[]} args - A journal ID or an array of them.
   * @returns {Journal[]} One Journal per ID, in the given order.
   * @throws {Error} When any ID is rejected by the constructor.
   */
  static from(args) {
    const ids = Array.isArray(args) ? args : [args];
    return ids.map((id) => new Journal(id));
  }

  /** @returns {string} The normalized ID, `name:volume:issue`. */
  get id() {
    return `${this.name}:${this.volume}:${this.issue}`;
  }

  /** @returns {string} The URL of the table-of-contents page of the issue. */
  get url() {
    const base = 'https://www.nature.com';
    if (this._isLegacyLayout) {
      return `${base}/${this.name}/journal/v${this.volume}/n${this.issue}/index.html`;
    }
    return `${base}/${this.name}/volumes/${this.volume}/issues/${this.issue}`;
  }

  /**
   * @returns {Object} `name`, `volume`, `issue` and `url`, plus `date` and
   *   `articles` once a scrape succeeded, plus `error` when the last scrape
   *   failed.
   */
  get metadata() {
    const metadata = {
      name: this.name,
      volume: this.volume,
      issue: this.issue,
      url: this.url
    };
    if (this.content) {
      metadata.date = this.content.date;
      metadata.articles = this.content.articles;
    }
    if (this.error) {
      metadata.error = this.error;
    }
    return metadata;
  }

  /**
   * Opens the issue page in a browser and collects its date and articles.
   *
   * Failures while loading or reading the page are not thrown; their message
   * is stored in `this.error` and reported through the returned metadata.
   *
   * @param {Object} [options]
   * @param {boolean} [options.headless] - Passed through to
   *   `puppeteer.launch()`.
   * @param {boolean} [options.sandbox] - Unless truthy, the browser is
   *   launched with `--no-sandbox --disable-setuid-sandbox`.
   * @returns {Promise<Object>} The value of `this.metadata` after the scrape.
   */
  async scrape(options = {}) {
    const launchOptions = { headless: options.headless };
    if (!options.sandbox) {
      launchOptions.args = ['--no-sandbox', '--disable-setuid-sandbox'];
    }

    // Forget the previous outcome so that the returned metadata describes
    // this run only (e.g. no stale error after a successful retry).
    this.content = null;
    this.error = null;

    let browser = null;
    try {
      browser = await puppeteer.launch(launchOptions);
      const page = await browser.newPage();
      await page.goto(this.url);
      const title = await page.title();
      if (title.startsWith('Page not found')) {
        throw new Error('Not found');
      }
      // Prefer the date in the page title; fall back to the page body when
      // the title has no parsable date.
      let date = this._getDateFromTitle(title);
      if (!date) {
        date = await this._getDateFromPage(page);
      }
      const articles = await page.evaluate(this._collectArticlesFunction);
      this.content = { date, articles };
    } catch (e) {
      // Non-Error values may be thrown as well; never fail inside the handler.
      this.error = (e && e.message) || String(e);
    } finally {
      if (browser) {
        await browser.close();
      }
    }
    return this.metadata;
  }

  /**
   * @returns {boolean} True for volumes before 553, which use the older URL
   *   scheme and the 2017 article collector.
   */
  get _isLegacyLayout() {
    return this.volume < 553;
  }

  /** @returns {Function} The in-page collector matching this volume. */
  get _collectArticlesFunction() {
    return this._isLegacyLayout ? collectArticles2017 : collectArticles2018;
  }

  /**
   * Reads the date from the second `', '`-separated component of the title.
   *
   * @param {string} title - The page title.
   * @returns {?string} The date as `YYYY-MM-DD`, or null when the title has
   *   no second component or that component is not a date.
   */
  _getDateFromTitle(title) {
    const components = title.split(', ');
    if (components.length < 2) {
      return null;
    }
    return this._convertDate(components[1]);
  }

  /**
   * Reads the date from the text of the `#issue-meta .more` element
   * (presumably the publication date line of older issue pages).
   *
   * @param {Object} page - The puppeteer page showing the issue.
   * @returns {Promise<?string>} The date as `YYYY-MM-DD`, or null when the
   *   element is missing or its text is not a date.
   */
  async _getDateFromPage(page) {
    const text = await page.evaluate(
      // istanbul ignore next
      () => {
        const element = document.querySelector('#issue-meta .more');
        return element ? element.innerText : null;
      });
    return this._convertDate(text);
  }

  /**
   * Converts a `D MMMM YYYY` date (e.g. `4 January 2018`) to `YYYY-MM-DD`.
   *
   * @param {?string} date - The text to parse.
   * @returns {?string} The converted date, or null when the text is empty or
   *   cannot be parsed.  Returning null instead of moment's `Invalid date`
   *   string lets callers detect the failure with a simple falsy check.
   */
  _convertDate(date) {
    if (!date) {
      return null;
    }
    const parsed = moment(date, 'D MMMM YYYY');
    return parsed.isValid() ? parsed.format('YYYY-MM-DD') : null;
  }
}

// The Journal class is the only thing this module exports.
module.exports = Journal;