mysamai/mysam-core

View on GitHub
lib/extract/extractor.js

Summary

Maintainability
A
0 mins
Test Coverage
const each = require('lodash/each');
const isEqual = require('lodash/isEqual');
const maxBy = require('lodash/maxBy');
const stem = require('./utils');
const { tokenize } = stem;
/**
 * Checks whether a tag carries usable bounds.
 *
 * @param {{bounds: *}} tag - Tag object to inspect.
 * @returns {boolean} `true` only when `tag.bounds` is an array holding exactly
 *   two entries (the element types are not inspected).
 */
const validateTag = ({ bounds }) => {
  if (!Array.isArray(bounds)) {
    return false;
  }

  return bounds.length === 2;
};

/**
 * Extracts labelled word ranges ("tags") from a sentence by aligning it with
 * a baseline sentence whose tag positions are known.
 *
 * A tag is `{ label, bounds }` where `bounds` is an inclusive
 * `[startIndex, endIndex]` pair of word indices into the baseline.
 */
class Extractor {
  /**
   * @param {*} baseline - Baseline sentence; it is passed through `stem` and
   *   the result is used as an array of words.
   * @param {Array<{label: string, bounds: number[]}>} [tags=[]] - Initial
   *   tags. Entries rejected by `validateTag` are dropped.
   */
  constructor (baseline, tags = []) {
    this.baseline = stem(baseline);
    // `getTags` walks the tags left to right, so keep them ordered by their
    // start position exactly like `tag()` does. `filter` returns a new array,
    // so the caller's array is not reordered.
    this.tags = tags
      .filter(validateTag)
      .sort((first, second) => first.bounds[0] - second.bounds[0]);
  }

  /**
   * Registers additional tags.
   *
   * @param {Object<string, number[]>} tags - Map of label to inclusive
   *   `[start, end]` baseline bounds. Invalid entries are ignored.
   * @returns {Extractor} This instance, for chaining.
   */
  tag (tags) {
    each(tags, (bounds, label) => {
      const tag = { label, bounds };

      if (validateTag(tag)) {
        this.tags.push(tag);
      }
    });

    // Make sure that tags are ordered by their beginning location
    this.tags.sort((first, second) => first.bounds[0] - second.bounds[0]);

    return this;
  }

  /**
   * Locates every registered tag in `sentence`.
   *
   * @param {string} sentence - Text to extract from.
   * @returns {{tags: Object<string, ?number[]>, extracted: Object<string, ?string>}}
   *   `tags` maps each label to its inclusive token bounds (or `null` when the
   *   tag was not located); `extracted` maps each label to the matching
   *   original tokens joined by a single space (or `null`).
   */
  extract (sentence) {
    const tokens = tokenize(sentence);
    const tags = this.getTags(stem(tokens));
    const extracted = {};

    each(tags, (bounds, label) => {
      extracted[label] = bounds
        ? tokens.slice(bounds[0], bounds[1] + 1).join(' ') : null;
    });

    return { tags, extracted };
  }

  /**
   * Resolves the bounds of all registered tags within `words`.
   *
   * @param {string[]} words - Words of the text to check, in the same form as
   *   the baseline words.
   * @returns {Object<string, ?number[]>} Map of label to inclusive bounds, or
   *   `null` for tags that could not be located.
   */
  getTags (words) {
    const result = {};
    let currentWords = words;
    let startIndex = 0;

    this.tags.forEach(tag => {
      const { label, bounds } = this.getTag(tag, currentWords, startIndex);

      result[label] = bounds;

      if (!bounds) {
        return;
      }

      const [ tagStart, tagEnd ] = bounds;
      // The words in the baseline that this tag covers
      const [ baselineStart, baselineEnd ] = tag.bounds;
      const baselineWords = this.baseline.slice(baselineStart, baselineEnd + 1);

      // Replace the match with the baseline words so that succeeding tags
      // still match. This is only possible when both ranges have the same
      // length, since indices would get messed up otherwise.
      if (baselineWords.length === tagEnd - tagStart + 1) {
        const copy = currentWords.slice(0);

        copy.splice(tagStart, (tagEnd + 1) - tagStart, ...baselineWords);
        currentWords = copy;
      }

      // The next tag is searched for from the end of this one
      startIndex = tagEnd;
    });

    return result;
  }

  /**
   * Resolves the bounds of a single tag within `words`.
   *
   * The start is located via the baseline word directly before the tag and
   * the end via the baseline word directly after it.
   *
   * @param {{label: string, bounds: number[]}} tag - Tag to locate.
   * @param {string[]} words - Words of the text to check.
   * @param {number} [startIndex=0] - Index in `words` to start searching at.
   * @returns {{label: string, bounds: ?number[]}} The label together with the
   *   inclusive bounds in `words`, or `null` bounds when not located.
   */
  getTag (tag, words, startIndex = 0) {
    const label = tag.label;
    const [ start, end ] = tag.bounds;
    // Number of words between the tag's first and last word in the baseline
    const length = end - start;

    let bounds = null;

    // If baseline start at 0, we'll also start at the beginning
    const tagStart = start === 0 ? 0
      // Otherwise use the best match (matching to the left)
      : this.bestMatch(words, start - 1, -1, startIndex).location;

    if (tagStart === -1) {
      return { label, bounds };
    }

    const tagEnd = end === this.baseline.length - 1
      // If baseline tag ends at the end of sentence use the end of
      // the text to check
      ? words.length - 1
      // Otherwise anchor on the first baseline word after the tag
      // (matching to the right)
      : this.bestMatch(words, end + 1, 1, tagStart).location;

    if (tagEnd === -1) {
      // No right anchor found: assume the same span length as in the baseline
      bounds = [ tagStart, tagStart + length ];
    } else if (tagEnd >= tagStart) {
      bounds = [ tagStart, tagEnd ];
    }

    return { label, bounds };
  }

  /**
   * Repeatedly matches one baseline word against `words` and returns the
   * occurrence with the most consecutive matching neighbours.
   *
   * @param {string[]} words - Words of the text to check.
   * @param {number} baselineIndex - Index of the anchor word in the baseline.
   * @param {number} direction - `-1` to compare leftwards, `1` rightwards.
   * @param {number} [startIndex=0] - Index in `words` to start searching at.
   * @returns {{location: number, matches: number}} Best result of
   *   `matchWords`; `location` is `-1` when the word never occurs.
   */
  bestMatch (words, baselineIndex, direction, startIndex = 0) {
    const matchList = [];

    let currentMatch, previousMatch;

    do {
      previousMatch = currentMatch;
      currentMatch = this.matchWords(words, baselineIndex, direction, startIndex);
      // Update the startIndex to find the next match
      startIndex = currentMatch.location + currentMatch.matches;
      matchList.push(currentMatch);
    } while (
      // no more matches
      currentMatch.location !== -1 &&
      // break condition to avoid possible infinite loops
      !isEqual(currentMatch, previousMatch)
    );

    return maxBy(matchList, match => match.matches);
  }

  /**
   * Finds the first occurrence of a baseline word in `words` and counts how
   * many consecutive words agree with the baseline from there.
   *
   * @param {string[]} words - Words of the text to check.
   * @param {number} baselineIndex - Index of the word in the baseline.
   * @param {number} [direction=-1] - Step used while comparing neighbours.
   * @param {number} [startIndex=0] - Index in `words` to start searching at.
   * @returns {{location: number, matches: number}} `location` is the found
   *   index shifted by one against `direction` (the adjacent tag boundary),
   *   or `-1` when the word is not found; `matches` is the number of
   *   consecutive equal words.
   */
  matchWords (words, baselineIndex, direction = -1, startIndex = 0) {
    // The baseline word
    let word = this.baseline[baselineIndex];
    // The index of the same word in the text to check
    let index = words.indexOf(word, startIndex);
    // Offset between baseline and the text to check
    let offset = index - baselineIndex;
    // the initial index is the location we return
    let location = index;
    // The number of matches
    let matches = 0;

    if (location !== -1) {
      // Depending on if we are going to the left or the right
      // the location needs to be adjusted accordingly
      location = index + (-1 * direction);
    }

    // Now add up all matches between the baseline text and the text to check
    while (index !== -1 && typeof words[index] !== 'undefined' &&
        typeof this.baseline[index - offset] !== 'undefined' &&
        words[index] === this.baseline[index - offset]) {
      matches++;
      index = index + direction;
    }

    return { location, matches };
  }
}

module.exports = Extractor;