src/utils/search.js from Chalarangelo/30-seconds-of-code

src/utils/search.js
Summary

Maintainability

1 day
Test Coverage

Issues
import searchEngineSettings from '#settings/searchEngine';
const { serverStopWords, clientStopWords } = searchEngineSettings;

// Standard suffix manipulations.
const step2list = {
  ational: 'ate',
  tional: 'tion',
  enci: 'ence',
  anci: 'ance',
  izer: 'ize',
  bli: 'ble',
  alli: 'al',
  entli: 'ent',
  eli: 'e',
  ousli: 'ous',
  ization: 'ize',
  ation: 'ate',
  ator: 'ate',
  alism: 'al',
  iveness: 'ive',
  fulness: 'ful',
  ousness: 'ous',
  aliti: 'al',
  iviti: 'ive',
  biliti: 'ble',
  logi: 'log',
};

const step3list = {
  icate: 'ic',
  ative: '',
  alize: 'al',
  iciti: 'ic',
  ical: 'ic',
  ful: '',
  ness: '',
};

// Consonant-vowel sequences.
const consonant = '[^aeiou]';
const vowel = '[aeiouy]';
const consonants = '(' + consonant + '[^aeiouy]*)';
const vowels = '(' + vowel + '[aeiou]*)';

const gt0 = new RegExp('^' + consonants + '?' + vowels + consonants);
const eq1 = new RegExp(
  '^' + consonants + '?' + vowels + consonants + vowels + '?$'
);
const gt1 = new RegExp('^' + consonants + '?(' + vowels + consonants + '){2,}');
const vowelInStem = new RegExp('^' + consonants + '?' + vowel);
const consonantLike = new RegExp('^' + consonants + vowel + '[^aeiouwxy]$');

// Exception expressions.
const sfxLl = /ll$/;
const sfxE = /^(.+?)e$/;
const sfxY = /^(.+?)y$/;
const sfxIon = /^(.+?(s|t))(ion)$/;
const sfxEdOrIng = /^(.+?)(ed|ing)$/;
const sfxAtOrBlOrIz = /(at|bl|iz)$/;
const sfxEED = /^(.+?)eed$/;
const sfxS = /^.+?[^s]s$/;
const sfxSsesOrIes = /^.+?(ss|i)es$/;
const sfxMultiConsonantLike = /([^aeiouylsz])\1$/;
const step2 = new RegExp(
  '^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$'
);
const step3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
const step4 = new RegExp(
  '^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$'
);

/**
 * Tokenizes a string:
 * - Split into words using a regular expression.
 * - Filter words that are under 2 characters long
 */
const tokenize = str =>
  str
    .split(/[^a-z0-9\-']+/i)
    .filter(tkn => !tkn.length < 2)
    .map(tkn => tkn.toLowerCase());

/**
 * Stems a string:
 * - Return as-is if under 3 characters long
 * - Use porter stemmer otherwise
 */
const stem = str => {
  // Exit early
  if (str.length < 3) return str;

  let firstCharacterWasLowerCaseY;
  let match;

  // Detect initial `y`, make sure it never matches.
  if (str.startsWith('y')) {
    firstCharacterWasLowerCaseY = true;
    str = 'Y' + str.slice(1);
  }

  // Step 1a.
  if (sfxSsesOrIes.test(str)) {
    // Remove last two characters.
    str = str.slice(0, -2);
  } else if (sfxS.test(str)) {
    // Remove last character.
    str = str.slice(0, -1);
  }

  // Step 1b.
  if ((match = sfxEED.exec(str))) {
    if (gt0.test(match[1])) {
      // Remove last character.
      str = str.slice(0, -1);
    }
  } else if ((match = sfxEdOrIng.exec(str)) && vowelInStem.test(match[1])) {
    str = match[1];

    if (sfxAtOrBlOrIz.test(str)) {
      // Append `e`.
      str += 'e';
    } else if (sfxMultiConsonantLike.test(str)) {
      // Remove last character.
      str = str.slice(0, -1);
    } else if (consonantLike.test(str)) {
      // Append `e`.
      str += 'e';
    }
  }

  // Step 1c.
  if ((match = sfxY.exec(str)) && vowelInStem.test(match[1])) {
    // Remove suffixing `y` and append `i`.
    str = match[1] + 'i';
  }

  // Step 2.
  if ((match = step2.exec(str)) && gt0.test(match[1]))
    str = match[1] + step2list[match[2]];

  // Step 3.
  if ((match = step3.exec(str)) && gt0.test(match[1]))
    str = match[1] + step3list[match[2]];

  // Step 4.
  if ((match = step4.exec(str))) {
    if (gt1.test(match[1])) str = match[1];
  } else if ((match = sfxIon.exec(str)) && gt1.test(match[1])) str = match[1];

  // Step 5.
  if (
    (match = sfxE.exec(str)) &&
    (gt1.test(match[1]) ||
      (eq1.test(match[1]) && !consonantLike.test(match[1])))
  )
    str = match[1];

  if (sfxLl.test(str) && gt1.test(str)) str = str.slice(0, -1);

  // Turn initial `Y` back to `y`.
  if (firstCharacterWasLowerCaseY) str = 'y' + str.slice(1);

  return str;
};

/**
 * Removes stop words from an array of words:
 * - Use the list of stop words to remove stop words from the given array
 */
const cleanStopWords = (stopWords, words) =>
  words.filter(tkn => !stopWords.includes(tkn));

const cleanServerStopWords = words => cleanStopWords(serverStopWords, words);
const cleanClientStopWords = words => cleanStopWords(clientStopWords, words);

/**
 * Deduplicates a list of tokens.
 */
const deduplicateTokens = tokens => [
  ...new Set(tokens.map(t => t.replace(/['-]$/, ''))),
];

/**
 * Given a string, produce a list of tokens (server-side variant).
 */
const parseTokens = str =>
  deduplicateTokens(
    cleanServerStopWords(tokenize(str)).map(tkn => stem(tkn))
  ).filter(
    tkn =>
      !!tkn &&
      tkn.length > 1 &&
      !/^-?\d+$/i.test(tkn) &&
      !/^[()[\]$^.;:|\\/%&*#@!%,"'~`\-+=]+$/i.test(tkn)
  );

/**
 * Given a string, produce a list of tokens (client-side variant).
 */
export const quickParseTokens = str =>
  deduplicateTokens(
    cleanClientStopWords(tokenize(str)).map(tkn => stem(tkn))
  ).filter(
    tkn =>
      !!tkn &&
      tkn.length > 1 &&
      !/^-?\d+$/i.test(tkn) &&
      !/^[()[\]$^.;:|\\/%&*#@!%,"'~`\-+=]+$/i.test(tkn)
  );

export default parseTokens;