wtetsu/deinja

View on GitHub
src/build.js

Summary

Maintainability
A
0 mins
Test Coverage
/*
 * deinja
 * Copyright (C) 2018 wtetsu
 * https://github.com/wtetsu/deinja
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

const Form = require("./form");
const Deinflection = require("./deinflection");
const TailSearcher = require("./tailsearcher");
const InflectionType = require("./inflectiontype");
const UniqList = require("uniqlist");

/**
 * Builds a converter function from inflection rule data.
 *
 * The searchers are created once from `data` and captured by the returned
 * function, so every conversion reuses the same searchers.
 *
 * @param {Object} data - Rule lists; handed unchanged to createSearchers.
 * @returns {function(string): Array} Function that runs deinflect on a word
 *   and returns the `baseForm` of every entry deinflect produced, in order.
 */
const build = (data) => {
  const searchers = createSearchers(data);
  const newConverter = (word) => {
    const deinflections = deinflect(word, searchers);
    return deinflections.map(({ baseForm }) => baseForm);
  };
  return newConverter;
};

// The `build` factory is this module's only export.
module.exports = build;

/**
 * One inflection rule record.
 *
 * deinflectWord matches `inflection` against the end of a word and swaps it
 * for `base`; `form` is carried along into the Deinflection entries created
 * from the rule.
 */
class Inflection {
  /**
   * @param {string} inflection - Ending that is matched against a word.
   * @param {string} base - Text that replaces the matched ending.
   * @param {*} form - Form identifier of the rule.
   */
  constructor(inflection, base, form) {
    Object.assign(this, { inflection, base, form });
  }
}

/**
 * Creates one tail searcher per rule group found in `data`.
 *
 * Every group except `bogus` is created with the key "inflection"; the
 * `bogus` searcher is created without a key.
 *
 * @param {Object} data - Object holding the lists `adjective`, `ichidan`,
 *   `godan`, `suru`, `kuru`, `special`, `iku` and `bogus`.
 * @returns {Object} Object with one searcher under each of those names.
 */
const createSearchers = (data) => {
  const keyedGroups = ["adjective", "ichidan", "godan", "suru", "kuru", "special", "iku"];

  const searchers = {};
  for (const group of keyedGroups) {
    searchers[group] = createTailSearcher(data[group], "inflection");
  }
  searchers.bogus = createTailSearcher(data.bogus);
  return searchers;
};

/**
 * Wraps a list of raw rule rows in a TailSearcher.
 *
 * Each row is spread into the Inflection constructor, so a row supplies
 * `inflection`, `base` and `form` positionally.
 *
 * @param {Array} list - Raw rule rows.
 * @param {string} [key] - Passed through as the second TailSearcher argument.
 * @returns {TailSearcher} Searcher built over the Inflection records.
 */
const createTailSearcher = (list, key) => {
  const toInflection = (row) => new Inflection(...row);
  return new TailSearcher(list.map(toInflection), key);
};

/**
 * Collects the deinflection candidates for a word.
 *
 * A UniqList is seeded with the word itself (form and type both -1). The
 * adjective, ichidan and godan passes then run through deinflectRegular,
 * followed by the suru, kuru, special and iku passes through
 * deinflectIrregular. The collected entries are finally passed through
 * filterBogusEndings.
 *
 * @param {string} inflectedWord - Word to deinflect.
 * @param {Object} searchers - Searchers as produced by createSearchers.
 * @returns {Array} Whatever filterBogusEndings returns for the collected list.
 */
const deinflect = (inflectedWord, searchers) => {
  const candidates = new UniqList();
  candidates.push(new Deinflection(inflectedWord, inflectedWord, -1, -1));

  // [searcher, inflection type, processAsAdded]
  const regularPasses = [
    [searchers.adjective, InflectionType.ADJECTIVE, true],
    [searchers.ichidan, InflectionType.ICHIDAN, true],
    [searchers.godan, InflectionType.GODAN, false],
  ];
  for (const [searcher, type, processAsAdded] of regularPasses) {
    deinflectRegular(candidates, searcher, type, processAsAdded);
  }

  // [searcher, inflection type]
  const irregularPasses = [
    [searchers.suru, InflectionType.SURU],
    [searchers.kuru, InflectionType.KURU],
    [searchers.special, InflectionType.SPECIAL],
    [searchers.iku, InflectionType.IKU],
  ];
  for (const [searcher, type] of irregularPasses) {
    deinflectIrregular(candidates, searcher, type);
  }

  return filterBogusEndings(candidates.array, searchers.bogus);
};

/**
 * Applies suffix-replacement rules to the entries of `terms`, appending every
 * accepted result to `terms`.
 *
 * For each visited entry the searcher is queried with the entry's `baseForm`
 * and every returned rule is tried:
 *  - an entry whose own `inflectionType` is ADJECTIVE only accepts rules
 *    whose form passes isAuxAdjective;
 *  - rules for which deinflectWord yields nothing are skipped;
 *  - in the ICHIDAN pass the result must also pass hasIchidanEnding.
 *
 * @param {Object} terms - List exposing size(), get(index) and
 *   push(item, key); it is mutated in place.
 * @param {Object} inflectionSearcher - Object whose search(word) returns the
 *   rules to try.
 * @param {*} inflectionType - Type recorded on every entry added by this call.
 * @param {boolean} processAsAdded - When true, entries appended during this
 *   call are visited as well; when false, only the entries present when the
 *   call started are visited.
 * @returns {void}
 */
const deinflectRegular = (terms, inflectionSearcher, inflectionType, processAsAdded) => {
  const initialSize = terms.size();
  const requiresIchidanEnding = inflectionType === InflectionType.ICHIDAN;

  for (let index = 0; index < terms.size() && (processAsAdded || index < initialSize); index += 1) {
    const source = terms.get(index);
    const onlyAuxForms = source.inflectionType === InflectionType.ADJECTIVE;

    for (const rule of inflectionSearcher.search(source.baseForm)) {
      if (onlyAuxForms && !isAuxAdjective(rule.form)) {
        continue;
      }

      const candidate = deinflectWord(source.baseForm, rule);
      if (!candidate) {
        continue;
      }

      if (requiresIchidanEnding && !hasIchidanEnding(candidate)) {
        continue;
      }

      // The candidate word doubles as the key handed to terms.push.
      terms.push(new Deinflection(source.baseForm, candidate, rule.form, inflectionType), candidate);
    }
  }
};

// Forms accepted by isAuxAdjective. deinflectRegular uses that check to limit
// which rules may be applied to an entry whose inflectionType is ADJECTIVE.
// NOTE(review): presumably these are the forms that themselves conjugate like
// adjectives — confirm against ./form.
const AUX_ADJECTIVE_TYPES = new Set([Form.TAI, Form.SOU, Form.NEGATIVE]);

/**
 * Tells whether a form is one of the AUX_ADJECTIVE_TYPES.
 *
 * @param {*} form - Form identifier of a rule.
 * @returns {boolean} True when the set contains `form`.
 */
const isAuxAdjective = (form) => AUX_ADJECTIVE_TYPES.has(form);

/**
 * Applies one rule to a word by replacing the rule's ending with its base.
 *
 * @param {string} inflectedWord - Word to transform.
 * @param {Object} inflection - Rule with `inflection` (ending to strip) and
 *   `base` (text to append instead).
 * @returns {?string} The rewritten word, or null when the word does not end
 *   with the rule's ending or the result is at most one character long.
 */
const deinflectWord = (inflectedWord, inflection) => {
  const { inflection: ending, base } = inflection;
  if (!inflectedWord.endsWith(ending)) {
    return null;
  }

  const stem = inflectedWord.slice(0, inflectedWord.length - ending.length);
  const baseWord = stem + base;
  return baseWord.length > 1 ? baseWord : null;
};

/**
 * Applies whole-word rules to the entries that are in `terms` when the call
 * starts, appending every result to `terms`.
 *
 * A rule applies only when its `inflection` equals the entry's `baseForm`
 * exactly; the new entry's base form is then the rule's `base`. Entries
 * appended during the call are not visited.
 *
 * @param {Object} terms - List exposing size(), get(index) and
 *   push(item, key); it is mutated in place.
 * @param {Object} inflectionSearcher - Object whose search(word) returns the
 *   rules to try.
 * @param {*} inflectionType - Type recorded on every entry added by this call.
 * @returns {void}
 */
const deinflectIrregular = (terms, inflectionSearcher, inflectionType) => {
  const initialSize = terms.size();

  for (let index = 0; index < initialSize; index += 1) {
    const { baseForm } = terms.get(index);

    for (const rule of inflectionSearcher.search(baseForm)) {
      if (rule.inflection !== baseForm) {
        continue;
      }
      // The rule's base doubles as the key handed to terms.push.
      terms.push(new Deinflection(baseForm, rule.base, rule.form, inflectionType), rule.base);
    }
  }
};

/**
 * Returns the entries worth reporting, in their original order.
 *
 * An entry is dropped when its `baseForm` equals its own `inflectedWord`, or
 * when `bogusSearcher.find` returns a truthy value for its `baseForm`. The
 * searcher is not consulted for entries dropped by the first check.
 *
 * @param {Array} terms - Deinflection entries.
 * @param {Object} bogusSearcher - Object whose find(word) is truthy for a
 *   base form that must be rejected.
 * @returns {Array} New array with the surviving entries.
 */
const filterBogusEndings = (terms, bogusSearcher) =>
  terms.filter(({ baseForm, inflectedWord }) => baseForm !== inflectedWord && !bogusSearcher.find(baseForm));

// Kana that hasIchidanEnding accepts as the second-to-last character of a
// word: the i-row and e-row kana listed below, in hiragana and katakana.
// Each string is spread into its individual characters.
const ICHIDAN = new Set([
  ..."いきぎしじちぢにひびぴみり",
  ..."イキギシジチヂニヒビピミリ",
  ..."えけげせぜてでねへべぺめれ",
  ..."エケゲセゼテデネヘベペメレ",
]);

/**
 * Checks whether the second-to-last character of a word is in ICHIDAN.
 *
 * @param {string} word - Candidate word.
 * @returns {boolean} False for words shorter than two characters; otherwise
 *   whether the character just before the last one is in the set.
 */
const hasIchidanEnding = (word) => {
  if (word.length < 2) {
    return false;
  }
  return ICHIDAN.has(word.charAt(word.length - 2));
};