// src/main/core/entry/en.js
/**
* Mouse Dictionary (https://github.com/wtetsu/mouse-dictionary/)
* Copyright 2018-present wtetsu
* Licensed under MIT
*/
import UniqList from "uniqlist";
import rule from "../rule";
import text from "../../lib/text";
const RE_UNNECESSARY_CHARACTERS = new RegExp(String.fromCharCode(0x200c), "g");
const RE_SLASH = new RegExp("/", "g");
/**
 * Builds the list of lookup candidates for an English source text.
 *
 * @param {string} rawSourceStr - Text to analyze.
 * @param {boolean} [withCapitalized=false] - When true, an upper-cased copy of
 *   every candidate collected so far is merged in as well.
 * @param {boolean} [mustIncludeOriginalText=false] - When true and the raw text
 *   is not already a candidate, it is prepended to the result.
 * @returns {string[]} Candidates de-duplicated through UniqList.
 */
const createLookupWordsEn = (rawSourceStr, withCapitalized = false, mustIncludeOriginalText = false) => {
  const replacedSourceStr = rawSourceStr.replace(RE_UNNECESSARY_CHARACTERS, "").replace(RE_SLASH, " / ");
  const sourceStr = text.dealWithHyphens(replacedSourceStr, rule.doLetters);

  const { firstWords, linkedWords } = processSourceString(sourceStr);

  // Declared BEFORE the filter that closes over it. The filter runs as soon as
  // entries are added (assuming UniqList consults `filer` on push/merge), and
  // reading a `const` ahead of its declaration throws a ReferenceError for any
  // candidate shorter than two characters.
  const firstWord = firstWords?.[0];

  const lookupWords = new UniqList();
  // Drop one-character candidates unless the candidate is the first word itself.
  lookupWords.filer = (s) => s.length >= 2 || s === firstWord;
  lookupWords.merge(linkedWords);

  if (firstWord) {
    lookupWords.merge(processFirstWord(firstWord));
  }

  const slashWords = createSlashWord(firstWords);
  if (slashWords) {
    lookupWords.merge(slashWords);
  }

  if (withCapitalized) {
    lookupWords.merge(lookupWords.toArray().map((s) => s.toUpperCase()));
  }

  // Titled expressions are merged after the upper-casing step, so they do not
  // get an upper-cased copy of their own.
  lookupWords.merge(generateTitledExpressions(firstWords));

  if (mustIncludeOriginalText && !lookupWords.keys.has(rawSourceStr)) {
    return [rawSourceStr, ...lookupWords.toArray()];
  }
  return lookupWords.toArray();
};
/**
 * Builds cumulative title-cased phrases from the leading words.
 * ["united", "kingdom"] -> ["United", "United Kingdom"]
 *
 * The single-word entry is only emitted when it has at least two characters,
 * and at most the first six words take part in the phrases.
 *
 * @param {string[]} [words] - Words to combine; a missing or empty list yields [].
 * @returns {string[]} Title-cased phrases, shortest first.
 */
const generateTitledExpressions = (words) => {
  if (!(words?.length >= 1)) {
    return [];
  }

  const titled = [];
  let phrase = toTitle(words[0]);
  if (phrase.length >= 2) {
    titled.push(phrase);
  }

  const limit = Math.min(words.length, 6);
  for (let index = 1; index < limit; index += 1) {
    phrase = `${phrase} ${toTitle(words[index])}`;
    titled.push(phrase);
  }
  return titled;
};
/**
 * Expands the source string into linked-word candidates and picks the word
 * list used for first-word processing.
 *
 * The string is analyzed as written; when it contains upper-case letters its
 * lower-cased form is analyzed too. Strings returned by fetchQuotedStrings are
 * analyzed last.
 *
 * NOTE(review): for mixed-case input `firstWords` comes from the lower-cased
 * word list — confirm that this is intended.
 *
 * @param {string} sourceStr - Pre-processed source text.
 * @returns {{firstWords: (string[]|undefined), linkedWords: string[]}}
 */
const processSourceString = (sourceStr) => {
  const linkedWords = [];
  const appendLinked = (wordsList, treatAsLower) => {
    for (const words of wordsList) {
      linkedWords.push(...createLinkedWords(words, treatAsLower));
    }
  };

  const lowered = sourceStr.toLowerCase();
  const originalWordsList = createWordsList(sourceStr);

  let firstWords;
  if (lowered === sourceStr) {
    appendLinked(originalWordsList, true);
    firstWords = originalWordsList[0];
  } else {
    appendLinked(originalWordsList, false);
    const loweredWordsList = createWordsList(lowered);
    appendLinked(loweredWordsList, true);
    firstWords = loweredWordsList[0];
  }

  for (const quoted of fetchQuotedStrings(sourceStr)) {
    appendLinked(createWordsList(quoted), true);
  }

  return { firstWords, linkedWords };
};
/**
 * Collects every variant derived from the first word: hyphen handling,
 * two-word splits and repeated-letter reduction, in that order.
 *
 * @param {string} firstWord - The first word of the source text.
 * @returns {string[]} Concatenated variants (not de-duplicated here).
 */
const processFirstWord = (firstWord) => {
  const hyphenVariants = dealWithFirstWordHyphen(firstWord);
  const twoWordSplits = divideIntoTwoWords(firstWord);
  const reducedLetters = cutDuplicatedLetters(firstWord);
  return [].concat(hyphenVariants, twoWordSplits, reducedLetters);
};
/**
 * Rebuilds a "left/right" expression when the second word is a lone slash.
 * ["and", "/", "or"] -> ["and/or", "and/or"]
 *
 * @param {string[]} [wordList] - Words of the source text.
 * @returns {string[]|null} The joined word and its lower-cased form, or null
 *   when the list is missing or does not have the "x / y" shape.
 */
const createSlashWord = (wordList) => {
  const hasSlashShape = Boolean(wordList) && wordList[1] === "/" && wordList.length >= 3;
  if (!hasSlashShape) {
    return null;
  }
  const joined = `${wordList[0]}/${wordList[2]}`;
  return [joined, joined.toLowerCase()];
};
// Separators used when re-joining the parts of a hyphenated word.
const JOINER_LIST = ["-", "", " "];

/**
 * Generates variants of a hyphenated first word.
 * "ladies-in-waiting" -> ["ladies-in-waiting", "lady-in-waiting", ...]
 *
 * The hyphen-less form is always added. Then, for every base form that
 * rule.doBase returns for the leading part, the parts are re-joined with each
 * entry of JOINER_LIST. Every variant is added both as-is and lower-cased.
 *
 * @param {string} theFirstWord - Possibly hyphenated word.
 * @returns {string[]} Unique variants, or [] when the word has no hyphen.
 */
const dealWithFirstWordHyphen = (theFirstWord) => {
  const segments = theFirstWord.split("-");
  if (segments.length < 2) {
    return [];
  }

  const [head, ...tail] = segments;
  const variants = new UniqList();
  const addWithLowerCase = (candidate) => {
    variants.push(candidate);
    variants.push(candidate.toLowerCase());
  };

  addWithLowerCase(segments.join(""));

  for (const baseForm of rule.doBase(head)) {
    const parts = [baseForm, ...tail];
    for (const joiner of JOINER_LIST) {
      addWithLowerCase(parts.join(joiner));
    }
  }
  return variants.toArray();
};
/**
 * Splits a word at every position that leaves at least two characters on each
 * side, producing a space-joined and a hyphen-joined pair per position.
 * "abcd" -> ["ab cd", "ab-cd"]
 *
 * @param {string} str - Word to split.
 * @returns {string[]} Candidate pairs; [] for words shorter than four characters.
 */
const divideIntoTwoWords = (str) => {
  const cutCount = Math.max(str.length - 3, 0);
  const cutPositions = Array.from({ length: cutCount }, (_, offset) => offset + 2);
  return cutPositions.flatMap((cut) => {
    const head = str.substring(0, cut);
    const tail = str.substring(cut);
    return [`${head} ${tail}`, `${head}-${tail}`];
  });
};
/**
 * Collapses the first run of three or more identical characters.
 * craaaaaaaaaaaaazy -> crazy (and craazy)
 * snoooooooze -> snoze (and snooze)
 *
 * @param {string} str - Word to inspect.
 * @returns {string[]} Two variants, with the run reduced to one and to two
 *   characters, or [] when no such run is found.
 */
const cutDuplicatedLetters = (str) => {
  let runCode = 0;
  let runLength = 0;
  let runStart = -1;
  let runEnd = -1;

  for (let pos = 0; pos < str.length; pos += 1) {
    const current = str.charCodeAt(pos);
    if (current === runCode) {
      runLength += 1;
      continue;
    }
    if (runLength >= 3) {
      // The run that just ended is long enough; stop at its end.
      runEnd = pos;
      break;
    }
    // Start tracking a new run at this position.
    runCode = current;
    runLength = 1;
    runStart = pos;
  }

  // A qualifying run that reaches the end of the string never hits the break above.
  if (runEnd === -1 && runLength >= 3) {
    runEnd = str.length;
  }

  if (runStart === -1 || runEnd === -1) {
    return [];
  }

  const head = str.substring(0, runStart);
  const letter = String.fromCharCode(runCode);
  const tail = str.substring(runEnd);
  return [`${head}${letter}${tail}`, `${head}${letter}${letter}${tail}`];
};
/**
 * Character predicate handed to text.splitIntoWords; forwards only its first
 * argument to rule.doLetters and returns that result unchanged.
 */
const isValidCharacter = (ch) => {
  return rule.doLetters(ch);
};
/**
 * Produces the word lists to analyze for a string, in this order:
 *   1. the words before the first "," or "." (only when that break is at
 *      index 2 or later),
 *   2. the words of the whole string,
 *   3. the list returned by rule.doSpelling for those words, when truthy.
 *
 * @param {string} str - Text to split.
 * @returns {string[][]} Word lists; [] for an empty or missing string.
 */
const createWordsList = (str) => {
  if (!str) {
    return [];
  }
  const toWords = (fragment) => text.splitIntoWords(fragment, isValidCharacter);

  const breakIndex = findBreak(str);
  const leading = breakIndex >= 2 ? [toWords(str.substring(0, breakIndex))] : [];

  const words = toWords(str);
  const respelled = rule.doSpelling(words);

  return respelled ? [...leading, words, respelled] : [...leading, words];
};
/**
 * Finds the first comma (code 44) or period (code 46) in a string.
 *
 * @param {string} str - Text to scan.
 * @returns {number} Index of the first "," or ".", or -1 when there is none.
 */
const findBreak = (str) => str.search(/[,.]/);
/**
 * Builds lookup candidates for one word list.
 *
 * Besides the linked words of the list itself, lists of two or more words also
 * contribute the linked words of every list returned by rule.doPronoun
 * (["on", "my", "own"] -> [["on", "one's", "own"], ["on", "someone's", "own"]])
 * and the first two words glued together ("self taught" -> "selftaught").
 *
 * @param {string[]} words - Words of the phrase.
 * @param {boolean} isAllLower - Forwarded to createLinkedWordList.
 * @returns {string[]} Candidates in generation order.
 */
const createLinkedWords = (words, isAllLower) => {
  const candidates = [...createLinkedWordList(words, isAllLower, 1)];
  if (words.length < 2) {
    return candidates;
  }

  for (const converted of rule.doPronoun(words)) {
    if (!converted) {
      continue;
    }
    // Converted lists only contribute phrases of two or more words.
    candidates.push(...createLinkedWordList(converted, isAllLower, 2));
  }

  candidates.push(`${words[0]}${words[1]}`);
  return candidates;
};
// Rule table passed to text.tryToReplaceTrailingStrings.
// Each inner array is one group of { search, new } pairs. Presumably a trailing
// `search` string is replaced with `new` (e.g. "iest" -> "y") — confirm against
// the implementation in lib/text.
const TRAILING_RULES = [
[
{ search: "'s", new: "" },
{ search: "s", new: "" },
],
[{ search: "er", new: "" }],
[{ search: "iest", new: "y" }],
[{ search: "est", new: "" }],
];
/**
 * Links the given words into phrases and adds suffix-rewritten variants.
 *
 * ['cut', 'back'] -> [ 'cut back', 'cut' ]
 * [ 'ran', 'with' ] -> [ 'ran with', 'ran', 'run with', 'run' ]
 *
 * @param {string[]} arr - Words to link.
 * @param {boolean} allLowercase - Passed to text.linkWords as its third
 *   argument and to parseFirstWord as its ignoreLowerCase flag.
 * @param {number} [minWordNum=1] - Minimum phrase size passed to
 *   text.linkWords; the first word is only parsed on its own when this is <= 1.
 * @returns {string[]} The array from text.linkWords, extended in place.
 */
const createLinkedWordList = (arr, allLowercase, minWordNum = 1) => {
  const phrases = text.linkWords(arr, minWordNum, allLowercase);

  if (minWordNum <= 1) {
    phrases.push(...parseFirstWord(arr[0], allLowercase));
  }

  // Collected separately so the list is not extended while it is being iterated.
  const rewritten = [];
  for (const phrase of phrases) {
    rewritten.push(...text.tryToReplaceTrailingStrings(phrase, TRAILING_RULES));
  }
  phrases.push(...rewritten);

  return phrases;
};
/**
 * Derives lookup candidates from a single (possibly compound) word.
 *
 * "wordone-wordtwo-wordthree" -> ["wordone", "wordtwo", "wordthree", "-wordthree"]
 * "Announcements" -> ["Announcement", "announcements", "announcement"]
 * "third-party" -> ["third party", "third", "party", "-party"]
 *
 * @param {string} sourceStr - Word to analyze; a falsy value yields [].
 * @param {boolean} ignoreLowerCase - When true only sourceStr itself is
 *   analyzed; otherwise its lower-cased form is analyzed too (if different).
 * @param {number} [minLength=3] - Candidates shorter than this are dropped.
 * @returns {string[]} Unique candidates.
 */
const parseFirstWord = (sourceStr, ignoreLowerCase, minLength = 3) => {
  if (!sourceStr) {
    return [];
  }

  const collected = new UniqList();
  collected.filer = (candidate) => candidate.length >= minLength;

  const variants = [sourceStr];
  if (!ignoreLowerCase) {
    const lowered = sourceStr.toLowerCase();
    if (lowered !== sourceStr) {
      variants.push(lowered);
    }
  }

  variants.forEach((variant, variantIndex) => {
    // The original spelling itself is not added here; only the lower-cased one is.
    if (variantIndex >= 1) {
      collected.push(variant);
    }
    collected.merge(text.tryToReplaceTrailingStrings(variant, TRAILING_RULES));

    const parts = text.splitString(variant, 2);
    const isCompound = parts.length >= 2;
    if (isCompound) {
      collected.push(parts.join(" "));
    }
    collected.merge(parts);

    for (const baseForms of parts.map(rule.doBase)) {
      collected.merge(baseForms);
    }

    if (!isCompound) {
      return;
    }

    // Prefix form ("third-") when a hyphen directly follows the first part.
    const head = parts[0];
    if (isHyphenLikeCharacter(sourceStr, head.length)) {
      collected.push(`${head}-`);
    }

    // Postfix form ("-party") when a hyphen directly precedes the last part.
    const tail = parts.at(-1);
    if (isHyphenLikeCharacter(sourceStr, sourceStr.length - tail.length - 1)) {
      collected.push(`-${tail}`);
    }
  });

  return collected.toArray();
};
/**
 * Tells whether the character at `position` is a hyphen-like character.
 * Direct code comparison is kept on purpose: it is fast enough, and much
 * faster than a Set lookup.
 *
 * @param {string} sourceStr - String to inspect.
 * @param {number} position - Index of the character to test.
 * @returns {boolean} True for code 45 ("-") or code 8209 (U+2011).
 */
const isHyphenLikeCharacter = (sourceStr, position) => {
  switch (sourceStr.charCodeAt(position)) {
    case 45:
    case 8209:
      return true;
    default:
      return false;
  }
};
// Quote characters examined by fetchQuotedStrings, in this order.
const QUOTE_CHARS = ['"', "'"];

/**
 * Extracts the text that precedes the first quote character found at index 1
 * or later, for each entry of QUOTE_CHARS. A leading quote of the same kind is
 * skipped. Nothing is extracted when that quote sits before index 3.
 * The lower-cased form is added as well when it differs.
 *
 * @param {string} str - Text to inspect.
 * @returns {string[]} Extracted strings, possibly empty.
 */
const fetchQuotedStrings = (str) =>
  QUOTE_CHARS.flatMap((quote) => {
    const closingIndex = str.indexOf(quote, 1);
    if (closingIndex < 3) {
      return [];
    }
    const openingOffset = str.startsWith(quote) ? 1 : 0;
    const quoted = str.substring(openingOffset, closingIndex);
    const lowered = quoted.toLowerCase();
    return lowered === quoted ? [quoted] : [quoted, lowered];
  });
/**
 * Upper-cases the first character of a string and lower-cases the rest.
 * "kINGDOM" -> "Kingdom"
 *
 * @param {string} str - Word to convert.
 * @returns {string} Title-cased word ("" for an empty string).
 */
const toTitle = (str) => {
  const initial = str.charAt(0);
  const remainder = str.slice(1);
  return `${initial.toUpperCase()}${remainder.toLowerCase()}`;
};
export default createLookupWordsEn;