Smile-SA/elasticsuite

View on GitHub
src/module-elasticsuite-core/etc/elasticsuite_analysis.xml

Summary

Maintainability
Test Coverage
<?xml version="1.0"?>
<!--
/**
 * Smile_ElasticsuiteCore default analysis configuration.
 *
 * DISCLAIMER
 *
 * Do not edit or add to this file if you wish to upgrade Smile ElasticSuite to newer
 * versions in the future.
 *
 * @category  Smile
 * @package   Smile\ElasticsuiteCore
 * @author    Aurelien FOUCRET <aurelien.foucret@smile.fr>
 * @copyright 2020 Smile
 * @license   Open Software License ("OSL") v. 3.0
 */
 -->
<analysis xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:noNamespaceSchemaLocation="urn:magento:module:Smile_ElasticsuiteCore:etc/elasticsuite_analysis.xsd">

    <!-- Character filters. Analyzers below pull them in by name through <char_filter ref="..."/>. -->
    <char_filters>
        <!-- The only character filter declared in this file; every analyzer except "keyword" references it. -->
        <char_filter name="html_strip" type="html_strip" language="default"/>
    </char_filters>

    <filters>
        <!-- Generic token filters, all declared for the "default" language. -->
        <filter name="trim" type="trim" language="default"/>
        <filter name="truncate_to_max" type="truncate" language="default">
            <!-- Absolute max supported by Lucene for a token length is 32766 bytes, so using 1/4 of that to accommodate multibyte UTF-8 characters -->
            <length>8192</length>
        </filter>
        <filter name="lowercase" type="lowercase" language="default"/>
        <!-- "word_delimiter": every generate/catenate/split option is on and the original token is kept. -->
        <filter name="word_delimiter" type="word_delimiter" language="default">
            <generate_word_parts>true</generate_word_parts>
            <catenate_words>true</catenate_words>
            <catenate_numbers>true</catenate_numbers>
            <catenate_all>true</catenate_all>
            <split_on_case_change>true</split_on_case_change>
            <split_on_numerics>true</split_on_numerics>
            <preserve_original>true</preserve_original>
        </filter>
        <!-- "shingle": shingles of 2 to 4 tokens, unigrams are not emitted. -->
        <filter name="shingle" type="shingle" language="default">
            <min_shingle_size>2</min_shingle_size>
            <max_shingle_size>4</max_shingle_size>
            <output_unigrams>false</output_unigrams>
        </filter>
        <!-- "reference_shingle": shingles of 2 to 10 tokens, unigrams kept, and an empty token separator
             (the parts of a shingle are glued together). Only used by the "reference" analyzer in this file. -->
        <filter name="reference_shingle" type="shingle" language="default">
            <min_shingle_size>2</min_shingle_size>
            <max_shingle_size>10</max_shingle_size>
            <output_unigrams>true</output_unigrams>
            <token_separator></token_separator>
        </filter>
        <!-- "reference_word_delimiter": same splitting options as "word_delimiter" but with all catenation
             turned off and the original token dropped. Only used by the "reference" analyzer in this file. -->
        <filter name="reference_word_delimiter" type="word_delimiter" language="default">
            <generate_word_parts>true</generate_word_parts>
            <catenate_words>false</catenate_words>
            <catenate_numbers>false</catenate_numbers>
            <catenate_all>false</catenate_all>
            <split_on_case_change>true</split_on_case_change>
            <split_on_numerics>true</split_on_numerics>
            <preserve_original>false</preserve_original>
        </filter>
        <!-- "ascii_folding": the original (unfolded) token is not kept. -->
        <filter name="ascii_folding" type="asciifolding" language="default">
            <preserve_original>false</preserve_original>
        </filter>

        <!--
            Per-language "stemmer" filters: one declaration per language code, all sharing the name "stemmer".
            The <language> child carries the stemmer identifier used for that language code.
            There is no "default" language entry for this filter in this file.
            Note that "de" uses "german2" and "nn" uses "light_nynorsk"; every other entry uses the
            plain language name.
        -->
        <filter name="stemmer" type="stemmer" language="ar">
            <language>arabic</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="eu">
            <language>basque</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="bg">
            <language>bulgarian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="ca">
            <language>catalan</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="cs">
            <language>czech</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="da">
            <language>danish</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="de">
            <language>german2</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="en">
            <language>english</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="es">
            <language>spanish</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="el">
            <language>greek</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="fi">
            <language>finnish</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="fr">
            <language>french</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="gl">
            <language>galician</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="hi">
            <language>hindi</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="hu">
            <language>hungarian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="id">
            <language>indonesian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="it">
            <language>italian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="lv">
            <language>latvian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="lt">
            <language>lithuanian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="nb">
            <language>norwegian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="nn">
            <language>light_nynorsk</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="nl">
            <language>dutch</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="pt">
            <language>portuguese</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="ro">
            <language>romanian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="ru">
            <language>russian</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="sv">
            <language>swedish</language>
        </filter>
        <filter name="stemmer" type="stemmer" language="tr">
            <language>turkish</language>
        </filter>
        <!-- Per-language "stemmer_override" filters (fr and it only). In the analyzers of this file they are
             always listed immediately before "stemmer". Rules are a JSON array of "word => stem" strings. -->
        <filter name="stemmer_override" type="stemmer_override" language="fr">
            <!-- Having an empty list of rules is valid.
                 This means existing rules can be either redefined or voided in a custom module. -->
            <!--<rules>[]</rules>-->
            <!-- Known issues with the "french" stemmer. -->
            <rules>["clous => clou", "verrous => verrou", "ecrous => ecrou", "jeux => jeu", "photos => photo", "clef => cle", "clefs => cle", "compas => compas"]</rules>
        </filter>
        <filter name="stemmer_override" type="stemmer_override" language="it">
            <!--
                Known issues with the "italian" stemmer. Please note that we do not claim 100% accuracy in the selected stems.
                For instance, the stem for "trapani" is originally "trapan", the same as the verb "trapanare".
                So the actual valid rule might be "trapano => trapan" ... -->
            <rules>["trapani => trap", "zanzariere => zanzarier", "lavatoi => lavatoi", "lamiere => lamier", "plafoniere => plafonier"]</rules>
        </filter>
        <!-- Per-language "elision" filters (ca, fr, it). <articles> is a JSON array of the elided
             article forms, written without the apostrophe. No "default" language entry is declared here. -->
        <filter name="elision" type="elision" language="ca">
            <articles>["d", "l", "m", "n", "s", "t"]</articles>
        </filter>
        <filter name="elision" type="elision" language="fr">
            <articles>["l", "m", "t", "qu", "n", "s", "j", "d", "c"]</articles>
        </filter>
        <filter name="elision" type="elision" language="it">
            <articles>["c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d"]</articles>
        </filter>
        <!-- "phonetic": metaphone encoder by default; French has its own declaration using
             beider_morse restricted to the "french" language set. -->
        <filter name="phonetic" type="phonetic" language="default">
            <encoder>metaphone</encoder>
        </filter>
        <filter name="phonetic" type="phonetic" language="fr">
            <encoder>beider_morse</encoder>
            <languageset>french</languageset>
        </filter>
        <!-- Greek-specific declaration of the "lowercase" filter (same name as the default one declared earlier). -->
        <filter name="lowercase" type="lowercase" language="el">
            <language>greek</language>
        </filter>
        <!-- Edge n-grams of 3 to 20 characters; used by the "standard_edge_ngram" analyzer. -->
        <filter name="edge_ngram_filter" type="edge_ngram" language="default">
            <min_gram>3</min_gram>
            <max_gram>20</max_gram>
        </filter>
        <!-- Zero-handling filters, chained in this order by the "reference" analyzer:
             drop leading zeroes, drop trailing zeroes, then collapse any remaining run of zeroes to a single "0". -->
        <filter name="trim_leading_zeroes" type="pattern_replace" language="default">
            <pattern><![CDATA[^0+]]></pattern>
            <replacement><![CDATA[]]></replacement>
        </filter>
        <filter name="trim_trailing_zeroes" type="pattern_replace" language="default">
            <pattern><![CDATA[0+$]]></pattern>
            <replacement><![CDATA[]]></replacement>
        </filter>
        <filter name="reduce_zeroes" type="pattern_replace" language="default">
            <pattern><![CDATA[0+]]></pattern>
            <replacement><![CDATA[0]]></replacement>
        </filter>
    </filters>

    <!--
        Analyzers, all declared for the "default" language.
        Each analyzer names its tokenizer, then lists its token filters in application order,
        then its character filters. References resolve against the names declared in
        <filters> and <char_filters>.
    -->
    <analyzers>
        <!-- keyword: keyword tokenizer, output only capped by "truncate_to_max". No character filter. -->
        <analyzer name="keyword" tokenizer="keyword" language="default">
            <filters>
                <filter ref="truncate_to_max"/>
            </filters>
        </analyzer>

        <!-- standard: full chain ending with stemming (overrides first, then the stemmer). -->
        <analyzer name="standard" tokenizer="standard" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="word_delimiter"/>
                <filter ref="lowercase"/>
                <filter ref="stemmer_override"/>
                <filter ref="stemmer"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- whitespace: same chain as "standard" minus the two stemming filters.
             NOTE(review): despite its name, this analyzer is declared with the "standard" tokenizer;
             confirm this is intended before changing it. -->
        <analyzer name="whitespace" tokenizer="standard" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="word_delimiter"/>
                <filter ref="lowercase"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- reference: uses the non-catenating word delimiter, normalizes zeroes
             (leading, trailing, then repeated), lowercases, and finally builds
             separator-less shingles with "reference_shingle". -->
        <analyzer name="reference" tokenizer="standard" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="reference_word_delimiter"/>
                <filter ref="trim_leading_zeroes"/>
                <filter ref="trim_trailing_zeroes"/>
                <filter ref="reduce_zeroes"/>
                <filter ref="lowercase"/>
                <filter ref="reference_shingle"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- shingle: the only analyzer here using the "whitespace" tokenizer; same filters as
             "standard" with "shingle" appended after stemming. -->
        <analyzer name="shingle" tokenizer="whitespace" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="word_delimiter"/>
                <filter ref="lowercase"/>
                <filter ref="stemmer_override"/>
                <filter ref="stemmer"/>
                <filter ref="shingle"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- sortable: keyword tokenizer with folding, trimming and lowercasing only. -->
        <analyzer name="sortable" tokenizer="keyword" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="lowercase"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- phonetic: same chain as "whitespace" with the "phonetic" filter appended. -->
        <analyzer name="phonetic" tokenizer="standard" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="word_delimiter"/>
                <filter ref="lowercase"/>
                <filter ref="phonetic"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>

        <!-- standard_edge_ngram: same chain as "standard" with "edge_ngram_filter" appended after stemming. -->
        <analyzer name="standard_edge_ngram" tokenizer="standard" language="default">
            <filters>
                <filter ref="ascii_folding"/>
                <filter ref="trim"/>
                <filter ref="elision"/>
                <filter ref="word_delimiter"/>
                <filter ref="lowercase"/>
                <filter ref="stemmer_override"/>
                <filter ref="stemmer"/>
                <filter ref="edge_ngram_filter"/>
            </filters>
            <char_filters>
                <char_filter ref="html_strip"/>
            </char_filters>
        </analyzer>
    </analyzers>

    <!-- Normalizers. Only "untouched" is declared, with an empty character filter list and no token filters. -->
    <normalizers>
        <normalizer name="untouched" type="custom" language="default">
            <char_filters></char_filters>
        </normalizer>

        <!--
            If you want case-insensitive matching of layered navigation filter options,
            you can replace the "untouched" normalizer above (which does nothing) with this one.
            Please note, though, that those filter options will then be displayed in lowercase
            in the layered navigation filters.
        <normalizer name="untouched" type="custom" language="default">
            <char_filters></char_filters>
            <filters>
                <filter ref="ascii_folding" />
                <filter ref="trim" />
                <filter ref="lowercase" />
            </filters>
        </normalizer>
        -->
    </normalizers>

    <stemmers>
        <!-- Each group lists the stemmer identifiers offered for one language code, with a display label;
             recommended="true" flags the suggested choice. For both groups below the recommended
             identifier is also the one used by the "stemmer" filter declared in <filters>. -->
        <group language="nl" title="Dutch">
            <stemmer identifier="dutch" recommended="true">
                <label>Dutch Snowball</label>
            </stemmer>
            <stemmer identifier="dutch_kp">
                <label>Dutch KP (Kraaij-Pohlmann)</label>
            </stemmer>
        </group>
        <group language="en" title="English">
            <stemmer identifier="english" recommended="true">
                <label>English Porter</label>
            </stemmer>
            <stemmer identifier="light_english">
                <label>Light English (Krovetz, 2000)</label>
            </stemmer>
            <stemmer identifier="lovins">
                <label>English Lovins</label>
            </stemmer>
            <stemmer identifier="minimal_english">
                <label>Minimal English (Harman, 1991)</label>
            </stemmer>
            <stemmer identifier="porter2">
                <label>English Porter2</label>
            </stemmer>
            <stemmer identifier="possessive_english">
                <label>English Possessive</label>
            </stemmer>
        </group>
        <!-- Finnish stemmers. The identifiers are the language names passed to the Elasticsearch
             "stemmer" token filter, so they must be spelled "finnish" / "light_finnish";
             "finnish" is also the value used by the "fi" stemmer filter declared in <filters>. -->
        <group language="fi" title="Finnish">
            <stemmer identifier="finnish" recommended="true">
                <label>Finnish Snowball</label>
            </stemmer>
            <stemmer identifier="light_finnish">
                <label>Light Finnish (Savoy, 2003)</label>
            </stemmer>
        </group>
        <!--
            Stemmer choices for the remaining languages; recommended="true" flags the suggested choice.
            NOTE(review): for fr, de, it, pt and es the recommended identifier is a "light_*" stemmer,
            while the "stemmer" filter declared in <filters> for the same language code uses
            french, german2, italian, portuguese and spanish respectively. Presumably the filter
            default is kept for backward compatibility; confirm before aligning them.
        -->
        <group language="fr" title="French">
            <stemmer identifier="light_french" recommended="true">
                <label>Light French (Savoy, 2006)</label>
            </stemmer>
            <stemmer identifier="french">
                <label>French Snowball</label>
            </stemmer>
            <stemmer identifier="minimal_french">
                <label>Minimal French (Savoy, 1999)</label>
            </stemmer>
        </group>
        <group language="gl" title="Galician">
            <stemmer identifier="galician" recommended="true">
                <label>Galician</label>
            </stemmer>
            <stemmer identifier="minimal_galician">
                <label>Minimal Galician (Singular/Plural only)</label>
            </stemmer>
        </group>
        <group language="de" title="German">
            <stemmer identifier="light_german" recommended="true">
                <label>Light German (Savoy, 2006)</label>
            </stemmer>
            <stemmer identifier="german">
                <label>German Snowball</label>
            </stemmer>
            <stemmer identifier="german2">
                <label>German2 Snowball variant</label>
            </stemmer>
            <stemmer identifier="minimal_german">
                <label>Minimal German (Savoy, 2002a)</label>
            </stemmer>
        </group>
        <group language="hu" title="Hungarian">
            <stemmer identifier="hungarian" recommended="true">
                <label>Hungarian (Savoy, 2002a)</label>
            </stemmer>
            <stemmer identifier="light_hungarian">
                <label>Light Hungarian (Savoy, 2006)</label>
            </stemmer>
        </group>
        <group language="it" title="Italian">
            <stemmer identifier="light_italian" recommended="true">
                <label>Light Italian (Savoy, 2001)</label>
            </stemmer>
            <stemmer identifier="italian">
                <label>Italian Snowball</label>
            </stemmer>
        </group>
        <!-- NOTE(review): this is the only group with two entries flagged recommended="true"
             (norwegian and light_norwegian); confirm consumers of this list accept more than one. -->
        <group language="nb" title="Norwegian (Bokmal)">
            <stemmer identifier="norwegian" recommended="true">
                <label>Norwegian Snowball</label>
            </stemmer>
            <stemmer identifier="light_norwegian" recommended="true">
                <label>Light Norwegian (variant of Light Swedish)</label>
            </stemmer>
            <stemmer identifier="minimal_norwegian">
                <label>Minimal Norwegian</label>
            </stemmer>
        </group>
        <group language="nn" title="Norwegian (Nynorsk)">
            <stemmer identifier="light_nynorsk" recommended="true">
                <label>Light Nynorsk (variant of Light Swedish)</label>
            </stemmer>
            <stemmer identifier="minimal_nynorsk">
                <label>Minimal Nynorsk</label>
            </stemmer>
        </group>
        <group language="pt" title="Portuguese">
            <stemmer identifier="light_portuguese" recommended="true">
                <label>Light Portuguese (Savoy, 2006)</label>
            </stemmer>
            <stemmer identifier="minimal_portuguese">
                <label>Minimal Portuguese (Orengo, 2007)</label>
            </stemmer>
            <stemmer identifier="portuguese">
                <label>Portuguese Snowball</label>
            </stemmer>
            <stemmer identifier="portuguese_rslp">
                <!-- CDATA is used here because the label contains a literal ampersand. -->
                <label><![CDATA[Portuguese RSLP (Orengo & Huyck, 2001)]]></label>
            </stemmer>
        </group>
        <group language="ru" title="Russian">
            <stemmer identifier="russian" recommended="true">
                <label>Russian Snowball</label>
            </stemmer>
            <stemmer identifier="light_russian">
                <label>Light Russian (Dolamic, 2009)</label>
            </stemmer>
        </group>
        <group language="es" title="Spanish">
            <stemmer identifier="light_spanish" recommended="true">
                <label>Light Spanish (Savoy, 2001)</label>
            </stemmer>
            <stemmer identifier="spanish">
                <label>Spanish Snowball</label>
            </stemmer>
            <stemmer identifier="spanish_plural">
                <label>Spanish Plural (Singular/Plural form only)</label>
            </stemmer>
        </group>
        <group language="sv" title="Swedish">
            <stemmer identifier="swedish" recommended="true">
                <label>Swedish Snowball</label>
            </stemmer>
            <stemmer identifier="light_swedish">
                <label>Light Swedish (Savoy, 2003)</label>
            </stemmer>
        </group>
    </stemmers>
</analysis>