wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
profiles/FullTextQueryBuilderProfiles.config.php

Summary

Maintainability
A
3 hrs
Test Coverage
<?php

namespace CirrusSearch;

/**
 * CirrusSearch - List of FullTextQueryBuilderProfiles used to generate an elasticsearch
 * query by parsing user input.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

// Map of profile name => profile definition. Every profile below provides:
//  - 'builder_class': the query builder class name
//  - 'settings': an array of builder settings (empty for 'default')
return [
    // Default query builder, built on top of QueryString
    'default' => [
        'builder_class' => Query\FullTextQueryStringQueryBuilder::class,
        'settings' => [],
    ],
    // Full-text query based on simple match queries, suited to the browser tests.
    // Not necessarily good for real-world wikis.
    'browser_tests' => [
        'builder_class' => Query\FullTextSimpleMatchQueryBuilder::class,
        // Adjusted according to tests/browser/features/relevancy_api.feature
        // and a fresh index (no deletes) and bm25 defaults for all fields
        // title > redirects > category > heading > opening > text > aux
        // These settings might not be ideal with a real index and real word norms
        'settings' => [
            'default_min_should_match' => '1',
            'default_query_type' => 'most_fields',
            'default_stem_weight' => 0.3,
            'fields' => [
                // Very high title weight for features/create_new_page.feature:23
                // Make sure that Catapult wins over Catapult/adsf despite not
                // having Catapult in the content
                'title' => 2.3,
                'redirect.title' => [
                    'boost' => 2.0,
                    'in_dismax' => 'redirects_or_shingles'
                ],
                // Shingles on title+redirect. suggest is currently
                // analyzed only with plain, so we include it in a
                // dismax with redirects
                'suggest' => [
                    'is_plain' => true,
                    'boost' => 1.05,
                    'in_dismax' => 'redirects_or_shingles',
                ],
                // category should win over heading/opening
                'category' => 1.8,
                'heading' => 1.3,
                // Pack text and opening_text in a dismax query
                // to avoid scoring the same words twice
                'text' => [
                    'boost' => 0.4,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'opening_text' => [
                    'boost' => 0.5,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'auxiliary_text' => 0.2,
                'file_text' => 0.2,
            ],
            'phrase_rescore_fields' => [
                // Low boost to counter high phrase rescore boost
                'text' => 0.14,
                // Higher on text.plain for tests/browser/features/relevancy_api.feature:106
                'text.plain' => 0.2,
            ],
            'dismax_settings' => [
                // Use a tie breaker: avg field length is so
                // low for opening_text that we would have to
                // set an insanely high boost to make sure it
                // wins over text in the dismax. Instead we use
                // a tie breaker that will add 20% of the score
                // of the opening_text clauses
                'text_and_opening_text' => [
                    'tie_breaker' => 0.2,
                ],
            ],
        ]
    ],

    // Per field builder tuned for en.wikipedia.org
    'perfield_builder' => [
        'builder_class' => \CirrusSearch\Query\FullTextSimpleMatchQueryBuilder::class,
        'settings' => [
            'default_min_should_match' => '1',
            'default_query_type' => 'most_fields',
            'default_stem_weight' => 3.0,
            'fields' => [
                'title' => 0.3,
                'redirect.title' => [
                    'boost' => 0.27,
                    'in_dismax' => 'redirects_or_shingles'
                ],
                'suggest' => [
                    'is_plain' => true,
                    'boost' => 0.20,
                    'in_dismax' => 'redirects_or_shingles',
                ],
                'category' => 0.05,
                'heading' => 0.05,
                'text' => [
                    'boost' => 0.6,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'opening_text' => [
                    'boost' => 0.5,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'auxiliary_text' => 0.05,
                'file_text' => 0.5,
            ],
            'phrase_rescore_fields' => [
                // Very low (don't forget it's multiplied by 10 by default).
                // Use the all field to avoid loading positions on another field;
                // the score is roughly the same as when used on text
                'all' => 0.06,
                'all.plain' => 0.1,
            ],
        ],
    ],
    // Per field builder tuned for searching crossproject where a strong
    // title match is required.
    // The defaults, field weights and phrase rescore fields below are identical
    // to those of 'perfield_builder'; only the 'filter' section is additional.
    'perfield_builder_title_filter' => [
        'builder_class' => \CirrusSearch\Query\FullTextSimpleMatchQueryBuilder::class,
        'settings' => [
            'default_min_should_match' => '1',
            'default_query_type' => 'most_fields',
            'default_stem_weight' => 3.0,
            'filter' => [
                // Similar to the default filter (all terms must match
                // in the content) + an additional constraint on title/redirect
                // which can be relaxed with minimum_should_match (defaults
                // to 3<80%)
                'type' => 'constrain_title',
                'settings' => [
                    'minimum_should_match' => '3<80%'
                ],
            ],
            'fields' => [
                'title' => 0.3,
                'redirect.title' => [
                    'boost' => 0.27,
                    'in_dismax' => 'redirects_or_shingles'
                ],
                'suggest' => [
                    'is_plain' => true,
                    'boost' => 0.20,
                    'in_dismax' => 'redirects_or_shingles',
                ],
                'category' => 0.05,
                'heading' => 0.05,
                'text' => [
                    'boost' => 0.6,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'opening_text' => [
                    'boost' => 0.5,
                    'in_dismax' => 'text_and_opening_text',
                ],
                'auxiliary_text' => 0.05,
                'file_text' => 0.5,
            ],
            'phrase_rescore_fields' => [
                // Very low (don't forget it's multiplied by 10 by default).
                // Use the all field to avoid loading positions on another field;
                // the score is roughly the same as when used on text
                'all' => 0.06,
                'all.plain' => 0.1,
            ],
        ],
    ],
];