wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Search/Escaper.php

Summary

Maintainability
A
2 hrs
Test Coverage
<?php

namespace CirrusSearch\Search;

/**
 * Escapes queries.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class Escaper {

    /**
     * @var string MediaWiki language code
     */
    private $language;

    /**
     * Allow leading wildcards?
     * @var bool
     */
    private $allowLeadingWildcard;

    /**
     * @param string $language MediaWiki language code
     * @param bool $allowLeadingWildcard
     */
    public function __construct( $language, $allowLeadingWildcard = true ) {
        $this->language = $language;
        $this->allowLeadingWildcard = $allowLeadingWildcard;
    }

    /**
     * Escape double quotes that look like Hebrew acronym markers rather than
     * phrase delimiters. Text in any language other than 'he' is returned
     * unchanged.
     *
     * @param string $text
     * @return string
     */
    public function escapeQuotes( $text ) {
        if ( $this->language === 'he' ) {
            // Hebrew uses the double quote (") character as a stand-in for quotation marks (“”)
            // which delineate phrases.  It also uses double quotes as a stand-in for another
            // character (״), called a Gershayim, which marks acronyms.  Here we guess if the
            // intent was to mark a phrase, in which case we leave the quotes alone, or to mark
            // an acronym, in which case we escape them.
            // A quote is treated as an acronym marker when it is directly preceded by a
            // character that is neither whitespace nor a backslash and directly followed by a
            // non-whitespace character.
            return preg_replace( '/(?<=[^\s\\\\])"(?=\S)/u', '\\"', $text );
        }
        return $text;
    }

    /**
     * Make sure the query string part is well formed by escaping some syntax that we don't
     * want users to get direct access to and making sure quotes are balanced.
     * These special characters _aren't_ escaped:
     * * and ?: Do a wildcard search against the stemmed text which isn't strictly a good
     * idea but this is so rarely used that adding extra code to flip prefix searches into
     * real prefix searches isn't really worth it.
     * ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it
     * gets the job done and fuzzy matches are a really rarely used feature to be creating an
     * extra index for.
     * ": Perform a phrase search for the quoted term.  If the "s aren't balanced we insert one
     * at the end of the term to make sure elasticsearch doesn't barf at us.
     *
     * Forward slashes are not escaped but replaced with a space.
     *
     * @param string $string
     * @return string
     */
    public function fixupQueryStringPart( $string ) {
        // Escape characters that can be escaped with \\
        $string = preg_replace( '/(
                \(|     (?# no user supplied groupings)
                \)|
                \{|     (?# no exclusive range queries)
                }|
                \[|     (?# no inclusive range queries either)
                ]|
                \^|     (?# no user supplied boosts at this point, though I cant think why)
                :|        (?# no specifying your own fields)
                \\\(?!") (?# the only acceptable escaping is for quotes)
            )/x', '\\\$1', $string );
        // Forward slash escaping doesn't work properly in all environments so we just eat them.   Nom.
        $string = str_replace( '/', ' ', $string );

        // Elasticsearch's query strings can't abide unbalanced quotes
        return $this->balanceQuotes( $string );
    }

    /**
     * Make sure that all operators and lucene syntax are used correctly in the query string.
     * Where they aren't, the syntax is escaped so it becomes part of the query text.
     *
     * @param string $string
     * @return string fixed up query string
     */
    public function fixupWholeQueryString( $string ) {
        // Prefixes every character of the match that is neither whitespace nor a word
        // character with a backslash.
        $escapeBadSyntax = static function ( $matches ) {
            return preg_replace( '/(?=[^\s\w])/', '\\', $matches[0] );
        };

        // Be careful when editing this method because the ordering of the replacements matters.

        // Escape ~ that don't follow a term or a quote
        $string = preg_replace_callback( '/(?<![\w"])~/u', $escapeBadSyntax, $string );

        // When allow leading wildcard is disabled elasticsearch will report an
        // error if these are unescaped. Escape ? and * that don't follow a term.
        if ( !$this->allowLeadingWildcard ) {
            $string = preg_replace_callback( '/(?<!\w)[?*]/u', $escapeBadSyntax, $string );
        }

        // Reduce token ranges to bare tokens without the < or >
        $string = preg_replace( '/[<>]+(\S)/u', '$1', $string );

        // Turn bad fuzzy searches into searches that contain a ~ and leave good ones alone.
        // A good fuzzy search is a term followed by ~ and nothing else, or by ~ and a single
        // digit from 0 to 2.
        $string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/u',
            static function ( $matches ) {
                if ( preg_match( '/^[0-2]?$/', $matches[ 'trailing' ] ) ) {
                    return $matches[ 0 ];
                } else {
                    // Escape this ~ and every not yet escaped ~ in the rest of the token
                    return $matches[ 'leading' ] . '\\~' .
                        preg_replace( '/(?<!\\\\)~/', '\~', $matches[ 'trailing' ] );
                }
            }, $string );

        // Turn bad proximity searches into searches that contain a ~
        // The ~ after a closing quote is kept whenever the rest of the token contains a digit.
        $string = preg_replace_callback( '/"~(?<trailing>\S*)/u', static function ( $matches ) {
            if ( preg_match( '/\d+/', $matches[ 'trailing' ] ) ) {
                return $matches[ 0 ];
            } else {
                return '"\\~' . $matches[ 'trailing' ];
            }
        }, $string );

        // Escape +, -, and ! when not immediately followed by a term or when immediately
        // prefixed with a term.  Catches "foo-bar", "foo- bar", "foo - bar".  The only
        // acceptable use is "foo -bar" and "-bar foo".
        $string = preg_replace_callback( '/[+\-!]+(?!\w)/u', $escapeBadSyntax, $string );
        $string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u', $escapeBadSyntax, $string );

        // Escape || when not between terms
        $string = preg_replace_callback( '/^\s*\|\|/u', $escapeBadSyntax, $string );
        $string = preg_replace_callback( '/\|\|\s*$/u', $escapeBadSyntax, $string );

        // Lowercase AND and OR when not surrounded on both sides by a term.
        // Lowercase NOT when it doesn't have a term after it.
        $string = preg_replace_callback( '/^\s*(?:AND|OR)\b|\b(?:AND|OR|NOT)\s*$/u',
            [ self::class, 'lowercaseMatched' ], $string );
        $string = preg_replace_callback( '/\b(?:AND|OR|NOT)\s+(?=AND\b|OR\b|NOT\b)/u',
            [ self::class, 'lowercaseMatched' ], $string );

        return $string;
    }

    /**
     * preg_replace_callback() callback that lowercases the whole match.
     *
     * @param string[] $matches
     * @return string
     */
    private static function lowercaseMatched( $matches ) {
        return strtolower( $matches[ 0 ] );
    }

    /**
     * Append a closing double quote when the text contains an unbalanced one.
     *
     * @param string $text
     * @return string
     */
    public function balanceQuotes( $text ) {
        if ( $this->unbalancedQuotes( $text ) ) {
            $text .= '"';
        }
        return $text;
    }

    /**
     * Check whether a byte range of the text leaves a double quote open.
     * A character preceded by an unescaped backslash is skipped, so \" does not
     * open or close a quote.
     *
     * @param string $text
     * @param int $from byte offset to start scanning at (inclusive); negative values
     *  are treated as 0
     * @param int $to byte offset to stop scanning at (exclusive); a negative value or a
     *  value past the end of the text means "scan to the end of the text"
     * @return bool true if there are unbalanced quotes in the [$from, $to) range.
     */
    public function unbalancedQuotes( $text, $from = 0, $to = -1 ) {
        $length = strlen( $text );
        // Clamp the range to the string so we never read an undefined offset and never
        // wrap around to the end of the string through a negative offset.
        if ( $to < 0 || $to > $length ) {
            $to = $length;
        }
        if ( $from < 0 ) {
            $from = 0;
        }

        $inQuote = false;
        $inEscape = false;
        for ( $i = $from; $i < $to; $i++ ) {
            if ( $inEscape ) {
                // The previous character was a backslash: this one is escaped.
                $inEscape = false;
                continue;
            }
            $char = $text[ $i ];
            if ( $char === '"' ) {
                $inQuote = !$inQuote;
            } elseif ( $char === '\\' ) {
                $inEscape = true;
            }
        }
        return $inQuote;
    }

    /**
     * Unescape a given string by dropping each occurrence of the escape sequence
     * and keeping the character that follows it.
     *
     * @param string $query string to unescape
     * @param string $escapeChar escape sequence
     * @return string
     */
    public function unescape( $query, $escapeChar = '\\' ) {
        $escapeChar = preg_quote( $escapeChar, '/' );
        return preg_replace( "/$escapeChar(.)/u", '$1', $query );
    }

    /**
     * Is leading wildcard allowed?
     *
     * @return bool
     */
    public function getAllowLeadingWildcard() {
        return $this->allowLeadingWildcard;
    }

    /**
     * @return string MediaWiki language code
     */
    public function getLanguage() {
        return $this->language;
    }
}