wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/includes/Parsers/DateFormatParser.php

Summary

Maintainability
D
2 days
Test Coverage
<?php

namespace Wikibase\Repo\Parsers;

use DataValues\IllegalValueException;
use DataValues\TimeValue;
use ValueParsers\CalendarModelParser;
use ValueParsers\IsoTimestampParser;
use ValueParsers\ParseException;
use ValueParsers\ParserOptions;
use ValueParsers\StringValueParser;
use Wikimedia\AtEase\AtEase;

/**
 * This parser is in essence the inverse operation of MediaWiki's Language::sprintfDate.
 *
 * @see \Language::sprintfDate
 *
 * @license GPL-2.0-or-later
 * @author Thiemo Kreuz
 */
class DateFormatParser extends StringValueParser {

    /** Format identifier passed to every ParseException thrown by this parser. */
    private const FORMAT_NAME = 'date-format';

    /**
     * Option holding the date format to parse, using the format codes described in
     * Language::sprintfDate. Defaults to "j F Y".
     */
    public const OPT_DATE_FORMAT = 'dateFormat';

    /**
     * Option for unlocalizing non-canonical digits. Must be an array of strings, mapping canonical
     * digit characters ("1", "2" and so on, possibly including "." and ",") to localized
     * characters.
     */
    public const OPT_DIGIT_TRANSFORM_TABLE = 'digitTransformTable';

    /**
     * Option for localized month names. Should be a two-dimensional array, the first dimension
     * mapping the month's numbers 1 to 12 to arrays of localized month names, possibly including
     * full month names, genitive names and abbreviations. Can also be a one-dimensional array of
     * strings.
     */
    public const OPT_MONTH_NAMES = 'monthNames';

    /**
     * Option to override the precision auto-detection and set a specific precision. Should be an
     * integer or string containing one of the TimeValue::PRECISION_... constants.
     */
    public const OPT_PRECISION = 'precision';

    /** @var IsoTimestampParser Only used to detect the calendar model of the parsed date. */
    private $isoTimestampParser;

    /**
     * @param ParserOptions|null $options
     */
    public function __construct( ?ParserOptions $options = null ) {
        parent::__construct( $options );

        $this->defaultOption( self::OPT_DATE_FORMAT, 'j F Y' );
        // FIXME: Should not be an option. Options should be trivial, never arrays or objects!
        $this->defaultOption( self::OPT_DIGIT_TRANSFORM_TABLE, null );
        $this->defaultOption( self::OPT_MONTH_NAMES, null );
        $this->defaultOption( self::OPT_PRECISION, null );

        $this->isoTimestampParser = new IsoTimestampParser(
            new CalendarModelParser( $this->options ),
            $this->options
        );
    }

    /**
     * Parses a localized date string according to the configured date format and turns it into a
     * TimeValue. The precision is derived from the finest date element found in the input, and can
     * be adjusted via the precision option.
     *
     * @see StringValueParser::stringParse
     *
     * @param string $value
     *
     * @throws ParseException
     * @return TimeValue
     */
    protected function stringParse( $value ) {
        $date = $this->parseDate( $value );
        $precision = TimeValue::PRECISION_YEAR;
        // Year, month, day, hour, minute and second, in this order.
        $time = [ $this->parseFormattedNumber( $date['year'] ), 0, 0, 0, 0, 0 ];

        if ( isset( $date['month'] ) ) {
            $precision = TimeValue::PRECISION_MONTH;
            $time[1] = $this->findMonthMatch( $date );

            // Walk from coarse to fine and stop at the first missing element. Everything finer
            // than a missing element is ignored and stays zero.
            $finerFields = [
                'day' => TimeValue::PRECISION_DAY,
                'hour' => TimeValue::PRECISION_HOUR,
                'minute' => TimeValue::PRECISION_MINUTE,
                'second' => TimeValue::PRECISION_SECOND,
            ];
            $index = 2;
            foreach ( $finerFields as $field => $fieldPrecision ) {
                if ( !isset( $date[$field] ) ) {
                    break;
                }

                $precision = $fieldPrecision;
                $time[$index++] = $this->parseFormattedNumber( $date[$field] );
            }
        }

        $option = $this->getOption( self::OPT_PRECISION );
        if ( $option !== null ) {
            // ctype_digit() must only be given strings. Other types are deprecated as of PHP 8.1,
            // and integers would be misinterpreted as ASCII codes.
            if ( !is_int( $option ) && !( is_string( $option ) && ctype_digit( $option ) ) ) {
                throw new ParseException(
                    'Precision must be an integer',
                    $value,
                    self::FORMAT_NAME
                );
            }

            $option = (int)$option;

            // It's impossible to increase the detected precision via option, e.g. from year to month if
            // no month is given. If a day is given it can be increased, relevant for midnight.
            if ( $option <= $precision || $precision >= TimeValue::PRECISION_DAY ) {
                $precision = $option;
            }
        }

        // "%04s" and "%02s" left-pad the (string) numbers with zeros.
        $timestamp = vsprintf( '+%04s-%02s-%02sT%02s:%02s:%02sZ', $time );

        // Use IsoTimestampParser to detect the correct calendar model.
        $iso = $this->isoTimestampParser->parse( $timestamp );

        try {
            // We intentionally do not re-use the precision from IsoTimestampParser here,
            // because it reduces precision for values with zeros in the right-most fields.
            // Our above method of determining the precision is therefore better.
            return new TimeValue( $timestamp, 0, 0, 0, $precision, $iso->getCalendarModel() );
        } catch ( IllegalValueException $ex ) {
            // Keep the original exception as "previous" so the root cause is not lost.
            throw new ParseException( $ex->getMessage(), $value, self::FORMAT_NAME, $ex );
        }
    }

    // phpcs:disable Generic.Metrics.CyclomaticComplexity.MaxExceeded,Squiz.WhiteSpace.FunctionSpacing
    /**
     * Converts a date format to a regular expression with the named groups "year", "month",
     * "day", "hour", "minute" and "second" (only those the format contains), plus the "month…"
     * groups added by getMonthNamesPattern().
     *
     * The expression uses "<" and ">" as delimiters. preg_quote() escapes both characters by
     * default, which is why it is called without a delimiter argument throughout this class.
     *
     * @see Language::sprintfDate
     *
     * @param string $format A date format, as described in Language::sprintfDate.
     *
     * @throws ParseException if the format contains a code that can not be parsed back
     * @return string Regular expression
     */
    private function parseDateFormat( $format ): string {
        $length = strlen( $format );

        $number = $this->getNumberPattern();
        $notFollowedByNumber = '(?!' . $number . ')';
        $optionalPunctuation = '\p{P}*';
        $optionalWhitespace = '\p{Z}*';
        $separation = $notFollowedByNumber . $optionalWhitespace;
        $pattern = '<^' . $optionalWhitespace;

        // All format codes are ASCII, which is why walking the format byte by byte is fine. Only
        // the default branch must take care of multi-byte characters.
        for ( $p = 0; $p < $length; $p++ ) {
            $code = $format[$p];

            // "x" is used as a prefix for MediaWiki specific, 2- and 3-letter codes.
            if ( $code === 'x' && $p < $length - 1 ) {
                $code .= $format[++$p];

                if ( preg_match( '<^x[ijkmot]$>', $code ) && $p < $length - 1 ) {
                    $code .= $format[++$p];
                }
            }

            switch ( $code ) {
                // Year
                case 'o':
                case 'Y':
                    $pattern .= '(?P<year>' . $number . '+)' . $separation;
                    break;

                // Month
                case 'F':
                case 'M':
                case 'm':
                case 'n':
                case 'xg':
                    $pattern .= '(?P<month>' . $number . '{1,2}' . $notFollowedByNumber
                        . $this->getMonthNamesPattern() . ')' . $optionalPunctuation
                        . $optionalWhitespace;
                    break;

                // Day
                case 'd':
                case 'j':
                    $pattern .= '(?P<day>' . $number . '{1,2})' . $optionalPunctuation
                        . $separation;
                    break;

                // Hour
                case 'G':
                case 'H':
                    $pattern .= '(?P<hour>' . $number . '{1,2})' . $separation;
                    break;

                // Minute
                case 'i':
                    $pattern .= '(?P<minute>' . $number . '{1,2})' . $separation;
                    break;

                // Second
                case 's':
                    $pattern .= '(?P<second>' . $number . '{1,2})' . $separation;
                    break;

                // Escaped "x"
                case 'xx':
                    $pattern .= 'x';
                    break;

                // Escaped character or backslash at the end of the sequence
                case '\\':
                    $pattern .= preg_quote( $p < $length - 1 ? $format[++$p] : '\\' );
                    break;

                // Quoted sequence
                case '"':
                    $endQuote = strpos( $format, '"', $p + 1 );
                    if ( $endQuote !== false ) {
                        $pattern .= preg_quote( substr( $format, $p + 1, $endQuote - $p - 1 ) );
                        $p = $endQuote;
                    } else {
                        // An unpaired quote is a literal quote.
                        $pattern .= '"';
                    }
                    break;

                // Textual representation of the day of the week. Redundant, can be ignored.
                case 'D':
                case 'l':
                    $pattern .= '\p{L}*' . $optionalPunctuation . $separation;
                    break;

                // We can ignore "raw" and "raw toggle" when parsing, because we always accept
                // canonical digits.
                case 'xN':
                case 'xn':
                    break;

                // 12-hour format
                case 'A':
                case 'a':
                case 'g':
                case 'h':
                // Full, formatted dates
                case 'c':
                case 'r':
                case 'U':
                // Numeric representation of the day of the week
                case 'N':
                case 'w':
                // Timezone
                case 'e':
                case 'O':
                case 'P':
                case 'T':
                case 'Z':
                // Daylight saving time ("1" if true)
                case 'I':
                // Leap year ("1" if true)
                case 'L':
                // Number of days in the current month
                case 't':
                case 'xit':
                case 'xjt':
                // Week number
                case 'W':
                // "Hebrew" and "Roman" modifiers
                case 'xh':
                case 'xr':
                // 2-digit year
                case 'y':
                case 'xiy':
                // Day of the year
                case 'z':
                case 'xiz':
                // Day, month and year in incompatible calendar models (Hebrew, Iranian, and others)
                case 'xiF':
                case 'xij':
                case 'xin':
                case 'xiY':
                case 'xjF':
                case 'xjj':
                case 'xjn':
                case 'xjx':
                case 'xjY':
                case 'xkY':
                case 'xmF':
                case 'xmj':
                case 'xmn':
                case 'xmY':
                case 'xoY':
                case 'xtY':
                    throw new ParseException( 'Unsupported date format "' . $code . '"' );

                // Character with no meaning
                default:
                    // For an unknown "x…" code only the last byte read is considered, the same
                    // byte the original implementation looked at.
                    $char = $format[$p];

                    // Reassemble a multi-byte UTF-8 character from its lead byte and the
                    // continuation bytes (10xxxxxx) following it. Classifying single bytes would
                    // always fail in "u" mode, which made non-ASCII punctuation and whitespace
                    // mandatory literals, while their ASCII counterparts are optional.
                    if ( ord( $char ) >= 0xC0 ) {
                        while ( $p < $length - 1 && ( ord( $format[$p + 1] ) & 0xC0 ) === 0x80 ) {
                            $char .= $format[++$p];
                        }
                    }

                    if ( preg_match( '<^' . $optionalPunctuation . '$>u', $char ) ) {
                        $pattern .= $optionalPunctuation;
                    } elseif ( preg_match( '<^' . $optionalWhitespace . '$>u', $char ) ) {
                        $pattern .= $optionalWhitespace;
                    } else {
                        $pattern .= preg_quote( $char );
                    }
            }
        }

        return $pattern . '$>iu';
    }
    // phpcs:enable

    /**
     * Builds a character class that matches a single digit: "\d" plus all characters found as
     * values in the digit transform table, if there is one.
     *
     * @return string Partial regular expression
     */
    private function getNumberPattern(): string {
        $pattern = '[\d';

        $transformTable = $this->getDigitTransformTable();
        if ( is_array( $transformTable ) ) {
            $pattern .= preg_quote( implode( '', $transformTable ) );
        }

        return $pattern . ']';
    }

    /**
     * Builds one alternative per month, each a named group "month" followed by the array key the
     * month's names are stored under. Every alternative starts with "|", which makes the result
     * suitable to be appended to an other, existing alternative.
     *
     * @return string Partial regular expression
     */
    private function getMonthNamesPattern(): string {
        $pattern = '';

        foreach ( $this->getMonthNames() as $i => $monthNames ) {
            $alternatives = [];

            foreach ( (array)$monthNames as $monthName ) {
                // An empty alternative matches the empty string, which would allow for a "month"
                // that consists of nothing. Skip these, same as empty digits are skipped.
                if ( $monthName === null || $monthName === '' ) {
                    continue;
                }

                $alternatives[] = preg_quote( $monthName );
            }

            if ( $alternatives === [] ) {
                continue;
            }

            $pattern .= '|(?P<month' . $i . '>' . implode( '|', $alternatives ) . ')';
        }

        return $pattern;
    }

    /**
     * Matches the input against the regular expression created from the configured date format.
     *
     * @param string $input
     *
     * @throws ParseException
     * @return string[] Guaranteed to have the "year" key, optionally followed by more elements.
     *  Guaranteed to be continuous, e.g. "year" and "day" with no "month" is illegal.
     */
    private function parseDate( $input ): array {
        $pattern = $this->parseDateFormat( $this->getDateFormat() );

        // The pattern is built from options. Warnings from preg_match are suppressed and the
        // failure is reported via the exception below instead.
        AtEase::suppressWarnings();
        $success = preg_match( $pattern, $input, $matches );
        AtEase::restoreWarnings();

        if ( !$success ) {
            // preg_match returns false on error, and 0 if the pattern does not match.
            throw new ParseException(
                $success === false
                    ? 'Illegal date format "' . $this->getDateFormat() . '"'
                    : 'Failed to parse "' . $input . '"',
                $input,
                self::FORMAT_NAME
            );
        }

        if ( !isset( $matches['year'] )
            || ( isset( $matches['day'] ) && !isset( $matches['month'] ) )
            || ( isset( $matches['hour'] ) && !isset( $matches['day'] ) )
            || ( isset( $matches['minute'] ) && !isset( $matches['hour'] ) )
            || ( isset( $matches['second'] ) && !isset( $matches['minute'] ) )
        ) {
            throw new ParseException( 'Non-continuous date format', $input, self::FORMAT_NAME );
        }

        return $matches;
    }

    /**
     * Determines the month from the matches returned by parseDate(). A month given by name is
     * identified via the "month1" to "month12" groups, otherwise the "month" group is
     * unlocalized as a number.
     *
     * @param string[] $matches
     *
     * @return int|string
     */
    private function findMonthMatch( $matches ) {
        for ( $i = 1; $i <= 12; $i++ ) {
            if ( !empty( $matches['month' . $i] ) ) {
                return $i;
            }
        }

        return $this->parseFormattedNumber( $matches['month'] );
    }

    /**
     * Replaces localized digits with canonical ones, using the inverse of the digit transform
     * table. Returns the number unchanged if there is no such table.
     *
     * @param string $number
     *
     * @return string Canonical number
     */
    private function parseFormattedNumber( $number ) {
        $transformTable = $this->getDigitTransformTable();

        if ( is_array( $transformTable ) ) {
            // Eliminate empty array values (bug T66347).
            $transformTable = array_filter( $transformTable );
            // The table maps canonical to localized, we need the other direction.
            $number = strtr( $number, array_flip( $transformTable ) );
        }

        return $number;
    }

    /**
     * @return string
     */
    private function getDateFormat() {
        return $this->getOption( self::OPT_DATE_FORMAT );
    }

    /**
     * @return string[]|null
     */
    private function getDigitTransformTable() {
        return $this->getOption( self::OPT_DIGIT_TRANSFORM_TABLE );
    }

    /**
     * @return array[]|string[] Empty if the option is not set.
     */
    private function getMonthNames() {
        return $this->getOption( self::OPT_MONTH_NAMES ) ?: [];
    }

}