includes/Segmenter.php from lokal-profil/mediawiki-extensions-Wikispeech

includes/Segmenter.php
Summary

Maintainability

1 hr
Test Coverage

Issues
<?php

/**
 * @file
 * @ingroup Extensions
 * @license GPL-2.0-or-later
 */

/**
 * Used for dividing text into segments, that can then be sent to the
 * TTS server. Also calculates values for variables that are needed
 * for highlighting.
 *
 * @since 0.0.1
 */

class Segmenter {

    /**
     * An array to which finished segments are added.
     *
     * @var array $segments
     */
    private $segments;

    /**
     * The segment that is currently being built.
     *
     * Has the keys "content", "startOffset" and "endOffset".
     *
     * @var array $currentSegment
     */
    private $currentSegment;

    public function __construct() {
        $this->segments = [];
        $this->currentSegment = self::emptySegment();
    }

    /**
     * Divide a cleaned content array into segments, one for each sentence.
     *
     * A segment is an array with the keys "content", "startOffset"
     * and "endOffset". "content" is an array of `CleanedText`s.
     * "startOffset" is the offset of the first character of the
     * segment, within the text node it appears. "endOffset" is the
     * offset of the last character of the segment, within the text
     * node it appears. These are used to determine start and end of
     * a segment in the original HTML.
     *
     * A sentence is here defined as a sequence of tokens ending with
     * a dot (full stop). A `SegmentBreak` always ends the segment
     * under construction.
     *
     * Every call starts from a clean state, so the returned array
     * only holds segments for the given $cleanedContent, even when
     * the same instance is used several times.
     *
     * @since 0.0.1
     * @param array $cleanedContent An array of items returned by
     *  `Cleaner::cleanHtml()`.
     * @return array An array of segments, each containing the
     *  `CleanedText`s in that segment.
     */
    public function segmentSentences( $cleanedContent ) {
        // Drop anything left over from an earlier call on this
        // instance; otherwise old segments would be returned again.
        $this->segments = [];
        $this->currentSegment = self::emptySegment();

        foreach ( $cleanedContent as $item ) {
            if ( $item instanceof CleanedText ) {
                $this->addSegments( $item );
            } elseif ( $item instanceof SegmentBreak ) {
                $this->finishSegment();
            }
        }
        // Add the last segment; finishSegment() skips it if it's
        // empty.
        $this->finishSegment();
        return $this->segments;
    }

    /**
     * Add segments for a string.
     *
     * Repeatedly looks for the next sentence in the string and adds
     * it to $currentSegment, until the string is consumed.
     *
     * @since 0.0.1
     * @param CleanedText $text The text to segment.
     */
    private function addSegments( $text ) {
        $length = mb_strlen( $text->string );
        $nextStartOffset = 0;
        do {
            // The earliest the next segment can start is one after
            // the end of the current one.
            $nextStartOffset =
                $this->addSegment( $text, $nextStartOffset ) + 1;
            // A single character left after a sentence final dot
            // can only be the space or newline that made the dot
            // sentence final, so it isn't worth another round.
        } while ( $nextStartOffset < $length - 1 );
    }

    /**
     * Add a sentence, or part thereof, to a segment.
     *
     * Finds the next sentence by sentence final characters and adds
     * it to the segment under construction. If no sentence final
     * character was found, all the remaining text is added. Stores
     * start offset when the first text of a segment is added and end
     * offset when the last is.
     *
     * @since 0.0.1
     * @param CleanedText $text The text to segment.
     * @param int $startOffset The offset where the next sentence can
     *  start, at the earliest. If the sentence has leading
     *  whitespaces, this will be moved forward.
     * @return int The offset of the last character in the
     *   sentence. If the sentence didn't end yet, this is the last
     *   character of $text.
     */
    private function addSegment( $text, $startOffset = 0 ) {
        $isFirstTextInSegment =
            $this->currentSegment['startOffset'] === null;
        if ( $isFirstTextInSegment ) {
            // Move the start offset ahead by the number of leading
            // whitespaces. This means that whitespaces before or
            // between segments aren't included.
            $startOffset += self::getLeadingWhitespacesLength(
                mb_substr( $text->string, $startOffset )
            );
        }

        // Get the offset for the next sentence final character. If
        // there is none, the rest of the text is added and the
        // segment is left open for following text.
        $endOffset = self::getSentenceFinalOffset(
            $text->string,
            $startOffset
        );
        $ended = $endOffset !== null;
        if ( !$ended ) {
            $endOffset = mb_strlen( $text->string ) - 1;
        }

        $sentence = mb_substr(
            $text->string,
            $startOffset,
            $endOffset - $startOffset + 1
        );
        if ( $sentence === '' || $sentence === "\n" ) {
            // Don't add `CleanedText`s with the empty string or only
            // newline.
            return $endOffset;
        }

        $this->currentSegment['content'][] = new CleanedText(
            $sentence,
            $text->path
        );
        if ( $isFirstTextInSegment ) {
            // Record the start offset if this is the first text
            // added to the segment.
            $this->currentSegment['startOffset'] = $startOffset;
        }
        $this->currentSegment['endOffset'] = $endOffset;
        if ( $ended ) {
            $this->finishSegment();
        }
        return $endOffset;
    }

    /**
     * Get the number of whitespaces at the start of a string.
     *
     * Only the characters stripped by `ltrim()` by default count as
     * whitespace here.
     *
     * @since 0.0.1
     * @param string $string The string to count leading whitespaces
     *  for.
     * @return int The number of whitespaces at the start of $string.
     */
    private static function getLeadingWhitespacesLength( $string ) {
        return mb_strlen( $string ) - mb_strlen( ltrim( $string ) );
    }

    /**
     * Get the offset of the first sentence final character in a string.
     *
     * @since 0.0.1
     * @param string $string The string to look in.
     * @param int $offset The offset to start looking from. The
     *  character at this offset is included in the search.
     * @return int|null The offset of the first sentence final
     *  character that was found, if any, else null.
     */
    private static function getSentenceFinalOffset( $string, $offset ) {
        $length = mb_strlen( $string );
        $searchFrom = $offset;
        while ( $searchFrom < $length ) {
            // Find the next character that may be sentence final.
            $dotOffset = mb_strpos( $string, '.', $searchFrom );
            if ( $dotOffset === false ) {
                // No character that can be sentence final was found.
                return null;
            }
            if ( self::isSentenceFinal( $string, $dotOffset ) ) {
                return $dotOffset;
            }
            // Not sentence final, e.g. a dot inside an abbreviation;
            // keep looking from the character after it.
            $searchFrom = $dotOffset + 1;
        }
        return null;
    }

    /**
     * Test if a character is at the end of a sentence.
     *
     * Dots in abbreviations should only be counted when they also are sentence
     * final. For example:
     * "Monkeys, penguins etc.", but not "Monkeys e.g. baboons".
     *
     * @since 0.0.1
     * @param string $string The string to check in.
     * @param int $index The index in $string of the character to check.
     * @return bool True if the character is sentence final, else false.
     */
    private static function isSentenceFinal( $string, $index ) {
        if ( mb_substr( $string, $index, 1 ) !== '.' ) {
            return false;
        }
        // Fetch the (at most) two characters after the dot. The
        // empty string stands for "no such character", i.e. the
        // string ended.
        $following = mb_substr( $string, $index + 1, 2 );
        $nextCharacter = mb_substr( $following, 0, 1 );
        $characterAfterNext = mb_substr( $following, 1, 1 );

        if ( $nextCharacter === '' || $nextCharacter === "\n" ) {
            // A dot at the end of string or line is sentence final.
            return true;
        }
        if ( $nextCharacter !== ' ' ) {
            return false;
        }
        // A dot followed by a space is sentence final if the space
        // is the last character or is followed by a character that
        // isUpper() accepts.
        return $characterAfterNext === '' ||
            self::isUpper( $characterAfterNext );
    }

    /**
     * Test if a string is upper case.
     *
     * The string counts as upper case when converting it to upper
     * case doesn't change it. This means that strings without cased
     * characters, e.g. digits, also count as upper case.
     *
     * @since 0.0.1
     * @param string $string The string to test.
     * @return bool true if the entire string is upper case, else false.
     */
    private static function isUpper( $string ) {
        return mb_strtoupper( $string ) === $string;
    }

    /**
     * Add the current segment to the array of segments.
     *
     * The segment is only added if it has any content. Always
     * creates a new, empty segment as the new current segment.
     *
     * @since 0.0.1
     */
    private function finishSegment() {
        if ( count( $this->currentSegment['content'] ) ) {
            $this->segments[] = $this->currentSegment;
        }
        // Create a fresh segment to add following text to.
        $this->currentSegment = self::emptySegment();
    }

    /**
     * Create a segment without content or offsets.
     *
     * @return array A segment with empty "content" and null for
     *  "startOffset" and "endOffset".
     */
    private static function emptySegment() {
        return [
            'content' => [],
            'startOffset' => null,
            'endOffset' => null
        ];
    }
}