lokal-profil/mediawiki-extensions-Wikispeech

View on GitHub
includes/Cleaner.php

Summary

Maintainability
A
3 hrs
Test Coverage
<?php

/**
 * @file
 * @ingroup Extensions
 * @license GPL-2.0-or-later
 */

/**
 * Used for cleaning text with HTML markup. The cleaned text is used
 * as input for `Segmenter`.
 *
 * @since 0.0.1
 */

class Cleaner {
    /**
     * Tags that should be removed completely during cleaning.
     *
     * Keyed by tag name. The value is either `true` (always remove
     * the tag), a class name, or an array of class names (remove the
     * tag only when it has one of those classes).
     *
     * @var array $removeTags
     */
    private $removeTags;

    /**
     * An array of tag names that should add a segment break during
     * cleaning.
     *
     * @var array $segmentBreakingTags
     */
    private $segmentBreakingTags;

    /**
     * An array of `CleanedText`s and `SegmentBreak`s.
     *
     * @var array $cleanedContent
     */
    private $cleanedContent = [];

    /**
     * @param array|null $removeTags An array of tags that should be removed
     *  completely during cleaning. Anything that isn't an array is
     *  treated as an empty array.
     * @param array|null $segmentBreakingTags An array of tag names that
     *  should add a segment break during cleaning. Anything that
     *  isn't an array is treated as an empty array.
     */
    public function __construct( $removeTags, $segmentBreakingTags ) {
        $this->removeTags = is_array( $removeTags ) ? $removeTags : [];
        $this->segmentBreakingTags = is_array( $segmentBreakingTags ) ?
            $segmentBreakingTags :
            [];
    }

    /**
     * Clean HTML tags from a string.
     *
     * Separates any HTML tags from the text.
     *
     * @since 0.0.1
     * @param string $markedUpText Input text that may contain HTML
     *  tags.
     * @return array An array of `CleanedText`s and `SegmentBreak`s
     *  representing text nodes.
     */
    public function cleanHtml( $markedUpText ) {
        $dom = self::createDomDocument( $markedUpText );
        $xpath = new DOMXPath( $dom );
        // Only add elements below the dummy element. These are the
        // elements from the original HTML.
        $top = $xpath->evaluate( '/meta/dummy' )->item( 0 );
        $this->cleanedContent = [];
        if ( $top !== null ) {
            // item() gives null when the dummy element isn't found;
            // there is nothing to add in that case.
            $this->addContent( $top );
        }
        // Remove any segment break at the start or end of the array,
        // since they won't do anything.
        if (
            count( $this->cleanedContent ) &&
            $this->cleanedContent[0] instanceof SegmentBreak
        ) {
            array_shift( $this->cleanedContent );
        }
        if ( self::lastElement( $this->cleanedContent ) instanceof SegmentBreak ) {
            array_pop( $this->cleanedContent );
        }
        return $this->cleanedContent;
    }

    /**
     * Create a DOMDocument from an HTML string.
     *
     * A dummy element is added as top node.
     *
     * Parse errors are collected internally while loading and then
     * discarded. The libxml error handling setting that was active
     * before the call is restored afterwards.
     *
     * @since 0.0.1
     * @param string $markedUpText The string to create the
     *  DOMDocument from.
     * @return DOMDocument The created DOMDocument.
     */
    private static function createDomDocument( $markedUpText ) {
        $dom = new DOMDocument();
        // Add encoding information and wrap the input text in a dummy
        // tag to prevent p tags from being added for text nodes.
        // @codingStandardsIgnoreStart
        $wrappedText = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><dummy>' .
            $markedUpText .
            '</dummy></head>';
        // @codingStandardsIgnoreEnd
        // Keep libxml from emitting warnings for markup it doesn't
        // like, but don't leave the global setting changed or the
        // error buffer filled when done.
        $previousSetting = libxml_use_internal_errors( true );
        $dom->loadHTML(
            $wrappedText,
            LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED
        );
        libxml_clear_errors();
        libxml_use_internal_errors( $previousSetting );
        return $dom;
    }

    /**
     * Recursively add items to the cleaned content.
     *
     * Goes through all the child nodes of $node and adds their
     * content text. Adds segment breaks for appropriate tags.
     *
     * @since 0.0.1
     * @param DOMNode $node The top node to add from.
     */
    private function addContent( $node ) {
        if ( $node instanceof DOMComment || $this->matchesRemove( $node ) ) {
            return;
        }
        foreach ( $node->childNodes as $child ) {
            $breaksSegment = in_array(
                $child->nodeName,
                $this->segmentBreakingTags,
                true
            );
            if ( $breaksSegment ) {
                // Segment break for the start tag. A break that ends
                // up first in the array is stripped by cleanHtml().
                $this->addSegmentBreak();
            }
            if ( $child->nodeType === XML_TEXT_NODE ) {
                // Remove the path to the dummy node and instead
                // add "." to match when used with context.
                $path = preg_replace(
                    '!^/meta/dummy!',
                    '.',
                    $child->getNodePath()
                );
                $this->cleanedContent[] =
                    new CleanedText( $child->textContent, $path );
            } else {
                $this->addContent( $child );
            }
            if ( $breaksSegment ) {
                // Segment break for the end tag.
                $this->addSegmentBreak();
            }
        }
    }

    /**
     * Add a segment break to the cleaned content.
     *
     * Nothing is added if the last item already is a segment break,
     * to avoid consecutive breaks.
     */
    private function addSegmentBreak() {
        if ( !self::lastElement( $this->cleanedContent ) instanceof SegmentBreak ) {
            $this->cleanedContent[] = new SegmentBreak();
        }
    }

    /**
     * Check if a node matches criteria for removal.
     *
     * The node is compared to the removal criteria from the
     * configuration, to determine if it should be removed completely.
     *
     * @since 0.0.1
     * @param DOMNode $node The node to check.
     * @return bool true if the node match removal criteria, otherwise
     *  false.
     */
    private function matchesRemove( $node ) {
        if ( !array_key_exists( $node->nodeName, $this->removeTags ) ) {
            // The node name isn't found in the removal list.
            return false;
        }
        $removeCriteria = $this->removeTags[$node->nodeName];
        if ( $removeCriteria === true ) {
            // Node name is found and there are no extra criteria.
            return true;
        }
        if ( !is_array( $removeCriteria ) ) {
            // A single class name; handle it like a list of one.
            $removeCriteria = [ $removeCriteria ];
        }
        // If there are multiple classes for a tag, check if any of
        // them match.
        foreach ( $removeCriteria as $class ) {
            if ( self::nodeHasClass( $node, $class ) ) {
                return true;
            }
        }
        return false;
    }

    /**
     * Check if a node has a class attribute, containing a string.
     *
     * Since this is for checking HTML tag classes, the class
     * attribute, if present, is assumed to be a string of substrings,
     * separated by whitespace (spaces, tabs or line breaks).
     *
     * @since 0.0.1
     * @param DOMNode $node The node to check.
     * @param string $className The name of the class to check for.
     * @return bool true if the node's class attribute contain
     *  $className, otherwise false.
     */
    private static function nodeHasClass( $node, $className ) {
        if ( $node->attributes === null ) {
            // Only element nodes have attributes.
            return false;
        }
        $classNode = $node->attributes->getNamedItem( 'class' );
        if ( $classNode === null ) {
            return false;
        }
        $nodeClasses = preg_split(
            '/\s+/',
            $classNode->nodeValue,
            -1,
            PREG_SPLIT_NO_EMPTY
        );
        return in_array( (string)$className, $nodeClasses, true );
    }

    /**
     * Get the last element in an array.
     *
     * @since 0.0.1
     * @param array $array The array to get the last element from.
     *  Expected to be indexed sequentially from zero.
     * @return mixed|null The last element in the array, null if array
     *  is empty.
     */
    private static function lastElement( $array ) {
        $length = count( $array );
        return $length ? $array[$length - 1] : null;
    }
}