includes/ResourceLoader/VueComponentParser.php
<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @author Roan Kattouw
*/
namespace MediaWiki\ResourceLoader;
use DOMDocument;
use DOMElement;
use DOMNode;
use InvalidArgumentException;
use Wikimedia\RemexHtml\DOM\DOMBuilder;
use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\Serializer;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
use Wikimedia\RemexHtml\Tokenizer\Attributes;
use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
use Wikimedia\Zest\Zest;
/**
* Parser for Vue single file components (.vue files). See parse() for usage.
*
* @ingroup ResourceLoader
* @internal For use within FileModule.
*/
class VueComponentParser {
/**
* Parse a Vue single file component, and extract the script, template and style parts.
*
* Returns an associative array with the following keys:
* - 'script': The JS code in the <script> tag
* - 'template': The HTML in the <template> tag
* - 'style': The CSS/LESS styles in the <style> tag, or null if the <style> tag was missing
* - 'styleLang': The language used for 'style'; either 'css' or 'less', or null if no <style> tag
*
* The following options can be passed in the $options parameter:
* - 'minifyTemplate': Whether to minify the HTML in the template tag. This removes
* HTML comments and strips whitespace. Default: false
*
* @param string $html HTML with <script>, <template> and <style> tags at the top level
* @param array $options Associative array of options
* @return array
* @throws InvalidArgumentException If the input is invalid
*/
public function parse( string $html, array $options = [] ): array {
$dom = $this->parseHTML( $html );
// Remex wraps everything in <html><head>, unwrap that
$head = Zest::getElementsByTagName( $dom, 'head' )[ 0 ];
// Find the <script>, <template> and <style> tags. They can appear in any order, but they
// must be at the top level, and there can only be one of each.
if ( !$head ) {
throw new InvalidArgumentException( 'Parsed DOM did not contain a <head> tag' );
}
$nodes = $this->findUniqueTags( $head, [ 'script', 'template', 'style' ] );
// Throw an error if we didn't find a <script> or <template> tag. <style> is optional.
foreach ( [ 'script', 'template' ] as $requiredTag ) {
if ( !isset( $nodes[ $requiredTag ] ) ) {
throw new InvalidArgumentException( "No <$requiredTag> tag found" );
}
}
$this->validateAttributes( $nodes['script'], [] );
$this->validateAttributes( $nodes['template'], [] );
if ( isset( $nodes['style'] ) ) {
$this->validateAttributes( $nodes['style'], [ 'lang' ] );
}
$styleData = isset( $nodes['style'] ) ? $this->getStyleAndLang( $nodes['style'] ) : null;
$template = $this->getTemplateHtml( $html, $options['minifyTemplate'] ?? false );
return [
'script' => trim( $nodes['script']->nodeValue ?? '' ),
'template' => $template,
'style' => $styleData ? $styleData['style'] : null,
'styleLang' => $styleData ? $styleData['lang'] : null
];
}
/**
* Parse HTML to DOM using RemexHtml
* @param string $html
* @return DOMDocument
*/
private function parseHTML( $html ): DOMDocument {
$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
$treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] );
$tokenizer = new Tokenizer( new Dispatcher( $treeBuilder ), $html, [ 'ignoreErrors' => true ] );
$tokenizer->execute();
// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
return $domBuilder->getFragment();
}
/**
* Find occurrences of specified tags in a DOM node, expecting at most one occurrence of each.
* This method only looks at the top-level children of $rootNode, it doesn't descend into them.
*
* @param DOMNode $rootNode Node whose children to look at
* @param string[] $tagNames Tag names to look for (must be all lowercase)
* @return DOMElement[] Associative arrays whose keys are tag names and values are DOM nodes
*/
private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
$nodes = [];
foreach ( $rootNode->childNodes as $node ) {
$tagName = strtolower( $node->nodeName );
if ( in_array( $tagName, $tagNames ) ) {
if ( isset( $nodes[ $tagName ] ) ) {
throw new InvalidArgumentException( "More than one <$tagName> tag found" );
}
$nodes[ $tagName ] = $node;
}
}
return $nodes;
}
/**
* Verify that a given node only has a given set of attributes, and no others.
* @param DOMNode $node Node to check
* @param array $allowedAttributes Attributes the node is allowed to have
* @throws InvalidArgumentException If the node has an attribute it's not allowed to have
*/
private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
if ( $allowedAttributes ) {
foreach ( $node->attributes as $attr ) {
if ( !in_array( $attr->name, $allowedAttributes ) ) {
throw new InvalidArgumentException( "<{$node->nodeName}> may not have the " .
"{$attr->name} attribute" );
}
}
} elseif ( $node->attributes->length > 0 ) {
throw new InvalidArgumentException( "<{$node->nodeName}> may not have any attributes" );
}
}
/**
* Get the contents and language of the <style> tag. The language can be 'css' or 'less'.
* @param DOMElement $styleNode The <style> tag.
* @return array [ 'style' => string, 'lang' => string ]
* @throws InvalidArgumentException If an invalid language is used, or if the 'scoped' attribute is set.
*/
private function getStyleAndLang( DOMElement $styleNode ): array {
$style = trim( $styleNode->nodeValue ?? '' );
$styleLang = $styleNode->hasAttribute( 'lang' ) ?
$styleNode->getAttribute( 'lang' ) : 'css';
if ( $styleLang !== 'css' && $styleLang !== 'less' ) {
throw new InvalidArgumentException( "<style lang=\"$styleLang\"> is invalid," .
" lang must be \"css\" or \"less\"" );
}
return [
'style' => $style,
'lang' => $styleLang,
];
}
/**
* Get the HTML contents of the <template> tag, optionally minifed.
*
* To work around a bug in PHP's DOMDocument where attributes like @click get mangled,
* we re-parse the entire file using a Remex parse+serialize pipeline, with a custom dispatcher
* to zoom in on just the contents of the <template> tag, and a custom formatter for minification.
* Keeping everything in Remex and never converting it to DOM avoids the attribute mangling issue.
*
* @param string $html HTML that contains a <template> tag somewhere
* @param bool $minify Whether to minify the output (remove comments, strip whitespace)
* @return string HTML contents of the template tag
*/
private function getTemplateHtml( $html, $minify ) {
$serializer = new Serializer( $this->newTemplateFormatter( $minify ) );
$tokenizer = new Tokenizer(
$this->newFilteringDispatcher(
new TreeBuilder( $serializer, [ 'ignoreErrors' => true ] ),
'template'
),
$html, [ 'ignoreErrors' => true ]
);
$tokenizer->execute( [ 'fragmentNamespace' => HTMLData::NS_HTML, 'fragmentName' => 'template' ] );
return trim( $serializer->getResult() );
}
/**
* Custom HtmlFormatter subclass that optionally removes comments and strips whitespace.
* If $minify=false, this formatter falls through to HtmlFormatter for everything (except that
* it strips the <!doctype html> tag).
*
* @param bool $minify If true, remove comments and strip whitespace
* @return HtmlFormatter
*/
private function newTemplateFormatter( $minify ) {
return new class( $minify ) extends HtmlFormatter {
private $minify;
public function __construct( $minify ) {
$this->minify = $minify;
}
public function startDocument( $fragmentNamespace, $fragmentName ) {
// Remove <!doctype html>
return '';
}
public function comment( SerializerNode $parent, $text ) {
if ( $this->minify ) {
// Remove all comments
return '';
}
return parent::comment( $parent, $text );
}
public function characters( SerializerNode $parent, $text, $start, $length ) {
if (
$this->minify && (
// Don't touch <pre>/<listing>/<textarea> nodes
$parent->namespace !== HTMLData::NS_HTML ||
!isset( $this->prefixLfElements[ $parent->name ] )
)
) {
$text = substr( $text, $start, $length );
// Collapse runs of adjacent whitespace, and convert all whitespace to spaces
$text = preg_replace( '/[ \r\n\t]+/', ' ', $text );
$start = 0;
$length = strlen( $text );
}
return parent::characters( $parent, $text, $start, $length );
}
public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
if (
$this->minify && (
// Don't touch <pre>/<listing>/<textarea> nodes
$node->namespace !== HTMLData::NS_HTML ||
!isset( $this->prefixLfElements[ $node->name ] )
) &&
$contents !== null
) {
// Remove leading and trailing whitespace
$contents = preg_replace( '/(^[ \r\n\t]+)|([\r\n\t ]+$)/', '', $contents );
}
return parent::element( $parent, $node, $contents );
}
};
}
/**
* Custom Dispatcher subclass that only dispatches tree events inside a tag with a certain name.
* This effectively filters the tree to only the contents of that tag.
*
* @param TreeBuilder $treeBuilder
* @param string $nodeName Tag name to filter for
* @return Dispatcher
*/
private function newFilteringDispatcher( TreeBuilder $treeBuilder, $nodeName ) {
return new class( $treeBuilder, $nodeName ) extends Dispatcher {
private $nodeName;
private $nodeDepth = 0;
private $seenTag = false;
public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
$this->nodeName = $nodeName;
parent::__construct( $treeBuilder );
}
public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
if ( $this->nodeDepth ) {
parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
}
if ( $name === $this->nodeName ) {
if ( $this->nodeDepth === 0 && $this->seenTag ) {
// This is the second opening tag, not nested in the first one
throw new InvalidArgumentException( "More than one <{$this->nodeName}> tag found" );
}
$this->nodeDepth++;
$this->seenTag = true;
}
}
public function endTag( $name, $sourceStart, $sourceLength ) {
if ( $name === $this->nodeName ) {
$this->nodeDepth--;
}
if ( $this->nodeDepth ) {
parent::endTag( $name, $sourceStart, $sourceLength );
}
}
public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
if ( $this->nodeDepth ) {
parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
}
}
public function comment( $text, $sourceStart, $sourceLength ) {
if ( $this->nodeDepth ) {
parent::comment( $text, $sourceStart, $sourceLength );
}
}
};
}
}