includes/ApiWikispeech.php
<?php
/**
* @file
* @ingroup Extensions
* @license GPL-2.0-or-later
*/
class ApiWikispeech extends ApiBase {
/**
* Execute an API request.
*
* @since 0.0.1
*/
function execute() {
$parameters = $this->extractRequestParams();
if ( empty( $parameters['output'] ) ) {
$this->dieWithError( [ 'apierror-paramempty', 'output' ] );
}
$titleAndContent =
$this->getTitleAndContent( $parameters['page'] );
$displayTitle = $titleAndContent[ 0 ];
$pageContent = $titleAndContent[ 1 ];
$result = FormatJson::parse(
$parameters['removetags'],
FormatJson::FORCE_ASSOC
);
if ( !$result->isGood() ) {
$this->dieWithError( [
'apierror-wikispeech-removetagsinvalidjson',
''
] );
}
$removeTags = $result->getValue();
if ( !$this->isValidRemoveTags( $removeTags ) ) {
$this->dieWithError( [
'apierror-wikispeech-removetagsinvalid',
''
] );
}
$this->processPageContent(
$displayTitle,
$pageContent,
$parameters['output'],
$removeTags,
$parameters['segmentbreakingtags']
);
}
/**
* Get the title and parsed content of the named page.
*
* @since 0.0.1
* @param string $pageTitle The title of the page to get content
* from.
* @return array An array containing the displayed title HTML and
* the parsed content for the page given in the request to the
* Wikispeech API.
*/
private function getTitleAndContent( $pageTitle ) {
// Get and validate Title
$title = Title::newFromText( $pageTitle );
if ( !$title || $title->isExternal() ) {
$this->dieWithError( [
'apierror-invalidtitle',
wfEscapeWikiText( $pageTitle )
] );
}
if ( !$title->canExist() ) {
$this->dieWithError( 'apierror-pagecannotexist' );
}
// Parse latest revision, using parser cache
$page = WikiPage::factory( $title );
$popts = $page->makeParserOptions( $this->getContext() );
$pout = $page->getParserOutput( $popts );
if ( !$pout ) {
$this->dieWithError( [
'apierror-nosuchrevid',
$page->getLatest()
] );
}
// Return title and content HTML.
return [ $pout->getDisplayTitle(), $pout->getText() ];
}
/**
* Tests if a variable is valid as "remove tags".
*
* The variable should be an associative array. Keys should be
* strings and values should be either booleans, strings or
* sequential arrays containing strings.
*
* @since 0.0.1
* @param mixed $removeTags The variable to test.
* @return bool true if $removeTags is valid, else false.
*/
public function isValidRemoveTags( $removeTags ) {
if ( !is_array( $removeTags ) ) {
return false;
}
foreach ( $removeTags as $tagName => $rule ) {
if ( !is_string( $tagName ) ) {
// A key isn't a string.
return false;
}
if ( is_array( $rule ) ) {
// Rule is a list of class names.
foreach ( $rule as $className ) {
if ( !is_string( $className ) ) {
// Only strings are valid if the rule is
// an array.
return false;
}
}
} elseif ( !is_bool( $rule ) && !is_string( $rule ) ) {
// Rule is not array, string or boolean.
return false;
}
}
return true;
}
/**
* Process HTML and return it as original, cleaned and/or segmented.
*
* @since 0.0.1
* @param string $displayTitle The title HTML as displayed on the page.
* @param string $pageContent The HTML string to process.
* @param array $outputFormats Specifies what output formats to
* return. Can be any combination of: "originalcontent",
* "cleanedtext" and "segments".
* @param string $removeTags Used by `Cleaner` to remove tags.
* @param array $segmentBreakingTags Used by `Segmenter` to break
* segments.
* @return array An array containing the output from the processes
* specified by $outputFormats:
* * "originalcontent": The input HTML string.
* * "cleanedtext": The cleaned HTML, as a string.
* * "segments": Cleaned and segmented HTML as an array.
*/
public function processPageContent(
$displayTitle,
$pageContent,
$outputFormats,
$removeTags,
$segmentBreakingTags
) {
$values = [];
if ( in_array( 'originalcontent', $outputFormats ) ) {
$values['originalcontent'] = $pageContent;
}
$cleanedText = null;
if ( in_array( 'cleanedtext', $outputFormats ) ) {
// Make a string of all the cleaned text, starting with
// the title.
$cleanedTextString = '';
$cleanedText = $this->getCleanedText(
$displayTitle,
$pageContent,
$removeTags,
$segmentBreakingTags
);
foreach ( $cleanedText as $item ) {
if ( $item instanceof SegmentBreak ) {
$cleanedTextString .= "\n";
} elseif ( $item->string != "\n" ) {
// Don't add text that is only newline.
$cleanedTextString .= $item->string;
}
}
$values['cleanedtext'] = trim( $cleanedTextString );
}
if ( in_array( 'segments', $outputFormats ) ) {
$segmenter = new Segmenter();
if ( $cleanedText == null ) {
$cleanedText = $this->getCleanedText(
$displayTitle,
$pageContent,
$removeTags,
$segmentBreakingTags
);
}
$segments = $segmenter->segmentSentences( $cleanedText );
$values['segments'] = $segments;
}
$this->getResult()->addValue(
null,
$this->getModuleName(),
$values
);
}
/**
* Clean content text and title.
*
* @param string $displayTitle The title HTML as displayed on the page.
* @param string $pageContent The HTML string to process.
* @param string $removeTags Used by `Cleaner` to remove tags.
* @param array $segmentBreakingTags Used by `Segmenter` to break
* segments.
* @since 0.0.1
* @return array Title and content represented as `CleanedText`s
* and `SegmentBreak`s
*/
public function getCleanedText(
$displayTitle,
$pageContent,
$removeTags,
$segmentBreakingTags
) {
$cleaner = new Cleaner( $removeTags, $segmentBreakingTags );
$titleSegment = $cleaner->cleanHtml( $displayTitle )[0];
$titleSegment->path = '//h1[@id="firstHeading"]//text()';
$cleanedText = $cleaner->cleanHtml( $pageContent );
// Add the title as a separate utterance to the start.
array_unshift( $cleanedText, $titleSegment, new SegmentBreak() );
return $cleanedText;
}
/**
* Specify what parameters the API accepts.
*
* @since 0.0.1
* @return array
*/
public function getAllowedParams() {
global $wgWikispeechRemoveTags;
global $wgWikispeechSegmentBreakingTags;
return array_merge(
parent::getAllowedParams(),
[
'page' => [
ApiBase::PARAM_TYPE => 'string',
ApiBase::PARAM_REQUIRED => true
],
'output' => [
ApiBase::PARAM_TYPE => [
'originalcontent',
'cleanedtext',
'segments'
],
ApiBase::PARAM_REQUIRED => true,
ApiBase::PARAM_ISMULTI => true,
ApiBase::PARAM_HELP_MSG_PER_VALUE => []
],
'removetags' => [
ApiBase::PARAM_TYPE => 'string',
ApiBase::PARAM_DFLT => json_encode(
$wgWikispeechRemoveTags
)
],
'segmentbreakingtags' => [
ApiBase::PARAM_TYPE => 'string',
ApiBase::PARAM_ISMULTI => true,
ApiBase::PARAM_DFLT => implode(
$wgWikispeechSegmentBreakingTags,
'|'
)
]
]
);
}
/**
* Give examples of usage.
*
* @since 0.0.1
* @return array
*/
public function getExamplesMessages() {
return [
// @codingStandardsIgnoreStart
'action=wikispeech&format=json&page=Main_Page&output=segments&removetags={"sup": true, "div": "toc"}&segmentbreakingtags=h1|h2'
// @codingStandardsIgnoreEnd
=> 'apihelp-wikispeech-example-1',
'action=wikispeech&format=json&page=Main_Page&output=originalcontent|cleanedtext'
=> 'apihelp-wikispeech-example-2',
];
}
}