wikimedia/mediawiki-core

View on GitHub
includes/specials/SpecialExport.php

Summary

Maintainability
F
3 days
Test Coverage
<?php
/**
 * Copyright © 2003-2008 Brooke Vibber <bvibber@wikimedia.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

namespace MediaWiki\Specials;

use HTMLTextAreaField;
use MediaWiki\Export\WikiExporterFactory;
use MediaWiki\HTMLForm\HTMLForm;
use MediaWiki\Linker\LinksMigration;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\SpecialPage\SpecialPage;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleFormatter;
use WikiExporter;
use Wikimedia\Rdbms\IConnectionProvider;
use Wikimedia\Rdbms\SelectQueryBuilder;

/**
 * A special page that allows users to export pages in a XML file
 *
 * @ingroup SpecialPage
 * @ingroup Dump
 */
class SpecialExport extends SpecialPage {
    // Whether only the current revision is exported; true by default and
    // overwritten from the 'curonly' checkbox on a plain POST in execute().
    protected bool $curonly;
    // Set to true by execute() when the request should stream an export
    // instead of showing the form.
    protected bool $doExport;
    // Validated value of the 'pagelink-depth' request parameter
    // (see validateLinkDepth()); 0 disables link expansion in doExport().
    protected int $pageLinkDepth;
    // Whether the 'templates' request parameter was present, i.e. whether
    // doExport() should also pull in pages found via getTemplates().
    protected bool $templates;

    // Injected services; assigned once in the constructor.
    private IConnectionProvider $dbProvider;
    private WikiExporterFactory $wikiExporterFactory;
    private TitleFormatter $titleFormatter;
    private LinksMigration $linksMigration;

    /**
     * Register the page under the name 'Export' and keep hold of the injected services.
     *
     * @param IConnectionProvider $dbProvider Provides the replica database used for lookups
     * @param WikiExporterFactory $wikiExporterFactory Creates the exporter that writes the dump
     * @param TitleFormatter $titleFormatter Formats the titles of hook-supplied extra pages
     * @param LinksMigration $linksMigration Supplies field and join info for link table queries
     */
    public function __construct(
        IConnectionProvider $dbProvider,
        WikiExporterFactory $wikiExporterFactory,
        TitleFormatter $titleFormatter,
        LinksMigration $linksMigration
    ) {
        parent::__construct( 'Export' );

        $this->linksMigration = $linksMigration;
        $this->titleFormatter = $titleFormatter;
        $this->wikiExporterFactory = $wikiExporterFactory;
        $this->dbProvider = $dbProvider;
    }

    /**
     * Entry point for Special:Export.
     *
     * Reads the request, works out which pages and how much history were asked
     * for, and then either streams the XML export straight to the client or
     * renders the export form.
     *
     * @param string|null $par Subpage part of the request; used as the default
     *   page list for GET requests
     */
    public function execute( $par ) {
        $this->setHeaders();
        $this->outputHeader();
        $config = $this->getConfig();

        $this->curonly = true;
        $this->doExport = false;
        $request = $this->getRequest();
        $this->templates = $request->getCheck( 'templates' );
        $this->pageLinkDepth = $this->validateLinkDepth(
            $request->getIntOrNull( 'pagelink-depth' )
        );
        $nsindex = '';
        $exportall = false;

        if ( $request->getCheck( 'addcat' ) ) {
            // "Add pages from category" button: extend the page list, then show the form again
            $page = $request->getText( 'pages' );
            $catname = $request->getText( 'catname' );

            if ( $catname !== '' ) {
                $t = Title::makeTitleSafe( NS_MAIN, $catname );
                if ( $t ) {
                    /**
                     * @todo FIXME: This can lead to hitting memory limit for very large
                     * categories. Ideally we would do the lookup synchronously
                     * during the export in a single query.
                     */
                    $catpages = $this->getPagesFromCategory( $t );
                    if ( $catpages ) {
                        if ( $page !== '' ) {
                            $page .= "\n";
                        }
                        $page .= implode( "\n", $catpages );
                    }
                }
            }
        } elseif ( $request->getCheck( 'addns' ) && $config->get( MainConfigNames::ExportFromNamespaces ) ) {
            // "Add pages from namespace" button: extend the page list, then show the form again
            $page = $request->getText( 'pages' );
            $nsindex = $request->getText( 'nsindex', '' );

            if ( $nsindex !== '' ) {
                /**
                 * Same implementation as above, so same @todo
                 */
                $nspages = $this->getPagesFromNamespace( (int)$nsindex );
                if ( $nspages ) {
                    // Only insert a separator when there is already something in the
                    // list, so the textarea does not start with an empty line
                    // (same handling as the category branch).
                    if ( $page !== '' ) {
                        $page .= "\n";
                    }
                    $page .= implode( "\n", $nspages );
                }
            }
        } elseif ( $request->getCheck( 'exportall' ) && $config->get( MainConfigNames::ExportAllowAll ) ) {
            $this->doExport = true;
            $exportall = true;

            /* Although $page and $history are not used later on, we
            nevertheless set them to avoid that PHP notices about using
            undefined variables foul up our XML output (see call to
            doExport(...) further down) */
            $page = '';
            $history = '';
        } elseif ( $request->wasPosted() && $par == '' ) {
            // Log to see if certain parameters are actually used.
            // If not, we could deprecate them and do some cleanup, here and in WikiExporter.
            LoggerFactory::getInstance( 'export' )->debug(
                'Special:Export POST, dir: [{dir}], offset: [{offset}], limit: [{limit}]', [
                    'dir' => $request->getRawVal( 'dir' ),
                    'offset' => $request->getRawVal( 'offset' ),
                    'limit' => $request->getRawVal( 'limit' ),
                ] );

            $page = $request->getText( 'pages' );
            $this->curonly = $request->getCheck( 'curonly' );
            $rawOffset = $request->getVal( 'offset' );

            if ( $rawOffset ) {
                $offset = wfTimestamp( TS_MW, $rawOffset );
            } else {
                $offset = null;
            }

            $maxHistory = $config->get( MainConfigNames::ExportMaxHistory );
            $limit = $request->getInt( 'limit' );
            $dir = $request->getVal( 'dir' );
            // Defaults for a partial-history export; narrowed below from the request
            $history = [
                'dir' => 'asc',
                'offset' => false,
                'limit' => $maxHistory,
            ];
            $historyCheck = $request->getCheck( 'history' );

            if ( $this->curonly ) {
                $history = WikiExporter::CURRENT;
            } elseif ( !$historyCheck ) {
                if ( $limit > 0 && ( $maxHistory == 0 || $limit < $maxHistory ) ) {
                    $history['limit'] = $limit;
                }

                if ( $offset !== null ) {
                    $history['offset'] = $offset;
                }

                // 'dir' is null when the parameter is absent; passing null to
                // strtolower() is deprecated as of PHP 8.1.
                if ( strtolower( $dir ?? '' ) === 'desc' ) {
                    $history['dir'] = 'desc';
                }
            }

            if ( $page != '' ) {
                $this->doExport = true;
            }
        } else {
            // Default to current-only for GET requests.
            $page = $request->getText( 'pages', $par ?? '' );
            $historyCheck = $request->getCheck( 'history' );

            if ( $historyCheck ) {
                $history = WikiExporter::FULL;
            } else {
                $history = WikiExporter::CURRENT;
            }

            if ( $page != '' ) {
                $this->doExport = true;
            }
        }

        if ( !$config->get( MainConfigNames::ExportAllowHistory ) ) {
            // Override
            $history = WikiExporter::CURRENT;
        }

        // The author list is only offered for current-revision exports and
        // only when the wiki allows it.
        $list_authors = $request->getCheck( 'listauthors' );
        if ( !$this->curonly || !$config->get( MainConfigNames::ExportAllowListContributors ) ) {
            $list_authors = false;
        }

        if ( $this->doExport ) {
            $this->getOutput()->disable();

            // Cancel output buffering and gzipping if set
            // This should provide safer streaming for pages with history
            wfResetOutputBuffers();
            $request->response()->header( 'Content-type: application/xml; charset=utf-8' );
            $request->response()->header( 'X-Robots-Tag: noindex,nofollow' );

            if ( $request->getCheck( 'wpDownload' ) ) {
                // Provide a sensible filename suggestion
                $filename = urlencode( $config->get( MainConfigNames::Sitename ) . '-' .
                    wfTimestampNow() . '.xml' );
                $request->response()->header( "Content-disposition: attachment;filename={$filename}" );
            }

            // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
            // @phan-suppress-next-line PhanTypeMismatchArgumentNullable history is set when used
            $this->doExport( $page, $history, $list_authors, $exportall );

            return;
        }

        // No export requested (or nothing to export): build and show the form.
        $out = $this->getOutput();
        $out->addWikiMsg( 'exporttext' );

        if ( $page == '' ) {
            $categoryName = $request->getText( 'catname' );
        } else {
            $categoryName = '';
        }
        $canExportAll = $config->get( MainConfigNames::ExportAllowAll );
        // Fields that only make sense for a page list get hidden once "export all" is ticked
        $hideIf = $canExportAll ? [ 'hide-if' => [ '===', 'exportall', '1' ] ] : [];

        $formDescriptor = [
            'catname' => [
                'type' => 'textwithbutton',
                'name' => 'catname',
                'horizontal-label' => true,
                'label-message' => 'export-addcattext',
                'default' => $categoryName,
                'size' => 40,
                'buttontype' => 'submit',
                'buttonname' => 'addcat',
                'buttondefault' => $this->msg( 'export-addcat' )->text(),
            ] + $hideIf,
        ];
        if ( $config->get( MainConfigNames::ExportFromNamespaces ) ) {
            $formDescriptor += [
                'nsindex' => [
                    'type' => 'namespaceselectwithbutton',
                    'default' => $nsindex,
                    'label-message' => 'export-addnstext',
                    'horizontal-label' => true,
                    'name' => 'nsindex',
                    'id' => 'namespace',
                    'cssclass' => 'namespaceselector',
                    'buttontype' => 'submit',
                    'buttonname' => 'addns',
                    'buttondefault' => $this->msg( 'export-addns' )->text(),
                ] + $hideIf,
            ];
        }

        if ( $canExportAll ) {
            $formDescriptor += [
                'exportall' => [
                    'type' => 'check',
                    'label-message' => 'exportall',
                    'name' => 'exportall',
                    'id' => 'exportall',
                    'default' => $request->wasPosted() && $request->getCheck( 'exportall' ),
                ],
            ];
        }

        $formDescriptor += [
            'textarea' => [
                'class' => HTMLTextAreaField::class,
                'name' => 'pages',
                'label-message' => 'export-manual',
                'nodata' => true,
                'rows' => 10,
                'default' => $page,
            ] + $hideIf,
        ];

        if ( $config->get( MainConfigNames::ExportAllowHistory ) ) {
            $formDescriptor += [
                'curonly' => [
                    'type' => 'check',
                    'label-message' => 'exportcuronly',
                    'name' => 'curonly',
                    'id' => 'curonly',
                    'default' => !$request->wasPosted() || $request->getCheck( 'curonly' ),
                ],
            ];
        } else {
            $out->addWikiMsg( 'exportnohistory' );
        }

        $formDescriptor += [
            'templates' => [
                'type' => 'check',
                'label-message' => 'export-templates',
                'name' => 'templates',
                'id' => 'wpExportTemplates',
                'default' => $request->wasPosted() && $request->getCheck( 'templates' ),
            ],
        ];

        if ( $config->get( MainConfigNames::ExportMaxLinkDepth ) || $this->userCanOverrideExportDepth() ) {
            $formDescriptor += [
                'pagelink-depth' => [
                    'type' => 'text',
                    'name' => 'pagelink-depth',
                    'id' => 'pagelink-depth',
                    'label-message' => 'export-pagelinks',
                    'default' => '0',
                    'size' => 20,
                ],
            ];
        }

        $formDescriptor += [
            'wpDownload' => [
                'type' => 'check',
                'name' => 'wpDownload',
                'id' => 'wpDownload',
                'default' => !$request->wasPosted() || $request->getCheck( 'wpDownload' ),
                'label-message' => 'export-download',
            ],
        ];

        if ( $config->get( MainConfigNames::ExportAllowListContributors ) ) {
            $formDescriptor += [
                'listauthors' => [
                    'type' => 'check',
                    'label-message' => 'exportlistauthors',
                    'default' => $request->wasPosted() && $request->getCheck( 'listauthors' ),
                    'name' => 'listauthors',
                    'id' => 'listauthors',
                ],
            ];
        }

        $htmlForm = HTMLForm::factory( 'ooui', $formDescriptor, $this->getContext() );
        $htmlForm->setSubmitTextMsg( 'export-submit' );
        $htmlForm->prepareForm()->displayForm( false );
        $this->addHelpLink( 'Help:Export' );
    }

    /**
     * Check whether the current authority holds the 'override-export-depth' right.
     *
     * @return bool
     */
    protected function userCanOverrideExportDepth() {
        $authority = $this->getAuthority();

        return $authority->isAllowed( 'override-export-depth' );
    }

    /**
     * Stream the XML export for the requested pages, or for the whole wiki.
     *
     * @param string $page User input on what page(s) to export, one title per line
     * @param int|array $history One of the WikiExporter history export constants,
     *   or an array with 'dir', 'offset' and 'limit' keys as built by execute()
     * @param bool $list_authors Whether to add distinct author list (when
     *   not returning full history)
     * @param bool $exportall Whether to export everything
     */
    protected function doExport( $page, $history, $list_authors, $exportall ) {
        // Stays empty when exporting everything; the list is not consulted in that case
        $pages = [];

        if ( $exportall ) {
            // If we are grabbing everything, enable full history and ignore the rest
            $history = WikiExporter::FULL;
        } else {
            // Inverted index (prefixed title => true) so each page is only recorded once
            $pageSet = [];

            // Split up and normalize input
            foreach ( explode( "\n", $page ) as $line ) {
                $candidate = Title::newFromText( trim( $line ) );
                if ( !$candidate || $candidate->isExternal() || $candidate->getText() === '' ) {
                    continue;
                }
                $pageSet[$candidate->getPrefixedText()] = true;
            }

            // The pages the user actually typed, used as the starting point for expansion
            $inputPages = array_keys( $pageSet );

            // Look up any linked pages if asked...
            if ( $this->templates ) {
                $pageSet = $this->getTemplates( $inputPages, $pageSet );
            }
            $pageSet = $this->getExtraPages( $inputPages, $pageSet );
            if ( $this->pageLinkDepth ) {
                $pageSet = $this->getPageLinks( $inputPages, $pageSet, $this->pageLinkDepth );
            }

            // Normalize titles to the same format and remove dupes, see T19374
            $pages = array_unique( str_replace( ' ', '_', array_keys( $pageSet ) ) );
        }

        /* Ok, let's get to it... */
        $exporter = $this->wikiExporterFactory->getWikiExporter(
            $this->dbProvider->getReplicaDatabase(),
            $history
        );
        $exporter->list_authors = $list_authors;
        $exporter->openStream();

        if ( $exportall ) {
            $exporter->allPages();
        } else {
            foreach ( $pages as $pageName ) {
                $title = Title::newFromText( $pageName );

                # T10824: Only export pages the user can read
                if ( $title === null || !$this->getAuthority()->authorizeRead( 'read', $title ) ) {
                    // @todo Perhaps output an <error> tag or something.
                    continue;
                }

                $exporter->pageByTitle( $title );
            }
        }

        $exporter->closeStream();
    }

    /**
     * List pages that have a categorylinks entry for the given category,
     * capped at the configured ExportPagelistLimit.
     *
     * @param Title $title Title whose DB key is matched against cl_to
     * @return string[] Page names built from each row's namespace and title
     */
    protected function getPagesFromCategory( $title ) {
        $limit = $this->getConfig()->get( MainConfigNames::ExportPagelistLimit );
        $categoryKey = $title->getDBkey();

        $query = $this->dbProvider->getReplicaDatabase()
            ->newSelectQueryBuilder()
            ->select( [ 'page_namespace', 'page_title' ] )
            ->from( 'page' )
            ->join( 'categorylinks', null, 'cl_from=page_id' )
            ->where( [ 'cl_to' => $categoryKey ] )
            ->limit( $limit )
            ->caller( __METHOD__ );

        $names = [];
        foreach ( $query->fetchResultSet() as $row ) {
            $names[] = Title::makeName( $row->page_namespace, $row->page_title );
        }

        return $names;
    }

    /**
     * List pages in the given namespace, capped at the configured ExportPagelistLimit.
     *
     * @param int $nsindex Namespace index matched against page_namespace
     * @return string[] Page names built from each row's namespace and title
     */
    protected function getPagesFromNamespace( $nsindex ) {
        $limit = $this->getConfig()->get( MainConfigNames::ExportPagelistLimit );

        $query = $this->dbProvider->getReplicaDatabase()
            ->newSelectQueryBuilder()
            ->select( [ 'page_namespace', 'page_title' ] )
            ->from( 'page' )
            ->where( [ 'page_namespace' => $nsindex ] )
            ->limit( $limit )
            ->caller( __METHOD__ );

        $names = [];
        foreach ( $query->fetchResultSet() as $row ) {
            $names[] = Title::makeName( $row->page_namespace, $row->page_title );
        }

        return $names;
    }

    /**
     * Add the targets of the templatelinks entries of the given pages to the page set.
     *
     * @param array $inputPages List of titles to look up
     * @param array $pageSet Associative array indexed by titles for output
     * @return array Associative array indexed by titles
     */
    protected function getTemplates( $inputPages, $pageSet ) {
        $migration = $this->linksMigration;
        [ $nsField, $titleField ] = $migration->getTitleFields( 'templatelinks' );
        $queryInfo = $migration->getQueryInfo( 'templatelinks' );

        // templatelinks itself is joined explicitly below, so only the
        // remaining tables from the migration info are added.
        $extraTables = array_diff( $queryInfo['tables'], [ 'templatelinks' ] );

        $queryBuilder = $this->dbProvider->getReplicaDatabase()
            ->newSelectQueryBuilder()
            ->caller( __METHOD__ )
            ->select( [ 'namespace' => $nsField, 'title' => $titleField ] )
            ->from( 'page' )
            ->join( 'templatelinks', null, 'page_id=tl_from' )
            ->tables( $extraTables )
            ->joinConds( $queryInfo['joins'] );

        return $this->getLinks( $inputPages, $pageSet, $queryBuilder );
    }

    /**
     * Let the SpecialExportGetExtraPages hook contribute pages and merge them into the set.
     *
     * @param string[] $inputPages List of page titles to export
     * @param bool[] $pageSet Initial associative array indexed by string page titles
     * @return bool[] Associative array indexed by string page titles including extra pages
     */
    private function getExtraPages( $inputPages, $pageSet ) {
        // Filled in by the hook handlers
        $hookPages = [];
        $this->getHookRunner()->onSpecialExportGetExtraPages( $inputPages, $hookPages );

        foreach ( $hookPages as $hookPage ) {
            $key = $this->titleFormatter->getPrefixedText( $hookPage );
            $pageSet[$key] = true;
        }

        return $pageSet;
    }

    /**
     * Validate link depth setting, if available.
     *
     * A missing or negative depth yields 0. Users without the
     * 'override-export-depth' right are limited to the configured
     * ExportMaxLinkDepth. In every case the result is additionally capped at
     * 5 levels.
     *
     * @param int|null $depth Requested depth, or null when none was given
     * @return int Depth to use, between 0 and 5
     */
    protected function validateLinkDepth( $depth ) {
        if ( $depth === null || $depth < 0 ) {
            return 0;
        }

        if ( !$this->userCanOverrideExportDepth() ) {
            $maxLinkDepth = (int)$this->getConfig()->get( MainConfigNames::ExportMaxLinkDepth );
            // Clamp rather than return early: returning the configured maximum
            // directly would skip the hard cap below whenever the wiki is
            // configured with a maximum above 5.
            $depth = min( $depth, max( 0, $maxLinkDepth ) );
        }

        /*
         * There's a HARD CODED limit of 5 levels of recursion here to prevent a
         * crazy-big export from being done by someone setting the depth
         * number too high. In other words, last resort safety net.
         */

        return intval( min( $depth, 5 ) );
    }

    /**
     * Expand a list of pages to include pages linked to from that page.
     *
     * Each level looks up the pagelinks targets of the pages known so far.
     * The first level starts from $inputPages; later levels start from
     * everything in the page set. A page whose links were already fetched on
     * an earlier level is not queried again, since doing so could not add
     * anything new to the set.
     *
     * @param array $inputPages Page titles to start from
     * @param array $pageSet Associative array indexed by titles
     * @param int $depth Number of link levels to follow; values <= 0 leave the set unchanged
     * @return array Associative array indexed by titles, including linked pages
     */
    protected function getPageLinks( $inputPages, $pageSet, $depth ) {
        if ( $depth <= 0 ) {
            return $pageSet;
        }

        // The query skeleton does not depend on the level, so build it once.
        // getLinks() clones it before adding per-page conditions.
        [ $nsField, $titleField ] = $this->linksMigration->getTitleFields( 'pagelinks' );
        $queryInfo = $this->linksMigration->getQueryInfo( 'pagelinks' );
        $dbr = $this->dbProvider->getReplicaDatabase();
        $queryBuilder = $dbr->newSelectQueryBuilder()
            ->caller( __METHOD__ )
            ->select( [ 'namespace' => $nsField, 'title' => $titleField ] )
            ->from( 'page' )
            ->join( 'pagelinks', null, 'page_id=pl_from' )
            ->tables( array_diff( $queryInfo['tables'], [ 'pagelinks' ] ) )
            ->joinConds( $queryInfo['joins'] );

        // Titles whose links have already been looked up on an earlier level
        $expanded = [];
        $firstLevel = true;

        for ( ; $depth > 0; --$depth ) {
            // Only pages not yet expanded need a query on this level
            $frontier = [];
            foreach ( $inputPages as $pageName ) {
                if ( !isset( $expanded[$pageName] ) ) {
                    $expanded[$pageName] = true;
                    $frontier[] = $pageName;
                }
            }

            if ( $frontier ) {
                $pageSet = $this->getLinks( $frontier, $pageSet, $queryBuilder );
            } elseif ( !$firstLevel ) {
                // Everything in the set has been expanded already; deeper
                // levels cannot discover further pages.
                break;
            }

            $firstLevel = false;
            $inputPages = array_keys( $pageSet );
        }

        return $pageSet;
    }

    /**
     * Expand a list of pages to include items used in those pages.
     *
     * Every input page that parses as a title is added to the set itself,
     * followed by each (namespace, title) row the query returns for it.
     *
     * @param array $inputPages Array of page titles
     * @param array $pageSet Associative array indexed by titles
     * @param SelectQueryBuilder $queryBuilder Query skeleton; cloned per page and
     *   narrowed to that page's namespace and DB key
     * @return array Associative array indexed by titles
     */
    protected function getLinks( $inputPages, $pageSet, SelectQueryBuilder $queryBuilder ) {
        foreach ( $inputPages as $pageName ) {
            $source = Title::newFromText( $pageName );
            if ( !$source ) {
                continue;
            }

            $pageSet[$source->getPrefixedText()] = true;

            /// @todo FIXME: May or may not be more efficient to batch these
            ///        by namespace when given multiple input pages.
            $lookup = clone $queryBuilder;
            $rows = $lookup
                ->where( [
                    'page_namespace' => $source->getNamespace(),
                    'page_title' => $source->getDBkey()
                ] )
                ->fetchResultSet();

            foreach ( $rows as $row ) {
                $target = Title::makeTitle( $row->namespace, $row->title );
                $pageSet[$target->getPrefixedText()] = true;
            }
        }

        return $pageSet;
    }

    /**
     * Report the group name for this special page.
     *
     * NOTE(review): presumably this decides which section of the special page
     * list the page appears under — confirm against SpecialPage.
     *
     * @return string Always 'pagetools'
     */
    protected function getGroupName() {
        return 'pagetools';
    }
}

// Keep the un-namespaced name 'SpecialExport' usable as an alias of the namespaced class.
/** @deprecated class alias since 1.41 */
class_alias( SpecialExport::class, 'SpecialExport' );