wikimedia/pywikibot

View on GitHub
pywikibot/pagegenerators/__init__.py

Summary

Maintainability
A
1 hr
Test Coverage
"""This module offers a wide variety of page generators.

A page generator is an object that is iterable (see :pep:`255`) and
that yields page objects on which other scripts can then work.

Most of these functions just wrap a Site or Page method that returns a
generator. For testing purposes listpages.py can be used, to print page
titles to standard output.

These parameters are supported to specify which page titles are to be used:

&params;
"""
#
# (C) Pywikibot team, 2008-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pywikibot
from pywikibot.backports import Callable, Generator, Iterable
from pywikibot.pagegenerators._factory import GeneratorFactory
from pywikibot.pagegenerators._filters import (
    CategoryFilterPageGenerator,
    EdittimeFilterPageGenerator,
    ItemClaimFilterPageGenerator,
    NamespaceFilterPageGenerator,
    PageTitleFilterPageGenerator,
    QualityFilterPageGenerator,
    RedirectFilterPageGenerator,
    RegexBodyFilterPageGenerator,
    RegexFilterPageGenerator,
    SubpageFilterGenerator,
    UserEditFilterGenerator,
    WikibaseItemFilterPageGenerator,
)
from pywikibot.pagegenerators._generators import (
    AllpagesPageGenerator,
    AncientPagesPageGenerator,
    CategorizedPageGenerator,
    DayPageGenerator,
    DeadendPagesPageGenerator,
    FileLinksGenerator,
    GoogleSearchPageGenerator,
    ImagesPageGenerator,
    InterwikiPageGenerator,
    LanguageLinksPageGenerator,
    LinkedPageGenerator,
    LinksearchPageGenerator,
    LiveRCPageGenerator,
    LogeventsPageGenerator,
    LonelyPagesPageGenerator,
    LongPagesPageGenerator,
    MySQLPageGenerator,
    NewimagesPageGenerator,
    NewpagesPageGenerator,
    PagePilePageGenerator,
    PagesFromPageidGenerator,
    PagesFromTitlesGenerator,
    PetScanPageGenerator,
    PrefixingPageGenerator,
    RandomPageGenerator,
    RandomRedirectPageGenerator,
    RecentChangesPageGenerator,
    SearchPageGenerator,
    ShortPagesPageGenerator,
    SubCategoriesPageGenerator,
    SupersetPageGenerator,
    TextIOPageGenerator,
    UnCategorizedCategoryGenerator,
    UnCategorizedImageGenerator,
    UnCategorizedPageGenerator,
    UnCategorizedTemplateGenerator,
    UnconnectedPageGenerator,
    UnusedFilesGenerator,
    UnwatchedPagesPageGenerator,
    UserContributionsGenerator,
    WantedPagesPageGenerator,
    WikibaseItemGenerator,
    WikibaseSearchItemPageGenerator,
    WikidataPageFromItemGenerator,
    WikidataSPARQLPageGenerator,
    WithoutInterwikiPageGenerator,
    XMLDumpPageGenerator,
    YearPageGenerator,
    page_with_property_generator,
)
from pywikibot.tools.collections import DequeGenerator


# Public API of this package: the names imported above from the private
# ``_factory``, ``_filters`` and ``_generators`` submodules, followed by
# the generators that are defined in this module itself.
__all__ = (
    # factory
    'GeneratorFactory',
    # filter
    'CategoryFilterPageGenerator',
    'EdittimeFilterPageGenerator',
    'ItemClaimFilterPageGenerator',
    'NamespaceFilterPageGenerator',
    'PageTitleFilterPageGenerator',
    'QualityFilterPageGenerator',
    'RedirectFilterPageGenerator',
    'RegexBodyFilterPageGenerator',
    'RegexFilterPageGenerator',
    'SubpageFilterGenerator',
    'UserEditFilterGenerator',
    'WikibaseItemFilterPageGenerator',
    # page generators
    'AllpagesPageGenerator',
    'AncientPagesPageGenerator',
    'CategorizedPageGenerator',
    'DayPageGenerator',
    'DeadendPagesPageGenerator',
    'FileLinksGenerator',
    'GoogleSearchPageGenerator',
    'ImagesPageGenerator',
    'InterwikiPageGenerator',
    'LanguageLinksPageGenerator',
    'LinkedPageGenerator',
    'LinksearchPageGenerator',
    'LiveRCPageGenerator',
    'LogeventsPageGenerator',
    'LonelyPagesPageGenerator',
    'LongPagesPageGenerator',
    'MySQLPageGenerator',
    'NewimagesPageGenerator',
    'NewpagesPageGenerator',
    'page_with_property_generator',
    'PagesFromPageidGenerator',
    'PagesFromTitlesGenerator',
    'PagePilePageGenerator',
    'PetScanPageGenerator',
    'PrefixingPageGenerator',
    'RandomPageGenerator',
    'RandomRedirectPageGenerator',
    'RecentChangesPageGenerator',
    'SearchPageGenerator',
    'ShortPagesPageGenerator',
    'SubCategoriesPageGenerator',
    'SupersetPageGenerator',
    'TextIOPageGenerator',
    'UnCategorizedCategoryGenerator',
    'UnCategorizedImageGenerator',
    'UnCategorizedPageGenerator',
    'UnCategorizedTemplateGenerator',
    'UnconnectedPageGenerator',
    'UnusedFilesGenerator',
    'UnwatchedPagesPageGenerator',
    'UserContributionsGenerator',
    'WantedPagesPageGenerator',
    'WikibaseItemGenerator',
    'WikibaseSearchItemPageGenerator',
    'WikidataPageFromItemGenerator',
    'WikidataSPARQLPageGenerator',
    'WithoutInterwikiPageGenerator',
    'XMLDumpPageGenerator',
    'YearPageGenerator',
    # other generators
    'DequePreloadingGenerator',
    'PageClassGenerator',
    'PageWithTalkPageGenerator',
    'PreloadingEntityGenerator',
    'PreloadingGenerator',
    'RepeatingGenerator',
)


# Command line help for the generator and filter options. The text is
# substituted for the ``&params;`` marker in module docstrings, so it is
# written in reStructuredText.
parameterHelp = """\
GENERATOR OPTIONS
=================

-cat                Work on all pages which are in a specific category.
                    Argument can also be given as "-cat:categoryname" or
                    as "-cat:categoryname|fromtitle" (using # instead of |
                    is also allowed in this one and the following)

-catr               Like -cat, but also recursively includes pages in
                    subcategories, sub-subcategories etc. of the
                    given category.
                    Argument can also be given as "-catr:categoryname" or
                    as "-catr:categoryname|fromtitle".

-subcats            Work on all subcategories of a specific category.
                    Argument can also be given as "-subcats:categoryname" or
                    as "-subcats:categoryname|fromtitle".

-subcatsr           Like -subcats, but also includes sub-subcategories etc. of
                    the given category.
                    Argument can also be given as "-subcatsr:categoryname" or
                    as "-subcatsr:categoryname|fromtitle".

-uncat              Work on all pages which are not categorised.

-uncatcat           Work on all categories which are not categorised.

-uncatfiles         Work on all files which are not categorised.

-file               Read a list of pages to treat from the named text file.
                    Page titles in the file may be either enclosed with
                    [[brackets]], or be separated by new lines.
                    Argument can also be given as "-file:filename".

-filelinks          Work on all pages that use a certain image/media file.
                    Argument can also be given as "-filelinks:filename".

-search             Work on all pages that are found in a MediaWiki search
                    across all namespaces.

-logevents          Work on articles that were on a specified Special:Log.
                    The value may be a comma separated list of these values::

                        logevent,username,start,end

                    .. deprecated:: 9.2
                       backward compatible *total* argument like
                       ``logevent,username,total``; use ``-limit`` filter
                       option instead (see below).

                    To use the default value, use an empty string.

                    .. note:: 'start' is the most recent date and log
                       events are iterated from present to past. If
                       'start' is not provided, it means 'now'; if 'end'
                       is not provided, it means 'since the beginning'.

                    .. seealso::
                       *letype* of :api:`Logevents` for the supported
                       types of log events.

                    **Examples:**

                    ``-logevents:move`` gives pages from move log (usually
                    redirects)

                    ``-logevents:delete -limit:20`` gives 20 pages from
                    deletion log

                    ``-logevents:protect,Usr`` gives pages from protect log by
                    user Usr

                    ``-logevents:patrol,Usr -limit:20`` gives 20 patrolled
                    pages by Usr

                    ``-logevents:upload,,20121231,20100101`` gives upload pages
                    in 2010, 2011, and 2012

                    ``-logevents:review,,20121231`` gives review pages since
                    the beginning till the 31 Dec 2012

                    ``-logevents:review,Usr,20121231`` gives review pages by
                    user Usr since the beginning till the 31 Dec 2012

                    In some cases it must be given as
                    ``-logevents:"move,Usr,20"``

-interwiki          Work on the given page and all equivalent pages in other
                    languages. This can, for example, be used to fight
                    multi-site spamming.
                    Attention: this will cause the bot to modify
                    pages on several wiki sites, this is not well tested,
                    so check your edits!

-links              Work on all pages that are linked from a certain page.
                    Argument can also be given as "-links:linkingpagetitle".

-liverecentchanges  Work on pages from the live recent changes feed. If used as
                    -liverecentchanges:x, work on x recent changes.

-imagesused         Work on all images that are contained on a certain page.
                    Can also be given as "-imagesused:linkingpagetitle".

-newimages          Work on the most recent new images. If given as
                    -newimages:x, will work on x newest images.

-newpages           Work on the most recent new pages. If given as -newpages:x,
                    will work on x newest pages.

-recentchanges      Work on the pages with the most recent changes. If
                    given as -recentchanges:x, will work on the x most recently
                    changed pages. If given as -recentchanges:offset,duration
                    it will work on pages changed from 'offset' minutes with
                    'duration' minutes of timespan. rctags are supported too.
                    The rctag must be the very first parameter part.

                    **Examples:**

                    ``-recentchanges:20`` gives the 20 most recently changed
                    pages

                    ``-recentchanges:120,70`` will give pages with 120 offset
                    minutes and 70 minutes of timespan

                    ``-recentchanges:visualeditor,10`` gives the 10 most
                    recently changed pages marked with 'visualeditor'

                    ``-recentchanges:"mobile edit,60,35"`` will retrieve pages
                    marked with 'mobile edit' for the given offset and timespan

-unconnectedpages   Work on the most recent unconnected pages to the Wikibase
                    repository. Given as -unconnectedpages:x, will work on the
                    x most recent unconnected pages.

-ref                Work on all pages that link to a certain page.
                    Argument can also be given as "-ref:referredpagetitle".

-start              Specifies that the robot should go alphabetically through
                    all pages on the home wiki, starting at the named page.
                    Argument can also be given as "-start:pagetitle".

                    You can also include a namespace. For example,
                    "-start:Template:!" will make the bot work on all pages
                    in the template namespace.

                    default value is start:!

-prefixindex        Work on pages commencing with a common prefix.

-transcludes        Work on all pages that use a certain template.
                    Argument can also be given as "-transcludes:Title".

-unusedfiles        Work on all description pages of images/media files that
                    are not used anywhere.
                    Argument can be given as "-unusedfiles:n" where
                    n is the maximum number of articles to work on.

-lonelypages        Work on all articles that are not linked from any other
                    article.
                    Argument can be given as "-lonelypages:n" where
                    n is the maximum number of articles to work on.

-unwatched          Work on all articles that are not watched by anyone.
                    Argument can be given as "-unwatched:n" where
                    n is the maximum number of articles to work on.

-property           Work on all pages with a given property name from
                    Special:PagesWithProp. Usage:

                        -property:name

-usercontribs       Work on all articles that were edited by a certain user.
                    (Example : -usercontribs:DumZiBoT)

-weblink            Work on all articles that contain an external link to
                    a given URL; may be given as "-weblink:url"

-withoutinterwiki   Work on all pages that don't have interlanguage links.
                    Argument can be given as "-withoutinterwiki:n" where
                    n is the total to fetch.

-mysqlquery         Takes a MySQL query string like
                    "SELECT page_namespace, page_title FROM page
                    WHERE page_namespace = 0"
                    and treats the resulting pages. See :manpage:`MySQL`
                    for more details.

-supersetquery      Takes a SQL query string like
                    "SELECT page_namespace, page_title FROM page
                    WHERE page_namespace = 0" and run it in
                    https://superset.wmcloud.org/ and treats
                    the resulting pages.

-sparql             Takes a SPARQL SELECT query string including ?item
                    and works on the resulting pages.

-sparqlendpoint     Specify SPARQL endpoint URL (optional).
                    (Example: -sparqlendpoint:http://myserver.com/sparql)

-searchitem         Takes a search string and works on Wikibase pages that
                    contain it.
                    Argument can be given as "-searchitem:text", where text
                    is the string to look for, or "-searchitem:lang:text",
                    where lang is the language to search items in.

-wantedpages        Work on pages that are linked, but do not exist;
                    may be given as "-wantedpages:n" where n is the maximum
                    number of articles to work on.

-wantedcategories   Work on categories that are used, but do not exist;
                    may be given as "-wantedcategories:n" where n is the
                    maximum number of categories to work on.

-wantedfiles        Work on files that are used, but do not exist;
                    may be given as "-wantedfiles:n" where n is the maximum
                    number of files to work on.

-wantedtemplates    Work on templates that are used, but do not exist;
                    may be given as "-wantedtemplates:n" where n is the
                    maximum number of templates to work on.

-random             Work on random pages returned by [[Special:Random]].
                    Can also be given as "-random:n" where n is the number
                    of pages to be returned.

-randomredirect     Work on random redirect pages returned by
                    [[Special:RandomRedirect]]. Can also be given as
                    "-randomredirect:n" where n is the number of pages to be
                    returned.

-google             Work on all pages that are found in a Google search.
                    You need a Google Web API license key. Note that Google
                    doesn't give out license keys anymore. See google_key in
                    config.py for instructions.
                    Argument can also be given as "-google:searchstring".

-page               Work on a single page. Argument can also be given as
                    "-page:pagetitle", and supplied multiple times for
                    multiple pages.

-pageid             Work on a single pageid. Argument can also be given as
                    "-pageid:pageid1,pageid2,.." or
                    "-pageid:'pageid1|pageid2|..'"
                    and supplied multiple times for multiple pages.

-pagepile           Work on a PagePile. Argument is the pile id (an integer)

-linter             Work on pages that contain lint errors. Extension Linter
                    must be available on the site.
                    -linter select all categories.
                    -linter:high, -linter:medium or -linter:low select all
                    categories for that prio.
                    Single categories can be selected with commas as in
                    -linter:cat1,cat2,cat3

                    Adding '/int' identifies Lint ID to start querying from:
                    e.g. -linter:high/10000

                    -linter:show just shows available categories.

-querypage          Work on pages provided by a QueryPage-based special
                    page. Usage:

                        -querypage:name

                    ``-querypage`` without argument shows special pages
                    available.

                    .. seealso:: :api:`Querypage`

-url                Read a list of pages to treat from the provided URL.
                    The URL must return text in the same format as expected for
                    the -file argument, e.g. page titles separated by newlines
                    or enclosed in brackets.

.. tip::
   use ``-limit:n`` filter option to fetch only n pages.

FILTER OPTIONS
==============

-catfilter          Filter the page generator to only yield pages in the
                    specified category. See -cat generator for argument format.

-grep               A regular expression that needs to match the article
                    otherwise the page won't be returned.
                    Multiple -grep:regexpr can be provided and the page will
                    be returned if content is matched by any of the regexpr
                    provided.
                    Case insensitive regular expressions will be used and
                    dot matches any character, including a newline.

-grepnot            Like -grep, but return the page only if the regular
                    expression does not match.

-intersect          Work on the intersection of all the provided generators.

-limit              When used with any other argument ``-limit:n``
                    specifies a set of pages, work on no more than n
                    pages in total. If used with multiple generators,
                    pages are yielded in a roundrobin way.

-namespaces         Filter the page generator to only yield pages in the
-namespace          specified namespaces. Separate multiple namespace
-ns                 numbers or names with commas.

                    Examples::

                    -ns:0,2,4
                    -ns:Help,MediaWiki

                    You may use a leading "not" to exclude the namespace.

                    Examples::

                    -ns:not:2,3
                    -ns:not:Help,File

                    If used with -newpages/-random/-randomredirect/-linter
                    generators, -namespace/ns must be provided before
                    -newpages/-random/-randomredirect/-linter.
                    If used with -recentchanges generator, efficiency is
                    improved if -namespace is provided before -recentchanges.

                    If used with -start generator, -namespace/ns shall contain
                    only one value.

-onlyif             A claim the page needs to contain, otherwise the item won't
                    be returned.
                    The format is property=value,qualifier=value. Multiple (or
                    none) qualifiers can be passed, separated by commas.

                    Examples:

                    .. code-block:: shell

                       P1=Q2 (property P1 must contain value Q2),
                       P3=Q4,P5=Q6,P6=Q7 (property P3 with value Q4 and
                       qualifiers: P5 with value Q6 and P6 with value Q7).

                    Value can be page ID, coordinate in format:
                    latitude,longitude[,precision] (all values are in decimal
                    degrees), year, or plain string.

                    The argument can be provided multiple times and the item
                    page will be returned only if all claims are present.
                    Argument can be also given as "-onlyif:expression".

-onlyifnot          A claim the page must not contain, otherwise the
                    item won't be returned. For usage and examples, see
                    `-onlyif` above.

-ql                 Filter pages based on page quality.
                    This is only applicable if contentmodel equals
                    'proofread-page', otherwise has no effects.
                    Valid values are in range 0-4.
                    Multiple values can be comma-separated.

-redirect           Filter pages based on whether they are redirects. To return
                    only pages that are not redirects, use -redirect:false

-subpage            -subpage:n filters pages to only those that have depth n
                    i.e. a depth of 0 filters out all pages that are subpages,
                    and a depth of 1 filters out all pages that are subpages of
                    subpages.


-titleregex         A regular expression that needs to match the article title
                    otherwise the page won't be returned.
                    Multiple -titleregex:regexpr can be provided and the page
                    will be returned if title is matched by any of the regexpr
                    provided.
                    Case insensitive regular expressions will be used and
                    dot matches any character.

-titleregexnot      Like -titleregex, but return the page only if the regular
                    expression does not match.
"""  # noqa: N816

# Mapping of the docstring marker to the text it is replaced with.
docuReplacements = {'&params;': parameterHelp}  # noqa: N816

if TYPE_CHECKING:
    # Alias used for annotations only: pages grouped by the site they
    # belong to (see PreloadingGenerator).
    PRELOAD_SITE_TYPE = dict[pywikibot.site.BaseSite,
                             list[pywikibot.page.Page]]

# If a bot uses GeneratorFactory, its module should include the line
#   docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
# and include the marker &params; in the module's docstring.
#
# We manually apply the replacement here so the parameters show up in the
# auto-generated module documentation:

__doc__ = __doc__.replace('&params;', parameterHelp)


def PageClassGenerator(generator: Iterable[pywikibot.page.Page]
                       ) -> Generator[pywikibot.page.Page, None, None]:
    """Convert pages of another generator to namespace specific classes.

    A page in the user, file or category namespace is wrapped as
    :class:`pywikibot.User`, :class:`pywikibot.FilePage` or
    :class:`pywikibot.Category` respectively. A page of any other
    namespace is yielded unchanged.

    :param generator: pages to iterate over
    """
    for item in generator:
        ns = item.namespace()
        builtin = item.site.namespaces
        if ns == builtin.USER:
            item = pywikibot.User(item)
        elif ns == builtin.FILE:
            item = pywikibot.FilePage(item)
        elif ns == builtin.CATEGORY:
            item = pywikibot.Category(item)
        yield item


def PageWithTalkPageGenerator(
    generator: Iterable[pywikibot.page.BasePage],
    return_talk_only: bool = False,
) -> Generator[pywikibot.page.BasePage, None, None]:
    """Yield pages of another generator together with their talk pages.

    A talk page coming from the generator is passed through as it is.
    For every other page its talk page is yielded as well, without
    checking whether that talk page actually exists.

    :param generator: pages to iterate over
    :param return_talk_only: if True, non-talk pages themselves are
        skipped and only talk pages are yielded
    """
    for page in generator:
        if page.isTalkPage():
            yield page
            continue

        if not return_talk_only:
            yield page
        yield page.toggleTalkPage()


def RepeatingGenerator(
    generator: Callable[..., Iterable[pywikibot.page.BasePage]],
    key_func: Callable[[pywikibot.page.BasePage], Any] = lambda x: x,
    sleep_duration: int = 60,
    total: int | None = None,
    **kwargs: Any,
) -> Generator[pywikibot.page.Page, None, None]:
    """Poll a generator function repeatedly and yield its new items.

    The given function is called with a 'total' keyword argument and is
    expected to deliver its newest items first, like
    site.recentchanges() or site.logevents() do. Items are collected
    until an already known key shows up, then handed out oldest first.

    To fetch revisions in recentchanges in live time::

        gen = RepeatingGenerator(site.recentchanges, lambda x: x['revid'])

    To fetch new pages in live time::

        gen = RepeatingGenerator(site.newpages, lambda x: x[0])

    Any additional keyword argument is forwarded to the generator
    function, except 'reverse', 'start' and 'end' which are always
    dropped so that items cannot be delivered in wrong order.

    :param generator: a function returning a generator that will be queried
    :param key_func: a function returning the key which is used to
        detect duplicate entries
    :param sleep_duration: duration passed to :func:`pywikibot.sleep`
        between two queries
    :param total: maximum number of items to yield; with None (default)
        iterate forever. A value of zero or less yields nothing.
    :return: a generator yielding items in ascending order by time
    """
    # newest item first, no start time, no stop time
    for dropped in ('reverse', 'start', 'end'):
        kwargs.pop(dropped, None)

    known_keys: set[Any] = set()
    while total is None or len(known_keys) < total:
        fresh = []
        limit_reached = False
        # the very first query only fetches the single newest item
        for entry in generator(total=None if known_keys else 1, **kwargs):
            entry_key = key_func(entry)
            if entry_key in known_keys:
                # everything from here on was delivered already
                break

            known_keys.add(entry_key)
            fresh.append(entry)
            if len(known_keys) == total:
                limit_reached = True
                break

        # no need to wait if nothing will be queried anymore
        if not limit_reached:
            pywikibot.sleep(sleep_duration)

        fresh.reverse()
        yield from fresh


def PreloadingGenerator(generator: Iterable[pywikibot.page.Page],
                        groupsize: int = 50,
                        quiet: bool = False
                        ) -> Generator[pywikibot.page.Page, None, None]:
    """Yield preloaded pages taken from another generator.

    Pages are collected per site and handed to the ``preloadpages``
    method of that site as soon as a batch is complete. Batches which
    are still incomplete when *generator* is exhausted are preloaded at
    the end.

    :param generator: pages to iterate over
    :param groupsize: how many pages to preload at once; for each site
        this is capped by the ``maxlimit`` of that site
    :param quiet: If False (default), show the "Retrieving pages"
        message
    """
    # pages may be on more than one site, for example if an interwiki
    # generator is used, so use a separate preloader for each site
    sites: PRELOAD_SITE_TYPE = {}
    # build a list of pages for each site found in the iterator
    for page in generator:
        site = page.site
        batch = sites.setdefault(site, [])
        batch.append(page)

        # Cap the batch size for this site only; rebinding the shared
        # groupsize would let one site with a small maxlimit shrink the
        # batches of all other sites for the rest of the run.
        limit = min(groupsize, site.maxlimit)
        if len(batch) >= limit:
            # if this site is at the groupsize, process it
            del sites[site]
            yield from site.preloadpages(batch, groupsize=limit,
                                         quiet=quiet)

    for site, pages in sites.items():
        # process any leftover sites that never reached the groupsize
        yield from site.preloadpages(pages,
                                     groupsize=min(groupsize, site.maxlimit),
                                     quiet=quiet)


def DequePreloadingGenerator(
    generator: DequeGenerator,
    groupsize: int = 50,
    quiet: bool = False,
) -> Generator[pywikibot.page.Page, None, None]:
    """Preload the pages of a DequeGenerator batch by batch.

    The length of *generator* is read again before every batch and
    iteration ends as soon as that length is zero.

    :param generator: pages to iterate over
    :param groupsize: how many pages to preload at once
    :param quiet: If False (default), show the "Retrieving pages"
        message
    """
    assert isinstance(generator, DequeGenerator), \
        'generator must be a DequeGenerator object'

    batch_size = min(len(generator), groupsize)
    while batch_size:
        yield from PreloadingGenerator(generator, batch_size, quiet)
        batch_size = min(len(generator), groupsize)


def PreloadingEntityGenerator(
    generator: Iterable[pywikibot.page.WikibaseEntity],
    groupsize: int = 50,
) -> Generator[pywikibot.page.WikibaseEntity, None, None]:
    """Yield preloaded Wikibase entities taken from another generator.

    Entities are collected per site. Once *groupsize* entities of a site
    are gathered they are passed to ``preload_entities`` of the data
    repository of that site. Incomplete batches are preloaded after
    *generator* is exhausted.

    :param generator: pages to iterate over
    :param groupsize: how many pages to preload at once
    """
    pending: dict[pywikibot.site.BaseSite,
                  list[pywikibot.page.WikibaseEntity]] = {}
    for entity in generator:
        site = entity.site
        batch = pending.setdefault(site, [])
        batch.append(entity)
        if len(batch) < groupsize:
            continue

        # the batch of this site is complete, preload it now
        del pending[site]
        yield from site.data_repository().preload_entities(batch, groupsize)

    # preload the batches which never reached the groupsize
    for site, batch in pending.items():
        yield from site.data_repository().preload_entities(batch, groupsize)