wikimedia/pywikibot

View on GitHub
scripts/category_graph.py

Summary

Maintainability
A
55 mins
Test Coverage
#!/usr/bin/env python3
r"""Visualizes category hierarchy.

Generates a graphical representation of the category hierarchy
in dot, svg and html5 formats.

Usage:

    pwb.py category_graph [-style STYLE] [-depth DEPTH] [-downsize K] \
        [-from FROM] [-to TO]

actions:

-from [FROM]   Category name to scan, default is main category, "?" to ask.

optional arguments:

-to TO         base file name to save, "?" to ask
-style STYLE   graphviz style definitions in dot format (see below)
-depth DEPTH   maximal hierarchy depth. 2 by default
-downsize K    font size divider for subcategories. 4 by default
               Use 1 for the same font size

.. seealso:: https://graphviz.org/doc/info/attrs.html
   for graphviz style definitions.

Example
-------

Visualizes main category:

    pwb.py -v category_graph -from

Extended example with style settings:

    pwb.py category_graph -from Life -downsize 1.5 \
    -style 'graph[rankdir=BT ranksep=0.5] node[shape=circle style=filled \
    fillcolor=green] edge[style=dashed penwidth=3]'


.. versionadded:: 8.0
"""
#
# (C) Pywikibot team, 2022-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

import argparse
import glob
from collections import defaultdict
from contextlib import suppress
from pathlib import Path

import pywikibot
from pywikibot import config
from pywikibot.bot import SingleSiteBot, suggest_help


# pydot is optional at import time: on failure the ImportError instance
# itself is stored under the module's name so that main() can detect
# the missing dependency and report it instead of crashing on import.
try:
    import pydot
except ImportError as import_error:
    pydot = import_error


class CategoryGraphBot(SingleSiteBot):
    """Bot to create graph of the category structure.

    The category tree is walked recursively from a root category; each
    visited category becomes a node of a pydot graph and each
    category/subcategory relation becomes an edge. The result is saved
    as ``.gv``, ``.svg`` and ``.html`` files.
    """

    # Hard upper bound of scanned nodes; graphviz crashes on huge graphs.
    MAX_NODES = 10000
    # Above this number of nodes, standalone leaf chains are pruned.
    REDUCE_THRESHOLD = 1000

    @staticmethod
    def setup_args(ap):
        """Declare the script's command line arguments.

        :param ap: the argument parser the options are added to
        """
        # argparse.SUPPRESS keeps 'from' out of the namespace unless the
        # option is given, so a missing action can be detected by callers.
        ap.add_argument('-from', nargs='?', default=argparse.SUPPRESS)
        ap.add_argument('-to', nargs='?', default='')
        ap.add_argument('-style', nargs='?', default='')
        ap.add_argument('-depth', nargs='?', default=2)
        ap.add_argument('-downsize', nargs='?', default=4)

    def __init__(self, args: argparse.Namespace) -> None:
        """Initializer.

        Resolves the root category and the base file name (asking the
        user when ``?`` was given) and prepares an empty dot graph with
        the default style followed by the user supplied style.

        :param args: parsed command line options as declared by
            :meth:`setup_args`; the ``from`` option must be present
        """
        super().__init__()
        self.args = args
        # 'from' is a keyword, hence no attribute access
        cat_title = vars(args)['from']
        if not cat_title:
            cat_title = 'Main topic classifications'
        if cat_title == '?':
            cat_title = pywikibot.input(
                'For which category do you want to create a graph?')

        pywikibot.info(f'Scanning {cat_title!r}')
        self.cat = pywikibot.Category(self.site, cat_title)
        self.to = args.to
        if self.to == '?':
            self.to = pywikibot.input(
                'Please enter the name of the file '
                'where the tree should be saved,\n'
                'or press enter to use category name:')
        if not self.to:
            self.to = cat_title.replace(' ', '_')
        # node name -> names of its parents (reverse edges)
        self.rev = defaultdict(list)
        # node name -> names of its children (forward edges)
        self.fw = defaultdict(list)
        # names of nodes where the scan stopped
        self.leaves = set()
        # number of nodes added so far
        self.counter = 0
        font = 'fontname="Helvetica,Arial,sans-serif"'
        style = f'graph [rankdir=LR ranksep=2 concentrate=true {font}] ' \
                f'node [newrank=true shape=plaintext {font}] ' \
                f'edge [arrowhead=open labeldistance=3 ' \
                f'labelfontcolor="#00000080" {font}] ' + args.style
        self.dot = pydot.graph_from_dot_data(f'digraph {{{style}}}')[0]
        self.dot.set_name(f'"{cat_title}"')

    def scan_level(self, cat, level, hue=None) -> None:
        """Recursive function to fill dot graph.

        :param cat: the Category of the node we're currently opening.
        :param level: the current level in the tree, decreasing from
            depth to zero (for recursion), opposite of depth.
        :param hue: hue used for the edge colors of this branch; if
            None, a distinct hue is generated for each subcategory.
        """
        title = cat.title(with_ns=False)
        size = float(self.args.downsize) ** level
        subcats = sorted(cat.subcategories())
        # args.depth is a str when given on the command line but an int
        # by default; normalize it so the comparison with the integer
        # level behaves the same in both cases.
        at_top_level = level == int(self.args.depth)

        def make_node():
            """Create the graph node for the current category."""
            # NOTE(review): as rendered here this replace() looks like a
            # no-op; presumably the second literal is meant to be a
            # no-break space -- confirm against the repository file.
            subs = ', '.join([c.title(with_ns=False).replace(' ', ' ')
                              for c in subcats])
            return pydot.Node(title,
                              label=rf'"{title}\n{len(subcats)} C"',
                              tooltip=title + '\n\n' + subs,
                              URL=cat.full_url(),
                              fontsize=int(10 * size))

        def make_edge(n, subcat, h, columns):
            """Create the edge from the current category to *subcat*."""
            # distribute the graph to depth, except for the top level
            minlen = 1 if at_top_level else n % columns + 1
            subtitle = subcat.title(with_ns=False)
            return pydot.Edge(title,
                              subtitle,
                              tooltip=title + '  ⟶  ' + subtitle,
                              headlabel=title,
                              minlen=minlen,
                              penwidth=round(size / 2, 2),
                              arrowsize=round(size / 4, 2),
                              color=str(round(h, 2)) + ' 1 0.7',
                              labelfontsize=int(3 * size),
                              labelfontcolor=str(round(h, 2)) + ' 1 0.5')

        if config.verbose_output:
            pywikibot.info('Adding ' + cat.title(with_ns=False))

        node = make_node()
        self.dot.add_node(node)
        self.counter += 1
        if not level or self.counter >= self.MAX_NODES:
            # because graphviz crashes on huge graphs
            if self.counter == self.MAX_NODES:
                pywikibot.warning('Number of nodes reached limit')
            self.leaves.add(node.get_name())
            return

        columns = len(subcats) // 5 + 1
        for n, subcat in enumerate(subcats):
            # generating different hue for color per each root branch
            h = hue if hue is not None else (11 / 18 * n) % 1
            e = make_edge(n, subcat, h, columns)
            self.dot.add_edge(e)
            # repeat recursively
            self.scan_level(subcat, level - 1, h)
            # track graph's structure to reduce too big graph
            self.rev[e.get_destination()].append(e.get_source())
            self.fw[e.get_source()].append(e.get_destination())

    def run(self) -> None:
        """Main function of CategoryGraphBot.

        Scans the category tree, prunes the graph if it became too big
        and writes the ``.gv``, ``.svg`` and ``.html`` result files.
        """
        self.scan_level(self.cat, int(self.args.depth))
        # reduce too big graph
        if self.counter > self.REDUCE_THRESHOLD:
            pywikibot.warning('Removing standalone subcategories '
                              'because graph is too big')
            for leaf in self.leaves:
                current = leaf
                # climb up as long as the node has exactly one parent
                while len(self.rev[current]) == 1:
                    parent = self.rev[current][0]
                    if config.verbose_output:
                        pywikibot.info('Removing ' + current)

                    self.dot.del_edge(parent, current)
                    self.dot.del_node(current)
                    self.fw[parent].remove(current)
                    # stop once the parent still has other children
                    if self.fw[parent]:
                        break
                    current = parent

        pywikibot.info('Saving results')
        pywikibot.info(self.to + '.gv')
        self.dot.write(self.to + '.gv', encoding='utf-8')
        pywikibot.info(self.to + '.svg')
        self.dot.write_svg(self.to + '.svg', encoding='utf-8')
        pywikibot.info(self.to + '.html')

        header = ('<head><meta charset="UTF-8"/>'
                  '<title>' + self.cat.title(with_ns=False)
                  + '</title> </head>\n'
                  '<div style="position:absolute;">'
                  'Zoom and drag with mouse. '
                  'Nodes are links to Wikipedia.'
                  '</div>\n'
                  '<script '
                  'src="https://unpkg.com/panzoom@9.4.0/dist/panzoom.min.js" '
                  'query="#graph0" name="pz"></script>\n'
                  '<style> svg { height:100%; width:100%; } </style>\n')
        # the html file is the header followed by the inline svg
        with open(self.to + '.html', mode='wb') as o:
            o.write(header.encode())
            o.write(self.dot.create('dot', 'svg', encoding='utf-8'))


def main(*args: str) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    :param args: command line arguments
    """
    ap = argparse.ArgumentParser(add_help=False)
    CategoryGraphBot.setup_args(ap)
    # pass the given arguments on; they were silently ignored before
    local_args = pywikibot.handle_args(args)
    options, rest = ap.parse_known_args(local_args)

    if suggest_help(
        missing_action='from' not in options,
        unknown_parameters=rest,
        missing_dependencies=(['pydot'] if isinstance(pydot, ImportError)
                              else [])
    ):
        return

    # Create the bot first: it resolves the final base file name (the
    # default derived from the category name or the answer to "?").
    # Checking the raw option instead would glob for '.*' or '?.*' and
    # match unrelated files.
    bot = CategoryGraphBot(options)
    file_path = bot.to
    # If file exists, ask user if ok to overwrite. Otherwise, make
    # the file, including directories unless it is top level.
    if glob.glob(glob.escape(file_path) + '.*'):
        choice = pywikibot.input_yn(f'Files exist for {file_path}. Overwrite?',
                                    'n', automatic_quit=False)
        if not choice:
            pywikibot.info('Exiting...')
            return
    else:
        dir_path = Path(file_path)
        with suppress(FileNotFoundError):
            dir_path.parent.mkdir(parents=True, exist_ok=True)

    bot.run()


# Run the script only when executed directly, not when imported.
if __name__ == '__main__':
    main()