UnB-KnEDLe/DODFMiner

View on GitHub
dodfminer/run.py

Summary

Maintainability
A
3 hrs
Test Coverage
#!/usr/bin/env python3

"""Main application module.

Contains class miner which is an interface to handle all extraction tasks.

Usage example::

    dodfminer --help

"""


from dodfminer.cli import CLI
from dodfminer.downloader.core import Downloader
from dodfminer.extract.pure.core import ContentExtractor
from dodfminer.extract.polished.helper import extract_multiple_acts, extract_multiple_acts_parallel, \
    extract_multiple_acts_with_committee, xml_multiple


class Miner():
    """Main DODFMiner class.

    The Miner class is a interface that handles download, pre-extraction,
    and extraction of DODFs.

    Attributes:
        start_date: Argument start_date from CLI
        end_date: Argument end_date from CLI
        single: Argument single from CLI
        ext_content: Argument extract_content from CLI

    """

    def __init__(self):
        """Init Miner class to handle all the extraction process."""
        self.cli = CLI()
        self.args = self.cli.parse()


    def download(self):
        """Download PDFs with parameters from CLI."""
        downloader = Downloader(save_path=self.args.save_path)
        if(self.args.file_type == 'pdf'):
            downloader.pull(self.args.start_date, self.args.end_date)
        else:
            downloader._prog_bar.disable = True
            downloader.pull_json(self.args.url)


    def extract_content(self):
        """Extract Content from PDFs."""
        if self.args.single_file is None:
            if self.args.type_of_extr is not None:
                if self.args.type_of_extr == 'pure-text':
                    ContentExtractor.extract_to_txt(
                        folder=self.args.input_folder)
                elif self.args.type_of_extr == 'with-titles':
                    ContentExtractor.extract_to_json(folder=self.args.input_folder,
                                                     titles_with_boxes=True)
                elif self.args.type_of_extr == 'blocks':
                    ContentExtractor.extract_to_json(
                        folder=self.args.input_folder)
            elif self.args.act != 'all':
                if self.args.committee:
                    extract_multiple_acts_with_committee(self.args.input_folder, self.args.act, self.args.backend)
                elif self.args.number_of_processes is not None:
                    extract_multiple_acts_parallel(self.args.input_folder, self.args.act, self.args.backend, self.args.number_of_processes)
                else:
                    extract_multiple_acts(self.args.input_folder, self.args.act, self.args.backend)
            elif self.args.xml is not False:
                xml_multiple(self.args.input_folder, self.args.backend)
            else:
                self.cli.extract_content_parser.print_help()
        elif self.args.single_file is not None:
            self._extract_single_file()

    def _extract_single_file(self):
        if self.args.type_of_extr is not None:
            if self.args.type_of_extr == 'pure-text':
                ContentExtractor.extract_text(self.args.single_file,
                                              single=True)
            elif self.args.type_of_extr == 'with-titles':
                ContentExtractor.extract_structure(self.args.single_file,
                                                   single=True)
            elif self.args.type_of_extr == 'blocks':
                ContentExtractor.extract_text(
                    self.args.single_file, single=True, block=True)
        elif self.args.act != 'all':
            if self.args.committee:
                extract_multiple_acts_with_committee(self.args.single_file, self.args.act, self.args.backend)
            elif self.args.number_of_processes is not None:
                extract_multiple_acts_parallel(self.args.single_file, self.args.act, self.args.backend, self.args.number_of_processes)
            else:
                extract_multiple_acts(self.args.single_file, self.args.act, self.args.backend)
        elif self.args.xml is not False:
            xml_multiple(self.args.single_file, self.args.backend)
        else:
            self.cli.extract_content_parser.print_help()

    @classmethod
    def _log(cls, msg):
        print(f"[DODFMiner] {msg}")


def run():
    miner = Miner()
    if miner.args.subparser_name == 'downloader':
        miner.download()
    elif miner.args.subparser_name == 'extract':
        miner.extract_content()
    else:
        miner.cli.parser.print_help()


if __name__ == '__main__':
    run()