# dodfminer/cli.py
"""Command-line input handler module.

Typical usage example:

    args = CLI().parse()
"""
from argparse import ArgumentParser
from dodfminer.__version__ import __version__
# Act identifiers accepted by the ``-a/--act`` option of the ``extract``
# subcommand (passed to argparse as ``choices``).
act_choices = [
    "aposentadoria",
    "reversoes",
    "nomeacao",
    "exoneracao",
    "abono",
    "retificacoes",
    "substituicao",
    "cessoes",
    "sem_efeito_aposentadoria",
    "efetivos_nome",
    "efetivos_exo",
    "sem_efeito_exo_nom",
    "efetivos_ret",
    "comissionados_ret",
    "contrato_convenio",
    "aditamento",
    "licitacao",
    "suspensao",
    "anulacao_revogacao",
    "contrato",
    "convenio",
]
class CLI():
    """Define and parse the DODFMiner command-line interface.

    The top-level parser exposes two subcommands, ``downloader`` and
    ``extract``. They are registered lazily by :meth:`parse`, so a freshly
    constructed instance only holds the bare top-level parser.

    Attributes:
        parser (:obj:`ArgumentParser`): The top-level ArgumentParser.
        subparsers: Subparser factory attached to ``parser``; each subparser
            is like a standalone application. The selected subcommand is
            stored in the parsed namespace as ``subparser_name``.
        def_start_date (str): Default start date for downloads ('01/2019').
        def_end_date (str): Default end date for downloads ('01/2019').
        save_path (str): Default output folder for downloads ('./').
        file_type (str): Default file type to download ('pdf').
        url (str): Default URL the JSON file is downloaded from.
        download_parser: The ``downloader`` subparser, or None until it has
            been registered.
        extract_content_parser: The ``extract`` subparser, or None until it
            has been registered.
    """

    def __init__(self):
        """Init CLI class with default values."""
        desc = ("Data extractor of PDF documents from the Official Gazette\n"
                "of the Federal District, Brazil.")
        epilog = f'© Copyright 2020, KnEDLe Team. Version {__version__}'

        self.parser = ArgumentParser(prog="DODFMiner", description=desc,
                                     epilog=epilog)
        self.subparsers = self.parser.add_subparsers(dest='subparser_name')

        # Defaults used by the ``downloader`` subcommand.
        self.def_start_date = '01/2019'
        self.def_end_date = '01/2019'
        self.save_path = './'
        self.file_type = "pdf"
        self.url = 'https://www.dodf.df.gov.br/index/jornal-json'

        # Filled in by _download_parser() / _extract_content_parser().
        self.download_parser = None
        self.extract_content_parser = None

    @classmethod
    def _new_group(cls, name, subparser):
        """Create new argument group.

        Args:
            name: Name of the group.
            subparser: The subparser the group is added to.

        Returns:
            The argparse group created.
        """
        return subparser.add_argument_group(name)

    @staticmethod
    def _parse_bool(value):
        """Convert a command-line token into a real boolean.

        ``bool`` itself cannot be used as an argparse ``type`` because every
        non-empty string (including ``"false"``) is truthy.

        Args:
            value (str): Token given on the command line.

        Returns:
            bool: True for true/t/yes/y/1, False for false/f/no/n/0
            (case-insensitive, surrounding whitespace ignored).

        Raises:
            argparse.ArgumentTypeError: If the token is not recognised.
        """
        # Local import: the module only imports ArgumentParser at file level.
        from argparse import ArgumentTypeError

        token = str(value).strip().lower()
        if token in ('true', 't', 'yes', 'y', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', '0'):
            return False
        raise ArgumentTypeError(f'invalid boolean value: {value!r}')

    def _download_parser(self):
        """Create the ``downloader`` subparser and its options."""
        self.download_parser = self.subparsers.add_parser("downloader")

        help_text = 'File type to download.'
        self.download_parser.add_argument('-f', '--file_type', dest='file_type',
                                          default=self.file_type, type=str,
                                          choices=['pdf', 'json'], help=help_text)

        help_text = 'Folder to output the download DODFs'
        self.download_parser.add_argument('-sp', '--save_path', dest='save_path',
                                          default=self.save_path, type=str,
                                          help=help_text)

        help_text = 'Input the date in either mm/yyyy or mm-yyyy.'
        self.download_parser.add_argument('-sd', '--start_date', dest='start_date',
                                          default=self.def_start_date, type=str,
                                          help=help_text)

        help_text = 'Input the date in either mm/yyyy or mm-yyyy.'
        self.download_parser.add_argument('-ed', '--end_date', dest='end_date',
                                          default=self.def_end_date, type=str,
                                          help=help_text)

        help_text = 'URL to download JSON file from.'
        self.download_parser.add_argument('-url', dest='url', default=self.url,
                                          type=str, help=help_text)

    def _extract_content_parser(self):
        """Create the ``extract`` subparser and its options."""
        self.extract_content_parser = self.subparsers.add_parser("extract")

        group = self._new_group('Extraction Configs',
                                self.extract_content_parser)

        group.add_argument('-i', '--input-folder', dest='input_folder',
                           default='./', type=str,
                           help='Path to the PDFs folder')

        group.add_argument('-s', '--single-file', dest='single_file', type=str,
                           default=None,
                           help='Path to the single file to extract')

        group.add_argument('-t', '--type-of-extraction', dest='type_of_extr',
                           default=None, type=str, nargs='?',
                           choices=['pure-text', 'blocks', 'with-titles'],
                           help="Type of text extraction")

        # The default stays the string 'all' (not a list); when the option is
        # given, the parsed value is a list of the chosen acts.
        group.add_argument('-a', '--act', dest='act', default='all', type=str,
                           choices=act_choices, nargs='*',
                           help='Which acts to extract to CSV')

        group.add_argument('-b', '--backend', dest='backend', default='regex',
                           type=str, choices=['regex', 'ner'],
                           help="The backend to be used in CSV extraction")

        group.add_argument('-c', '--committee', dest='committee', action='store_true',
                           help="Use committee classification for acts")

        # A bare ``-x`` yields True (const); ``-x true`` / ``-x false`` are
        # parsed explicitly; omitting the flag yields False. The previous
        # ``type=bool, nargs='*'`` produced an empty (falsy) list for a bare
        # ``-x`` and [True] for ``-x false``.
        group.add_argument('-x', '--xml', dest='xml', default=False, nargs='?',
                           const=True, type=self._parse_bool,
                           help="Generate TeamTat XML Annotations")

        group.add_argument('-p', '--number-of-processes',
                           type=int, help='Number of processes for extraction')

    def get_parser(self):
        """Return the top-level parser.

        The subcommands are only attached once :meth:`parse` (or the private
        builder methods) has run.

        Returns:
            The top-level ArgumentParser held by this instance.
        """
        return self.parser

    def parse(self, args=None):
        """Create the subparsers (once) and parse the arguments.

        Args:
            args (list of str, optional): Argument strings to parse. When
                None (the default) argparse reads them from ``sys.argv``.

        Returns:
            The cli arguments parsed.
        """
        # Register each subcommand only once so that calling parse() again on
        # the same instance does not try to add a duplicate subparser.
        if self.download_parser is None:
            self._download_parser()
        if self.extract_content_parser is None:
            self._extract_content_parser()
        return self.parser.parse_args(args)