libratom/libratom

View on GitHub
libratom/scripts/download_edrm_zipped_files.py

Summary

Maintainability
A
0 mins
Test Coverage
#!/usr/bin/env python
# pylint: disable=missing-docstring,unused-argument

import logging
from pathlib import Path

import click
import click_log

from libratom.lib.download import download_files

# Let both -h and --help bring up the command's usage text.
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}

# The values below are taken from conftest.py.
# Base URL that the per-part ``<name>.zip`` archives are appended to.
ENRON_DATASET_URL = "https://www.ibiblio.org/enron/RevisedEDRMv1_Complete"
# Local directory handed to the downloader as the destination.
CACHED_ENRON_DATA_DIR = Path("/tmp/libratom/test_data/RevisedEDRMv1_Complete")
# EDRM part number -> archive base name (without the ".zip" suffix).
# Insertion order is significant: it determines the order of the CLI choices
# and of the URLs generated when the whole set is requested.
EDRM_PART_NAME_MAPPING = {
    1: "albert_meyers",
    2: "andrea_ring",
    3: "andrew_lewis",
    4: "andy_zipper",
    12: "chris_dorland",
    44: "jason_wolfe",
    129: "vkaminski",
}

# Configure the root logger through click_log at import time. This is the same
# logger whose level set_log_level_from_verbose() adjusts from the -v count.
# NOTE(review): the exact handler/formatter installed is decided by click_log —
# confirm against its documentation.
click_log.basic_config(logging.getLogger())


def set_log_level_from_verbose(ctx, param, value):
    """Option callback translating a ``-v`` repeat count into a log level.

    The chosen level is applied to the root logger and also returned.

    Args:
        ctx: Unused; present to satisfy the callback signature.
        param: Unused; present to satisfy the callback signature.
        value: Verbosity count. Greater than 1 selects DEBUG, exactly 1
            (anything greater than 0) selects INFO, otherwise WARNING.

    Returns:
        The numeric ``logging`` level that was set on the root logger.
    """
    # Ordered from most to least verbose; the first threshold exceeded wins.
    verbosity_thresholds = (
        (1, logging.DEBUG),
        (0, logging.INFO),
    )

    chosen = logging.WARNING  # default when no -v flag was given
    for threshold, candidate in verbosity_thresholds:
        if value > threshold:
            chosen = candidate
            break

    logging.getLogger().setLevel(chosen)
    return chosen


@click.command(
    context_settings=CONTEXT_SETTINGS,
    help=f"Download edrm files into {CACHED_ENRON_DATA_DIR}/",
)
@click.option(
    "-n",
    "--part-number",
    required=False,
    type=click.Choice([str(number) for number in EDRM_PART_NAME_MAPPING]),
    help="Download the given part number. If this is not provided, download the entire Enron dataset.",
)
@click.option(
    "-v",
    "--verbose",
    count=True,
    callback=set_log_level_from_verbose,
    help="Increase verbosity (can be repeated).",
    expose_value=False,
)
def download_edrm_zipped_files(part_number) -> None:
    """Hand the selected EDRM archive URL(s) to ``download_files``.

    Args:
        part_number: Part number as a string (one of the keys of
            EDRM_PART_NAME_MAPPING), or a falsy value to select every
            part listed in the mapping.

    The ``--verbose`` option is consumed by its callback and is not passed
    to this function (``expose_value=False``).
    """
    # Work out which archive names are wanted first, then build every URL
    # in a single place.
    if part_number:
        archive_names = [EDRM_PART_NAME_MAPPING[int(part_number)]]
    else:
        archive_names = list(EDRM_PART_NAME_MAPPING.values())

    urls = [f"{ENRON_DATASET_URL}/{name}.zip" for name in archive_names]

    download_files(urls, CACHED_ENRON_DATA_DIR, dry_run=False)


# Invoke the click command only when this file is run directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_edrm_zipped_files()