Borda/dataset-histology-landmarks
handlers/run_generate_landmarks.py
"""
Create consensus annotations from the given user annotations
and scale them to the particular scales used in the dataset.

The expected structure of the annotations is
ANNOTATIONS/<tissue>/<user>_scale-<number>pc/<csv-file>
The expected structure of the dataset is
DATASET/<tissue>/scale-<number>pc/<image-file>

EXAMPLE
-------
>> python run_generate_landmarks.py -a annotations -d dataset

>> python handlers/run_generate_landmarks.py \
    -a /datagrid/Medical/dataset_ANHIR/landmarks_annot \
    -d /datagrid/Medical/dataset_ANHIR/landmarks_all \
    --scales 2 5 10 15 20 25 50 100

Copyright (C) 2014-2019 Jiri Borovec <jiri.borovec@fel.cvut.cz>
"""


import argparse
import glob
import logging
import os
import sys
from functools import partial

import pandas as pd
from birl.utilities.experiments import iterate_mproc_map

sys.path += [os.path.abspath('.'), os.path.abspath('..')]  # Add path to root
from handlers.utilities import SCALES, TEMPLATE_FOLDER_SCALE, NB_THREADS
from handlers.utilities import (
    parse_args, create_folder_path, list_sub_folders, create_consensus_landmarks
)


def create_arg_parser():
    """ argument parser from cmd

    SEE: https://docs.python.org/3/library/argparse.html
    :return {str: ...}:
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--path_annots', type=str, required=False,
                        help='path to folder with annotations',
                        default='annotations')
    parser.add_argument('-d', '--path_dataset', type=str, required=False,
                        help='path to the output directory - dataset',
                        default='dataset')
    parser.add_argument('--scales', type=int, required=False, nargs='*',
                        help='generated scales for the dataset', default=SCALES)
    parser.add_argument('--consensus', type=str, required=False,
                        help='method for consensus landmarks',
                        choices=['mean', 'median'], default='mean')
    parser.add_argument('--nb_jobs', type=int, required=False,
                        help='number of processes in parallel',
                        default=NB_THREADS)
    return parser
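
# A minimal usage sketch (plain argparse behaviour; the entry point below goes
# through handlers.utilities.parse_args instead, which may add validation):
#
#   args = create_arg_parser().parse_args(['-a', 'annots', '--scales', '25', '50'])
#   assert args.scales == [25, 50] and args.consensus == 'mean'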


def generate_consensus_landmarks(path_set, path_dataset, tp_consensus='mean'):
    """ generate consensus landmarks for a particular image/landmark set

    :param str path_set: path to the set with annotations
    :param str path_dataset: output dataset path (root)
    :param str tp_consensus: type of consensus landmarks
    :return {str: dict}: set name mapped to landmark counts
    """
    path_annots = list_sub_folders(path_set, '*_scale-*pc')
    logging.debug('>> found annotations: %i', len(path_annots))

    dict_lnds, dict_lens = create_consensus_landmarks(path_annots,
                                                      method=tp_consensus)

    path_scale = os.path.join(path_dataset, os.path.basename(path_set),
                              TEMPLATE_FOLDER_SCALE % 100)
    create_folder_path(path_scale)
    for name in dict_lnds:
        dict_lnds[name].to_csv(os.path.join(path_scale, name))

    return {os.path.basename(path_set): dict_lens}
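
# For intuition: a consensus over per-user landmark tables is essentially an
# element-wise mean/median across aligned DataFrames. A minimal sketch,
# assuming identically indexed tables (the imported create_consensus_landmarks
# handles the general case of partially overlapping annotations):
#
#   dfs = [pd.read_csv(p, index_col=0) for p in csv_paths]
#   consensus = pd.concat(dfs).groupby(level=0).median()  # or .mean()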


def dataset_generate_landmarks(path_annots, path_dataset, tp_consensus='mean',
                               nb_jobs=NB_THREADS):
    """ generate consensus landmarks in full scale (100%)

    :param str path_annots: path to folder with annotations
    :param str path_dataset: output dataset path
    :param str tp_consensus: type of consensus landmarks
    :param nb_jobs: run parallel jobs
    :return [int]:
    """
    list_sets = list_sub_folders(path_annots)
    logging.info('Found sets: %i', len(list_sets))

    _wrap_lnds = partial(generate_consensus_landmarks, path_dataset=path_dataset,
                         tp_consensus=tp_consensus)
    counts = list(iterate_mproc_map(
        _wrap_lnds, sorted(list_sets), nb_workers=nb_jobs, desc='consensus landmarks'))
    return counts
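
# iterate_mproc_map (from BIRL) is used here as a progress-reporting parallel
# map; a rough single-process equivalent, for illustration only:
#
#   counts = [_wrap_lnds(path) for path in sorted(list_sets)]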


def scale_set_landmarks(path_set, scales=SCALES):
    """ scale given set with landmarks to particular scales

    :param str path_set: path to image/landmark set
    :param [int] scales: selected output scales
    :return {str: {int: int}}: set name mapped to counts of files written per scale
    """
    logging.debug('> processing: %s', path_set)
    path_scale100 = os.path.join(path_set, TEMPLATE_FOLDER_SCALE % 100)
    if not os.path.isdir(path_scale100):
        logging.error('missing base scale 100pc in "%s"', path_scale100)
        return
    list_csv = glob.glob(os.path.join(path_scale100, '*.csv'))
    logging.debug('>> found landmarks: %i', len(list_csv))
    dict_lnds = {os.path.basename(p): pd.read_csv(p, index_col=0)
                 for p in list_csv}
    set_scales = {}
    for sc in (s for s in scales if s != 100):  # skip the base scale, already written
        path_scale = os.path.join(path_set, TEMPLATE_FOLDER_SCALE % sc)
        create_folder_path(path_scale)
        for name in dict_lnds:
            df_scale = dict_lnds[name] * (sc / 100.)
            df_scale.to_csv(os.path.join(path_scale, name))
        set_scales[sc] = len(dict_lnds)
    dict_lens = {os.path.basename(path_set): set_scales}
    return dict_lens
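
# The scaling itself is plain coordinate multiplication: a landmark at
# (x=200, y=400) in the 100% frame lands at (x=50, y=100) for scale 25,
# since 25 / 100. == 0.25.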


def dataset_scale_landmarks(path_dataset, scales=SCALES, nb_jobs=NB_THREADS):
    """" scale whole dataset

    :param str path_dataset:
    :param [int] scales: selected output scales
    :param nb_jobs: run parallel jobs
    :return [int]:
    """
    list_sets = list_sub_folders(path_dataset)
    logging.info('Found sets: %i', len(list_sets))

    _wrap_scale = partial(scale_set_landmarks, scales=scales)
    counts = list(iterate_mproc_map(
        _wrap_scale, sorted(list_sets), nb_workers=nb_jobs, desc='scaling sets'))
    return counts


def main(path_annots, path_dataset, scales, consensus='mean', nb_jobs=NB_THREADS):
    """ generate consensus landmarks and scale them to all requested scales """
    count_gene = dataset_generate_landmarks(path_annots, path_dataset,
                                            tp_consensus=consensus, nb_jobs=nb_jobs)
    count_scale = dataset_scale_landmarks(path_dataset, scales=scales,
                                          nb_jobs=nb_jobs)
    return count_gene, count_scale
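
# Programmatic use (paths below are placeholders):
#
#   count_gene, count_scale = main(path_annots='annotations',
#                                  path_dataset='dataset',
#                                  scales=[25, 50, 100], consensus='median')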


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logging.info('running...')

    params = parse_args(create_arg_parser())
    main(**params)

    logging.info('DONE')