# nigroup/nideep - nideep/datasets/washington-rgbd/washington_rgbd.py
import os
import numpy as np
import cv2
import argparse
import shutil
import pandas as pd
import logging
from sklearn.model_selection import train_test_split


class WashingtonRGBD(object):
    """

    Data Wrapper class for WashingtonRGBD dataset
    Attributes
    -----------
    root_dir: root directory until the rgbd-dataset folder. For example: /mnt/raid/data/ni/dnn/pduy/rgbd-dataset
    csv_default: the default directory for loading/saving the csv description of the dataset
    csv_interpolated_default: the default directory for loading/saving the pose-interpolated csv description of the
    dataset.

    """

    def __init__(self, root_dir='', csv_default='', csv_perframe_default='', csv_interpolated_default=''):
        self.logger = logging.getLogger(__name__)
        self.root_dir = root_dir
        self.csv_default = csv_default
        self.csv_perframe_default = csv_perframe_default
        self.csv_interpolated_default = csv_interpolated_default

    # Load the dataset metadata into a Pandas dataframe and save the result to a csv file
    # if it does not exist; otherwise read the existing csv.
    # Missing pose values are saved as -1.
    def load_metadata(self):
        if os.path.isfile(self.csv_default):
            self.logger.info('reading from ' + self.csv_default)
            return pd.read_csv(self.csv_default)

        file_list = os.walk(self.root_dir)

        data = []

        for current_root, _, files in file_list:
            # The pose belongs to its corresponding data entry, not as a separate entry,
            # so pose files are filtered out here and read alongside each entry below
            files = [f for f in files if 'pose' not in f]

            for f in files:
                self.logger.info("processing " + f)

                pose_value = -1

                name_components = f.split('_')

                # The category name can be one or more words, such as 'apple' or 'cell_phone',
                # so splitting the file name by '_' yields 5 or more components.
                # Reading the components from the end guarantees the right pieces are picked.

                if len(name_components) < 5:
                    continue
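                # For example, a (hypothetical) file name 'cell_phone_1_2_100_crop.png' splits
                # into ['cell', 'phone', '1', '2', '100', 'crop.png'], which is parsed below as
                # category='cell_phone', instance=1, video=2, frame=100, data_type='crop'.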

                n_components = len(name_components)
                if n_components > 5:    # if n_components > 5, it means the category name has more than 1 word
                    category = '_'.join(name_components[0: n_components - 4])
                else:
                    category = name_components[0]

                instance_number = name_components[-4]
                video_no = name_components[-3]
                frame_no = name_components[-2]
                data_type = name_components[-1].split('.')[0]

                name_components[n_components - 1] = 'pose.txt'
                pose_file_name = '_'.join(name_components)

                try:
                    with open(os.path.join(current_root, pose_file_name), 'r') as pose_file:
                        pose_value = pose_file.readline()
                        self.logger.info("pose value = " + str(pose_value))
                except IOError:
                    self.logger.info("No pose value for this instance!")

                data.append({'location': os.path.join(current_root, f),
                             'category': category,
                             'instance_number': int(instance_number),
                             'video_no': int(video_no),
                             'frame_no': int(frame_no),
                             'pose': float(pose_value),
                             'data_type': data_type})

        data_frame = pd.DataFrame(data) \
            .sort_values(['data_type', 'category', 'instance_number', 'video_no', 'frame_no'])

        self.logger.info("csv saved to file: " + self.csv_default)
        data_frame.to_csv(self.csv_default, index=False)

        return data_frame

    # Interpolate the missing pose values (saved as -1 by the load_metadata() method).
    # Only every 5th frame carries an annotated pose; the frames in between are filled
    # in linearly using the per-frame step estimated at the start of each video.
    def interpolate_poses(self, data_frame):
        if os.path.isfile(self.csv_interpolated_default):
            self.logger.info('reading from ' + self.csv_interpolated_default)
            return pd.read_csv(self.csv_interpolated_default)

        self.logger.info('Interpolating ...')

        sorted_df = data_frame.sort_values(['data_type', 'category', 'instance_number', 'video_no', 'frame_no']) \
            .reset_index(drop=True)  # reset the index so positional access below matches the sorted order
        poses = np.array(sorted_df['pose'])

        unit_diff_angle = 0
        for i in range(len(poses)):
            if sorted_df.frame_no[i] == 1 and i + 5 < len(poses):
                distance = poses[i + 5] - poses[i]

                # in some cases the next annotated angle wraps past 360,
                # so normalize the difference into the range (-180, 180]
                if distance < -180:
                    distance = distance + 360
                elif distance > 180:
                    distance = distance - 360

                unit_diff_angle = distance * 1.0 / 5
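                # For example, poses[i] = 350 and poses[i + 5] = 10 gives distance = -340,
                # which normalizes to 20, i.e. a step of 4 degrees per frame.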

            elif poses[i] == -1:
                poses[i] = poses[i - 1] + unit_diff_angle
                if poses[i] > 360:
                    poses[i] = poses[i] - 360
                elif poses[i] < 0:
                    poses[i] = poses[i] + 360

        sorted_df['pose'] = poses
        sorted_df.to_csv(self.csv_interpolated_default, index=False)

        self.logger.info('Interpolation finished!')

        return sorted_df

    # Get a new dataframe where each row represents all information about 1 frame,
    # including the rgb and depth locations. Columns:
    # ['category', 'instance_number', 'video_no', 'frame_no', 'pose',
    #  'crop_location', 'depthcrop_location', 'maskcrop_location', 'loc_location']
    def aggregate_frame_data(self):
        if os.path.isfile(self.csv_perframe_default):
            return pd.read_csv(self.csv_perframe_default)

        raw_df = self.interpolate_poses(self.load_metadata())

        raw_rgb_df = raw_df[raw_df.data_type == 'crop']
        raw_depth_df = raw_df[raw_df.data_type == 'depthcrop']
        raw_maskcrop_df = raw_df[raw_df.data_type == 'maskcrop']
        raw_loc_df = raw_df[raw_df.data_type == 'loc']

        data = []

        for i in range(len(raw_rgb_df.index)):
            current_rgb_row = raw_rgb_df.iloc[[i]]

            current_category = current_rgb_row.category.values[0]
            current_instance_number = current_rgb_row.instance_number.values[0]
            current_video_no = current_rgb_row.video_no.values[0]
            current_frame_no = current_rgb_row.frame_no.values[0]
            current_pose = current_rgb_row.pose.values[0]

            current_crop_location = current_rgb_row.location.values[0]

            current_depthcrop_location = raw_depth_df[(raw_depth_df.category == current_category)
                                                      & (raw_depth_df.instance_number == current_instance_number)
                                                      & (raw_depth_df.video_no == current_video_no)
                                                      & (raw_depth_df.frame_no == current_frame_no)].location.values[0]

            try:
                current_maskcrop_location = raw_maskcrop_df[(raw_maskcrop_df.category == current_category)
                                                            & (raw_maskcrop_df.instance_number == current_instance_number)
                                                            & (raw_maskcrop_df.video_no == current_video_no)
                                                            & (raw_maskcrop_df.frame_no == current_frame_no)].location.values[0]
            except IndexError:
                current_maskcrop_location = ""

            current_loc_location = raw_loc_df[(raw_loc_df.category == current_category)
                                              & (raw_loc_df.instance_number == current_instance_number)
                                              & (raw_loc_df.video_no == current_video_no)
                                              & (raw_loc_df.frame_no == current_frame_no)].location.values[0]

            self.logger.info("processing " + os.path.split(current_crop_location)[1]
                             + " and " + os.path.split(current_depthcrop_location)[1]
                             + " and " + os.path.split(current_maskcrop_location)[1]
                             + " and " + os.path.split(current_loc_location)[1])

            data.append({
                'category': current_category,
                'instance_number': current_instance_number,
                'video_no': current_video_no,
                'frame_no': current_frame_no,
                'pose': current_pose,
                'crop_location': current_crop_location,
                'depthcrop_location': current_depthcrop_location,
                'maskcrop_location': current_maskcrop_location,
                'loc_location': current_loc_location
            })

        new_df = pd.DataFrame(data)
        new_df.to_csv(self.csv_perframe_default, index=False)
        return new_df

    # Add the location of the interpolated (filled) depth map as a new column,
    # 'filled_depthcrop_location', alongside the columns of aggregate_frame_data().
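    # A depth crop such as .../apple_1_1_1_depthcrop.png (hypothetical name) is expected
    # to have its filled counterpart at .../apple_1_1_1_depthcrop_filled.png.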
    def add_filled_depth_to_aggregated_data(self):
        aggregated_df = self.aggregate_frame_data()
        depth_locations = aggregated_df.depthcrop_location
        # replace the extension with '_filled.png' next to the original depth crop
        filled_depth_locations = [os.path.splitext(location)[0] + '_filled.png'
                                  for location in depth_locations]
        aggregated_df['filled_depthcrop_location'] = pd.Series(filled_depth_locations, index=aggregated_df.index)

        aggregated_df.to_csv(self.csv_perframe_default, index=False)
        return aggregated_df

    def extract_rgb_only(self, output_path):
        data_frame = self.load_metadata()
        rgb_files = data_frame[data_frame['data_type'] == 'crop']['location']

        for f in rgb_files:
            shutil.copy(os.path.join(self.root_dir, f), output_path)

    # Horizontally combine an rgb image with a rotated view of the same object into one image,
    # together with a train-test split for hold-out validation.
    # Only one elevation (video_no) is taken; the other elevations are ignored.
    # Layout: Left: RGB, (Middle: Depth Map), Right: Rotation.
    # split_method = {'random', 'eitel'}
    def combine_viewpoints(self, angle, video_no, should_include_depth, output_path, split_method='random'):

        def join(df, output_path):
            for i in range(len(df.index)):
                current_original_file_df = df.iloc[[i]]

                # Filter the rotation candidates: everything should match the original row
                # except frame_no, and the two poses should differ by the provided angle
                # within an error bound of +-1 degree
                rotation_candidates = df[(df.category == current_original_file_df.category.values[0])
                                         & (df.instance_number == current_original_file_df.instance_number.values[0])
                                         & (df.video_no == current_original_file_df.video_no.values[0])
                                         & (df.pose <= current_original_file_df.pose.values[0] + angle + 1)
                                         & (df.pose >= current_original_file_df.pose.values[0] + angle - 1)]

                for j in range(len(rotation_candidates.index)):
                    current_rotated_file_df = rotation_candidates.iloc[[j]]

                    locations = []
                    names = []

                    locations.append(current_original_file_df.crop_location.values[0])
                    names.append(os.path.split(locations[0])[1])

                    if should_include_depth:
                        locations.append(current_original_file_df.depthcrop_location.values[0])
                        names.append(os.path.split(locations[1])[1])

                    locations.append(current_rotated_file_df.crop_location.values[0])
                    names.append(os.path.split(locations[2])[1])

                    self.logger.info("merging " + " and ".join(names))
                    self.perform_cv_combination(locations, names, output_path)

        data_frame = self.aggregate_frame_data()

        # Filter out one elevation only
        data_frame = data_frame[data_frame['video_no'] == video_no]

        # train test split
        train, test = train_test_split(data_frame, test_size=0.2) if split_method == 'random' \
            else self.train_test_split_eitel(data_frame)

        # construct training and test sets, saving to disk
        join(train, os.path.join(output_path, 'train'))
        join(test, os.path.join(output_path, 'test'))

    # This method combines every rgb frame with its depth map on the right.
    # split_method: 'random' or 'eitel' - the method used in Eitel et al.
    # https://arxiv.org/abs/1507.06821
    def combine_rgb_depth(self, output_path, split_method='random'):
        def join(df, output_path):
            for i in range(len(df.index)):
                current_row = df.iloc[[i]]
                locations = [current_row.crop_location.values[0], current_row.depthcrop_location.values[0]]
                names = [os.path.split(location)[1] for location in locations]

                self.perform_cv_combination(locations, names, output_path)

        df = self.aggregate_frame_data()

        train, test = train_test_split(df, test_size=0.2) if split_method == 'random' \
            else self.train_test_split_eitel(df)

        join(train, os.path.join(output_path, 'train'))
        join(test, os.path.join(output_path, 'test'))

    # Concatenate all images in `locations` horizontally into a single image and save it to output_path
    @staticmethod
    def perform_cv_combination(locations, names, output_path):
        if output_path != '' and not os.path.isdir(output_path):
            os.makedirs(output_path)

        # cv2.imread returns None for unreadable files, so the caller must pass valid image paths
        imgs = [cv2.imread(location) for location in locations]

        min_height = min(img.shape[0] for img in imgs)
        min_width = min(img.shape[1] for img in imgs)

        # resize everything to the smallest common size so the horizontal concatenation lines up
        imgs = [cv2.resize(img, (min_width, min_height)) for img in imgs]

        img = np.concatenate(imgs, axis=1)
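        # The output name joins the stems of all inputs and keeps the first extension, e.g.
        # (hypothetical names) ['a_crop.png', 'a_depthcrop.png'] -> 'a_crop_a_depthcrop.png'.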
        cv2.imwrite(os.path.join(output_path,
                                 '_'.join([os.path.splitext(name)[0] for name in names])
                                 + os.path.splitext(names[0])[1]),
                    img)

    # Train-test split following Eitel et al.: one whole instance per category is held
    # out for test, and every 5th frame of the remaining instances also goes to test
    @staticmethod
    def train_test_split_eitel(train_info_df, seed=1000, train_output='', test_output=''):
        if train_output != '' and not os.path.isdir(os.path.split(train_output)[0]):
            os.makedirs(os.path.split(train_output)[0])
        if test_output != '' and not os.path.isdir(os.path.split(test_output)[0]):
            os.makedirs(os.path.split(test_output)[0])

        np.random.seed(seed)
        categories = np.unique(train_info_df.category)

        test_df = pd.DataFrame(columns=list(train_info_df))
        train_df = pd.DataFrame(columns=list(train_info_df))
        for category in categories:
            # Select 1 instance at random and insert the whole instance into the test set
            category_df = train_info_df[train_info_df.category == category]
            max_instance = np.max(category_df.instance_number)
            test_instance = np.random.randint(max_instance) + 1
            test_df = pd.concat([test_df, category_df[category_df.instance_number == test_instance]])

            # For the rest, select every 5th frame for test, rest for training
            training_instances_df = category_df[category_df.instance_number != test_instance]
            train_df = pd.concat([train_df, training_instances_df[training_instances_df.frame_no % 5 != 1]])
            test_df = pd.concat([test_df, training_instances_df[training_instances_df.frame_no % 5 == 1]])

        if train_df.shape[0] + test_df.shape[0] == train_info_df.shape[0]:
            if train_output != '':
                train_df.to_csv(train_output)
            if test_output != '':
                test_df.to_csv(test_output)

            return train_df, test_df
        else:
            raise ValueError('Train and Test do not sum up to the original dataframe')

    # Create train and test CSVs where each frame is paired with a frame rotated by
    # roughly 20 degrees (rotation_index=10 frames ahead, wrapping around the video)
    def train_test_split_eitel_stereo_rgb(self, seed=1000, rotation_index=10, train_output='', test_output=''):
        self.logger.info('Splitting train/test for stereo rgb')
        # add_filled_depth_to_aggregated_data() guarantees that the
        # 'filled_depthcrop_location' column used below exists
        washington_aggregated_df = self.add_filled_depth_to_aggregated_data()

        dicts = []
        categories = washington_aggregated_df.category.unique()
        for cat in categories:
            items_in_cat = washington_aggregated_df[washington_aggregated_df.category == cat]
            instances = items_in_cat.instance_number.unique()
            for ins in instances:
                items_in_ins = items_in_cat[items_in_cat.instance_number == ins]
                vids = items_in_ins.video_no.unique()
                for vid in vids:
                    items_in_vid = items_in_ins[items_in_ins.video_no == vid] \
                        .sort_values('frame_no')[['crop_location', 'filled_depthcrop_location', 'frame_no', 'pose']]
                    rotated_items = pd.concat([items_in_vid.iloc[rotation_index:],
                                               items_in_vid.iloc[:rotation_index]])
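                    # For example, with rotation_index=10 the frames [f1, f2, ..., fN] are
                    # paired with [f11, ..., fN, f1, ..., f10], so every frame gets a partner.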

                    for item_index in range(rotated_items.shape[0]):
                        original_item = items_in_vid.iloc[item_index]
                        rotated_item = rotated_items.iloc[item_index]

                        if original_item.pose > rotated_item.pose:
                            (original_item, rotated_item) = (rotated_item, original_item)

                        dicts.append({'category': cat,
                                      'instance_number': int(ins),
                                      'video_no': int(vid),
                                      'frame_no': int(original_item.frame_no),
                                      'rgb_original_path': original_item.crop_location,
                                      'depth_original_path': original_item.filled_depthcrop_location,
                                      'rgb_target_path': rotated_item.crop_location,
                                      'depth_target_path': rotated_item.filled_depthcrop_location})

        input_df = pd.DataFrame(dicts)

        if train_output != '' and not os.path.isdir(os.path.split(train_output)[0]):
            os.makedirs(os.path.split(train_output)[0])
        if test_output != '' and not os.path.isdir(os.path.split(test_output)[0]):
            os.makedirs(os.path.split(test_output)[0])

        np.random.seed(seed)
        categories = np.unique(input_df.category)

        test_df = pd.DataFrame(columns=list(input_df))
        train_df = pd.DataFrame(columns=list(input_df))
        for category in categories:
            # Select 1 instance at random and insert the whole instance into the test set
            category_df = input_df[input_df.category == category]
            max_instance = np.max(category_df.instance_number)
            test_instance = np.random.randint(max_instance) + 1
            test_df = pd.concat([test_df, category_df[category_df.instance_number == test_instance]])

            # For the rest, select every 5th frame for test, rest for training
            training_instances_df = category_df[category_df.instance_number != test_instance]
            train_df = pd.concat([train_df, training_instances_df[training_instances_df.frame_no % 5 != 1]])
            test_df = pd.concat([test_df, training_instances_df[training_instances_df.frame_no % 5 == 1]])

        if train_df.shape[0] + test_df.shape[0] == input_df.shape[0]:
            if train_output != '':
                train_df.to_csv(train_output, index=False)
            if test_output != '':
                test_df.to_csv(test_output, index=False)

            return train_df, test_df
        else:
            raise ValueError('Train and Test do not sum up to the original dataframe')



if __name__ == '__main__':
    ROOT_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset'
    CSV_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/rgbd-dataset.csv'
    CSV_AGGREGATED_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/rgbd-dataset-interpolated-aggregated.csv'
    CSV_INTERPOLATED_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/rgbd-dataset-interpolated.csv'
    CSV_EITEL_TRAIN_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/eitel-train.csv'
    CSV_EITEL_TEST_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/eitel-test.csv'
    CSV_EITEL_TRAIN_STEREO_RGB_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/eitel-train-stereo-rgb.csv'
    CSV_EITEL_TEST_STEREO_RGB_DEFAULT = '/mnt/raid/data/ni/dnn/pduy/rgbd-dataset/eitel-test-stereo-rgb.csv'

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("--rootdir", default=ROOT_DEFAULT)
    parser.add_argument("--csv_dir", default=CSV_DEFAULT)
    parser.add_argument("--csv_perframe_dir", default=CSV_AGGREGATED_DEFAULT)
    parser.add_argument("--csv_interpolated_dir", default=CSV_INTERPOLATED_DEFAULT)
    parser.add_argument("--processed_data_output", default='')
    parser.add_argument("--angle", default=10, type=int)
    parser.add_argument("--depth_included", default=False, type=bool)

    args = parser.parse_args()

    if args.processed_data_output != '' and not os.path.isdir(args.processed_data_output):
        os.makedirs(args.processed_data_output)

    washington_dataset = WashingtonRGBD(root_dir=args.rootdir,
                                        csv_default=args.csv_dir,
                                        csv_perframe_default=args.csv_perframe_dir,
                                        csv_interpolated_default=args.csv_interpolated_dir)

    washington_dataset.train_test_split_eitel_stereo_rgb(train_output=CSV_EITEL_TRAIN_STEREO_RGB_DEFAULT,
                                                         test_output=CSV_EITEL_TEST_STEREO_RGB_DEFAULT)
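
    # Other entry points provided by this class (illustrative calls, commented out;
    # video_no=1 is an arbitrary example elevation):
    #   washington_dataset.combine_rgb_depth(args.processed_data_output, split_method='eitel')
    #   washington_dataset.combine_viewpoints(angle=args.angle,
    #                                         video_no=1,
    #                                         should_include_depth=args.depth_included,
    #                                         output_path=args.processed_data_output)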