ptp/components/tasks/image_text_to_class/clevr.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) IBM Corporation 2019
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = "Tomasz Kornuta"
import os
import json
from PIL import Image
import torch
from torchvision import transforms
from ptp.components.tasks.task import Task
from ptp.data_types.data_definition import DataDefinition
#from ptp.components.mixins.io import save_nparray_to_csv_file
from ptp.configuration.config_parsing import get_value_from_dictionary, get_value_list_from_dictionary
from ptp.configuration.configuration_error import ConfigurationError
class CLEVR(Task):
"""
    Task providing data associated with the CLEVR (Compositional Language and Elementary Visual Reasoning) diagnostic dataset.
    The dataset consists of:
- A training set of 70,000 images and 699,989 questions
- A validation set of 15,000 images and 149,991 questions
- A test set of 15,000 images and 14,988 questions
- Answers for all train and val questions
- Scene graph annotations for train and val images giving ground-truth locations, attributes, and relationships for objects
- Functional program representations for all training and validation images
    CLEVR contains a total of 90 question families, each with a single program template and an average of four text templates.
Those are further aggregated into 13 Question Types:
- Querying attributes (Size, Color, Material, Shape)
- Comparing attributes (Size, Color, Material, Shape)
- Existence
- Counting
- Integer comparison (Equal, Less, More)
    For more details please refer to the associated _website or _paper.
    .. _website: https://cs.stanford.edu/people/jcjohns/clevr/
    .. _paper: https://arxiv.org/pdf/1612.06890
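
    An illustrative configuration entry, limited to the keys read by this task (the values below, including the data path, are example assumptions only; ``resize_image`` is optional)::

        data_folder: '~/data/CLEVR_v1.0'
        split: training
        stream_images: True
        resize_image: [224, 224]
        image_preprocessing: normalize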
"""
def __init__(self, name, config):
"""
        Initializes the task object. Calls the base constructor and loads the question file adequate for the given split.
        :param name: Name of the component.
        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
"""
# Call constructors of parent classes.
Task.__init__(self, name, CLEVR, config)
# Get key mappings of all output streams.
self.key_images = self.stream_keys["images"]
self.key_image_ids = self.stream_keys["image_ids"]
self.key_questions = self.stream_keys["questions"]
self.key_answers = self.stream_keys["answers"]
self.key_question_type_ids = self.stream_keys["question_type_ids"]
self.key_question_type_names = self.stream_keys["question_type_names"]
# Get flag informing whether we want to stream images or not.
self.stream_images = self.config['stream_images']
# Check the resize image option.
if "resize_image" in self.config:
if len(self.config['resize_image']) != 2:
self.logger.error("'resize_image' field must contain 2 values: the desired height and width")
exit(-1)
# Output image dimensions.
self.height = self.config['resize_image'][0]
self.width = self.config['resize_image'][1]
self.depth = 3
resize = True
else:
            # Use the original CLEVR image dimensions (480 x 320 pixels, i.e. width x height).
            self.height = 320
            self.width = 480
self.depth = 3
resize = False
self.logger.info("Setting image size to [D x H x W]: {} x {} x {}".format(self.depth, self.height, self.width))
        # Set global variables - all dimensions except the batch dimension.
self.globals["image_height"] = self.height
self.globals["image_width"] = self.width
self.globals["image_depth"] = self.depth
# Get image preprocessing.
self.image_preprocessing = get_value_list_from_dictionary(
"image_preprocessing", self.config,
'none | normalize | all'.split(" | ")
)
if 'none' in self.image_preprocessing:
self.image_preprocessing = []
if 'all' in self.image_preprocessing:
self.image_preprocessing = ['normalize']
if resize:
# Add resize as transformation.
self.image_preprocessing = ["resize"] + self.image_preprocessing
self.logger.info("Applied image preprocessing: {}".format(self.image_preprocessing))
# Mapping of question subtypes to types (not used, but keeping it just in case).
#self.question_subtype_to_type_mapping = {
# 'query_size': 'query_attribute',
# 'equal_size': 'compare_attribute',
# 'query_shape': 'query_attribute',
# 'query_color': 'query_attribute',
# 'greater_than': 'compare_integer',
# 'equal_material': 'compare_attribute',
# 'equal_color': 'compare_attribute',
# 'equal_shape': 'compare_attribute',
# 'less_than': 'compare_integer',
# 'count': 'count',
# 'exist': 'exist',
# 'equal_integer': 'compare_integer',
# 'query_material': 'query_attribute'}
# Mapping of question subtypes to types.
self.question_subtype_to_id_mapping = {
'query_size': 0,
'equal_size': 1,
'query_shape': 2,
'query_color': 3,
'greater_than': 4,
'equal_material': 5,
'equal_color': 6,
'equal_shape': 7,
'less_than': 8,
'count': 9,
'exist': 10,
'equal_integer': 11,
'query_material': 12}
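        # The 13 subtype ids above correspond to the 13 question types listed in the class docstring.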
# Mapping of question families to subtypes.
self.question_family_id_to_subtype_mapping = {
0: "equal_integer", 1: "less_than", 2: "greater_than", 3: "equal_integer", 4: "less_than", 5: "greater_than", 6: "equal_integer", 7: "less_than", 8: "greater_than", 9: "equal_size",
10: "equal_color", 11: "equal_material", 12: "equal_shape", 13: "equal_size", 14: "equal_size", 15: "equal_size", 16: "equal_color", 17: "equal_color", 18: "equal_color", 19: "equal_material",
20: "equal_material", 21: "equal_material", 22: "equal_shape", 23: "equal_shape", 24: "equal_shape", 25: "count", 26: "exist", 27: "query_size", 28: "query_shape", 29: "query_color",
30: "query_material", 31: "count", 32: "query_size", 33: "query_color", 34: "query_material", 35: "query_shape", 36: "exist", 37: "exist", 38: "exist", 39: "exist",
40: "count", 41: "count", 42: "count", 43: "count", 44: "exist", 45: "exist", 46: "exist", 47: "exist", 48: "count", 49: "count",
50: "count", 51: "count", 52: "query_color", 53: "query_material", 54: "query_shape", 55: "query_size", 56: "query_material", 57: "query_shape", 58: "query_size", 59: "query_color",
60: "query_shape", 61: "query_size", 62: "query_color", 63: "query_material", 64: "count", 65: "count", 66: "count", 67: "count", 68: "count", 69: "count",
70: "count", 71: "count", 72: "count", 73: "exist", 74: "query_size", 75: "query_color", 76: "query_material", 77: "query_shape", 78: "count", 79: "exist",
80: "query_size", 81: "query_color", 82: "query_material", 83: "query_shape", 84: "count", 85: "exist", 86: "query_shape", 87: "query_material", 88: "query_color", 89: "query_size"}
# Finally, "merge" those two.
self.question_family_id_to_subtype_id_mapping = { key: self.question_subtype_to_id_mapping[value] for key, value in self.question_family_id_to_subtype_mapping.items() }
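        # E.g. question family 25 maps to subtype "count", which in turn maps to question type id 9.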
# Get the absolute path.
self.data_folder = os.path.expanduser(self.config['data_folder'])
# Get split.
split = get_value_from_dictionary('split', self.config, "training | validation | test | cogent_a_training | cogent_a_validation | cogent_b_validation".split(" | "))
# Set split-dependent data.
if split == 'training':
# Training split folder and file with data question.
data_file = os.path.join(self.data_folder, "questions", 'CLEVR_train_questions.json')
self.split_image_folder = os.path.join(self.data_folder, "images", "train")
elif split == 'validation':
# Validation split folder and file with data question.
data_file = os.path.join(self.data_folder, "questions", 'CLEVR_val_questions.json')
self.split_image_folder = os.path.join(self.data_folder, "images", "val")
elif split == 'test':
# Test split folder and file with data question.
data_file = os.path.join(self.data_folder, "questions", 'CLEVR_test_questions.json')
self.split_image_folder = os.path.join(self.data_folder, "images", "test")
else: # cogent
raise ConfigurationError("Split {} not supported yet".format(split))
# Load dataset.
self.dataset = self.load_dataset(data_file)
# Display exemplary sample.
i = 0
sample = self.dataset[i]
# Check if this is a test set.
if "answer" not in sample.keys():
sample["answer"] = "<UNK>"
sample[self.key_question_type_ids] = -1
sample[self.key_question_type_names] = "<UNK>"
else:
sample[self.key_question_type_ids] = self.question_family_id_to_subtype_id_mapping[sample["question_family_index"]]
sample[self.key_question_type_names] = self.question_family_id_to_subtype_mapping[sample["question_family_index"]]
self.logger.info("Exemplary sample {} ({}):\n question_type: {} ({})\n image_ids: {}\n question: {}\n answer: {}".format(
i, sample["question_index"],
sample[self.key_question_type_ids],
sample[self.key_question_type_names],
sample["image_filename"],
sample["question"],
sample["answer"]
))
def output_data_definitions(self):
"""
        Function returns a dictionary with definitions of output data produced by the component.
:return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
"""
# Add all "standard" streams.
d = {
self.key_indices: DataDefinition([-1, 1], [list, int], "Batch of sample indices [BATCH_SIZE] x [1]"),
self.key_image_ids: DataDefinition([-1, 1], [list, str], "Batch of image names, each being a single word [BATCH_SIZE] x [STRING]"),
self.key_question_type_ids: DataDefinition([-1], [torch.Tensor], "Batch of target question type indices, each being a single index [BATCH_SIZE]"),
self.key_question_type_names: DataDefinition([-1, 1], [list, str], "Batch of target question type names, each being a single word [BATCH_SIZE] x [STRING]"),
}
# Return images only when required.
if self.stream_images:
d[self.key_images] = DataDefinition([-1, self.depth, self.height, self.width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE_WIDTH]")
# Add stream with questions.
d[self.key_questions] = DataDefinition([-1, 1], [list, str], "Batch of questions, each being a string consisting of many words [BATCH_SIZE] x [STRING]")
# Add stream with answers.
        d[self.key_answers] = DataDefinition([-1, 1], [list, str], "Batch of target answers, each being a string consisting of a single word (label) [BATCH_SIZE] x [STRING]")
return d
def __len__(self):
"""
Returns the "size" of the "task" (total number of samples).
:return: The size of the task.
"""
return len(self.dataset)
def load_dataset(self, source_data_file):
"""
        Loads the dataset from the source file.
        :param source_data_file: JSON file with image ids, questions, answers, scene graphs, etc.
"""
self.logger.info("Loading dataset from:\n {}".format(source_data_file))
dataset = []
with open(source_data_file) as f:
self.logger.info("Loading samples from '{}'...".format(source_data_file))
dataset = json.load(f)['questions']
self.logger.info("Loaded dataset consisting of {} samples".format(len(dataset)))
return dataset
def get_image(self, img_id):
"""
        Loads and returns a single image.
        Additionally, it applies all the required transformations.
        :param img_id: Identifier (filename) of the image.
        :return: image (Tensor)
"""
# Load the image.
img = Image.open(os.path.join(self.split_image_folder, img_id)).convert('RGB')
image_transformations_list = []
# Optional: resize.
if 'resize' in self.image_preprocessing:
image_transformations_list.append(transforms.Resize([self.height,self.width]))
# Add obligatory transformation.
image_transformations_list.append(transforms.ToTensor())
# Optional: normalization.
if 'normalize' in self.image_preprocessing:
# Use normalization that the pretrained models from TorchVision require.
image_transformations_list.append(transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
        # Compose all transformations into a single one.
transforms_com = transforms.Compose(image_transformations_list)
# Apply transformations.
img = transforms_com(img)
# Return image.
return img
def __getitem__(self, index):
"""
Getter method to access the dataset and return a single sample.
:param index: index of the sample to return.
:type index: int
        :return: DataStreams({'indices', 'images', 'image_ids', 'questions', 'answers', 'question_type_ids', 'question_type_names'})
"""
# Get item.
item = self.dataset[index]
# Create the resulting sample (data dict).
data_streams = self.create_data_streams(index)
# Load and stream the image ids.
img_id = item["image_filename"]
data_streams[self.key_image_ids] = img_id
# Load the adequate image - only when required.
if self.stream_images:
img = self.get_image(img_id)
# Image related variables.
data_streams[self.key_images] = img
# Return question.
data_streams[self.key_questions] = item["question"]
# Return answer.
if "answer" in item.keys():
data_streams[self.key_answers] = item["answer"]
else:
data_streams[self.key_answers] = "<UNK>"
# Question type related variables.
if "question_family_index" in item.keys():
data_streams[self.key_question_type_ids] = self.question_family_id_to_subtype_id_mapping[item["question_family_index"]]
data_streams[self.key_question_type_names] = self.question_family_id_to_subtype_mapping[item["question_family_index"]]
else:
data_streams[self.key_question_type_ids] = -1
data_streams[self.key_question_type_names] = "<UNK>"
# Return sample.
return data_streams
def collate_fn(self, batch):
"""
Combines a list of DataStreams (retrieved with :py:func:`__getitem__`) into a batch.
:param batch: list of individual samples to combine
:type batch: list
        :return: DataStreams({'indices', 'images', 'image_ids', 'questions', 'answers', 'question_type_ids', 'question_type_names'})
"""
# Collate indices.
data_streams = self.create_data_streams([sample[self.key_indices] for sample in batch])
        # Collate image ids (filenames).
data_streams[self.key_image_ids] = [item[self.key_image_ids] for item in batch]
if self.stream_images:
data_streams[self.key_images] = torch.stack([item[self.key_images] for item in batch]).type(torch.FloatTensor)
        # Collate questions and answers (lists of strings).
data_streams[self.key_questions] = [item[self.key_questions] for item in batch]
data_streams[self.key_answers] = [item[self.key_answers] for item in batch]
        # Collate question types: stack ids into a tensor, keep names as a list of strings.
data_streams[self.key_question_type_ids] = torch.tensor([item[self.key_question_type_ids] for item in batch])
data_streams[self.key_question_type_names] = [item[self.key_question_type_names] for item in batch]
# Return collated dict.
return data_streams
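

if __name__ == '__main__':
    # A minimal, standalone sketch of the image preprocessing performed by get_image(),
    # assuming CLEVR v1.0 is already unpacked locally; the path below is only a hypothetical example.
    sample_image_path = os.path.expanduser("~/data/CLEVR_v1.0/images/train/CLEVR_train_000000.png")
    if os.path.isfile(sample_image_path):
        loaded_img = Image.open(sample_image_path).convert('RGB')
        # Resize, convert to tensor and normalize with the statistics expected by TorchVision pretrained models.
        preprocessing = transforms.Compose([
            transforms.Resize([224, 224]),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])
        print("Preprocessed image shape: {}".format(preprocessing(loaded_img).shape))
    else:
        print("Sample image not found under '{}' - please adjust the path.".format(sample_image_path))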