ptp/components/models/vision/convnet_encoder.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) IBM Corporation 2019
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = "Younes Bouhadjar, Vincent Marois, Tomasz Kornuta"
import torch
import numpy as np
import torch.nn as nn
from ptp.components.models.model import Model
from ptp.data_types.data_definition import DataDefinition
class ConvNetEncoder(Model):
    """
    A simple image encoder consisting of 3 consecutive convolutional blocks.

    Each block is ``Conv2d -> MaxPool2d -> ReLU``. The parameters of the input image
    (width, height and depth) are not hardcoded but read from the globals, so the
    encoder can be adjusted for a given application.
    """

    def __init__(self, name, config):
        """
        Constructor of the ``ConvNetEncoder``.

        The overall structure of this CNN is as follows:

            (Conv1 -> MaxPool1 -> ReLu) -> (Conv2 -> MaxPool2 -> ReLu) -> (Conv3 -> MaxPool3 -> ReLu)

        The parameters that the user can change are:

            - For Conv1, Conv2 & Conv3: number of output channels, kernel size, stride and padding.
            - For MaxPool1, MaxPool2 & MaxPool3: kernel size.

        .. note::

            ``dilation=1``, ``groups=1`` and ``bias=True`` are passed explicitly to ``nn.Conv2d``.
            Only ``kernel_size`` is passed to ``nn.MaxPool2d``; its ``stride`` and ``padding`` are
            read back from the created module when computing the output sizes.

        :param name: Name of the model (taken from the configuration file).

        :param config: dict of parameters (read from configuration ``.yaml`` file).
        :type config: ``ptp.configuration.ConfigInterface``
        """
        # Call base constructor.
        super(ConvNetEncoder, self).__init__(name, ConvNetEncoder, config)

        # Set key mappings.
        self.key_inputs = self.stream_keys["inputs"]
        self.key_feature_maps = self.stream_keys["feature_maps"]

        # Get input image information from the global parameters.
        self.input_width = self.globals["input_width"]
        self.input_height = self.globals["input_height"]
        self.input_depth = self.globals["input_depth"]

        # Retrieve the Conv1 parameters.
        self.out_channels_conv1 = config['conv1']['out_channels']
        self.kernel_size_conv1 = config['conv1']['kernel_size']
        self.stride_conv1 = config['conv1']['stride']
        self.padding_conv1 = config['conv1']['padding']

        # Retrieve the MaxPool1 parameter.
        self.kernel_size_maxpool1 = config['maxpool1']['kernel_size']

        # Retrieve the Conv2 parameters.
        self.out_channels_conv2 = config['conv2']['out_channels']
        self.kernel_size_conv2 = config['conv2']['kernel_size']
        self.stride_conv2 = config['conv2']['stride']
        self.padding_conv2 = config['conv2']['padding']

        # Retrieve the MaxPool2 parameter.
        self.kernel_size_maxpool2 = config['maxpool2']['kernel_size']

        # Retrieve the Conv3 parameters.
        self.out_channels_conv3 = config['conv3']['out_channels']
        self.kernel_size_conv3 = config['conv3']['kernel_size']
        self.stride_conv3 = config['conv3']['stride']
        self.padding_conv3 = config['conv3']['padding']

        # Retrieve the MaxPool3 parameter.
        self.kernel_size_maxpool3 = config['maxpool3']['kernel_size']

        # The spatial size of each layer's output is computed by ``_output_size``
        # from the input size (W), the kernel size (F), the stride (S) and the
        # zero padding (P): size = floor(((W - F + 2P) / S) + 1).
        # doc for nn.Conv2D: https://pytorch.org/docs/stable/nn.html#torch.nn.Conv2d
        # doc for nn.MaxPool2D: https://pytorch.org/docs/stable/nn.html#torch.nn.MaxPool2d
        # NOTE: layers are created in the order conv1, maxpool1, ..., maxpool3 so that
        # module registration order stays unchanged.

        # ----------------------------------------------------
        # Conv1
        self.conv1 = nn.Conv2d(in_channels=self.input_depth,
                               out_channels=self.out_channels_conv1,
                               kernel_size=self.kernel_size_conv1,
                               stride=self.stride_conv1,
                               padding=self.padding_conv1,
                               dilation=1,
                               groups=1,
                               bias=True)
        self.width_features_conv1 = self._output_size(
            self.input_width, self.kernel_size_conv1, self.padding_conv1, self.stride_conv1)
        self.height_features_conv1 = self._output_size(
            self.input_height, self.kernel_size_conv1, self.padding_conv1, self.stride_conv1)

        # ----------------------------------------------------
        # MaxPool1
        self.maxpool1 = nn.MaxPool2d(kernel_size=self.kernel_size_maxpool1)
        self.width_features_maxpool1 = self._output_size(
            self.width_features_conv1, self.maxpool1.kernel_size, self.maxpool1.padding, self.maxpool1.stride)
        self.height_features_maxpool1 = self._output_size(
            self.height_features_conv1, self.maxpool1.kernel_size, self.maxpool1.padding, self.maxpool1.stride)

        # ----------------------------------------------------
        # Conv2
        self.conv2 = nn.Conv2d(in_channels=self.out_channels_conv1,
                               out_channels=self.out_channels_conv2,
                               kernel_size=self.kernel_size_conv2,
                               stride=self.stride_conv2,
                               padding=self.padding_conv2,
                               dilation=1,
                               groups=1,
                               bias=True)
        self.width_features_conv2 = self._output_size(
            self.width_features_maxpool1, self.kernel_size_conv2, self.padding_conv2, self.stride_conv2)
        self.height_features_conv2 = self._output_size(
            self.height_features_maxpool1, self.kernel_size_conv2, self.padding_conv2, self.stride_conv2)

        # ----------------------------------------------------
        # MaxPool2
        self.maxpool2 = nn.MaxPool2d(kernel_size=self.kernel_size_maxpool2)
        self.width_features_maxpool2 = self._output_size(
            self.width_features_conv2, self.maxpool2.kernel_size, self.maxpool2.padding, self.maxpool2.stride)
        self.height_features_maxpool2 = self._output_size(
            self.height_features_conv2, self.maxpool2.kernel_size, self.maxpool2.padding, self.maxpool2.stride)

        # ----------------------------------------------------
        # Conv3
        self.conv3 = nn.Conv2d(in_channels=self.out_channels_conv2,
                               out_channels=self.out_channels_conv3,
                               kernel_size=self.kernel_size_conv3,
                               stride=self.stride_conv3,
                               padding=self.padding_conv3,
                               dilation=1,
                               groups=1,
                               bias=True)
        self.width_features_conv3 = self._output_size(
            self.width_features_maxpool2, self.kernel_size_conv3, self.padding_conv3, self.stride_conv3)
        self.height_features_conv3 = self._output_size(
            self.height_features_maxpool2, self.kernel_size_conv3, self.padding_conv3, self.stride_conv3)

        # ----------------------------------------------------
        # MaxPool3
        self.maxpool3 = nn.MaxPool2d(kernel_size=self.kernel_size_maxpool3)
        self.width_features_maxpool3 = self._output_size(
            self.width_features_conv3, self.maxpool3.kernel_size, self.maxpool3.padding, self.maxpool3.stride)
        # The height must use maxpool3's own kernel size (it previously used maxpool1's,
        # which gave a wrong height whenever the two kernel sizes differed).
        self.height_features_maxpool3 = self._output_size(
            self.height_features_conv3, self.maxpool3.kernel_size, self.maxpool3.padding, self.maxpool3.stride)

        # Set global variables: output dims
        self.globals["feature_map_height"] = self.height_features_maxpool3
        self.globals["feature_map_width"] = self.width_features_maxpool3
        self.globals["feature_map_depth"] = self.out_channels_conv3

        # log some info.
        self.logger.info('Input: [-1, {}, {}, {}]'.format(self.input_depth, self.input_width, self.input_height))
        self.logger.info('Computed output shape of each layer:')
        self.logger.info('Conv1: [-1, {}, {}, {}]'.format(self.out_channels_conv1, self.width_features_conv1,
                                                          self.height_features_conv1))
        self.logger.info('MaxPool1: [-1, {}, {}, {}]'.format(self.out_channels_conv1, self.width_features_maxpool1,
                                                             self.height_features_maxpool1))
        self.logger.info('Conv2: [-1, {}, {}, {}]'.format(self.out_channels_conv2, self.width_features_conv2,
                                                          self.height_features_conv2))
        self.logger.info('MaxPool2: [-1, {}, {}, {}]'.format(self.out_channels_conv2, self.width_features_maxpool2,
                                                             self.height_features_maxpool2))
        self.logger.info('Conv3: [-1, {}, {}, {}]'.format(self.out_channels_conv3, self.width_features_conv3,
                                                          self.height_features_conv3))
        self.logger.info('MaxPool3: [-1, {}, {}, {}]'.format(self.out_channels_conv3, self.width_features_maxpool3,
                                                             self.height_features_maxpool3))

    @staticmethod
    def _output_size(size, kernel_size, padding, stride):
        """
        Compute the spatial output size of a convolution/pooling layer along one axis.

        Implements ``floor(((W - F + 2P) / S) + 1)``.

        :param size: input size along the axis (W).
        :param kernel_size: kernel size along the axis (F).
        :param padding: zero padding applied on each border (P).
        :param stride: stride of the layer (S).

        :return: output size as returned by ``np.floor`` (a floating-point value).
        """
        return np.floor(((size - kernel_size + 2 * padding) / stride) + 1)

    def input_data_definitions(self):
        """
        Function returns a dictionary with definitions of input data that are required by the component.

        :return: dictionary containing input data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
        """
        return {
            self.key_inputs: DataDefinition([-1, self.input_depth, self.input_height, self.input_width], [torch.Tensor], "Batch of images [BATCH_SIZE x IMAGE_DEPTH x IMAGE_HEIGHT x IMAGE WIDTH]"),
        }

    def output_data_definitions(self):
        """
        Function returns a dictionary with definitions of output data produced by the component.

        :return: dictionary containing output data definitions (each of type :py:class:`ptp.utils.DataDefinition`).
        """
        return {
            self.key_feature_maps: DataDefinition([-1, self.out_channels_conv3, self.height_features_maxpool3, self.width_features_maxpool3], [torch.Tensor], "Batch of filter maps [BATCH_SIZE x FEAT_DEPTH x FEAT_HEIGHT x FEAT_WIDTH]")
        }

    def forward(self, data_streams):
        """
        Forward pass of the ``ConvNetEncoder`` model.

        Reads the batch of images stored under the ``inputs`` key, passes it through the three
        (Conv -> MaxPool -> ReLU) blocks and publishes the result under the ``feature_maps`` key.

        :param data_streams: DataStreams({'inputs', 'feature_maps'}), where:

            - inputs: [batch_size, in_depth, in_height, in_width],
            - feature_maps: batch of feature maps [batch_size, out_depth, out_height, out_width]
        """
        # get images
        images = data_streams[self.key_inputs]

        # apply Convolutional layer 1
        out_conv1 = self.conv1(images)

        # apply max_pooling and relu
        out_maxpool1 = torch.nn.functional.relu(self.maxpool1(out_conv1))

        # apply Convolutional layer 2
        out_conv2 = self.conv2(out_maxpool1)

        # apply max_pooling and relu
        out_maxpool2 = torch.nn.functional.relu(self.maxpool2(out_conv2))

        # apply Convolutional layer 3
        out_conv3 = self.conv3(out_maxpool2)

        # apply max_pooling and relu
        out_maxpool3 = torch.nn.functional.relu(self.maxpool3(out_conv3))

        # Add output to datadict.
        data_streams.publish({self.key_feature_maps: out_maxpool3})