# configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml
# Load the default config that defines the tasks for training, validation and testing.
# NOTE(review): presumably the sections below override/extend those defaults - confirm
# against the loader's merge rules.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml
# Training section: settings for the task used during training.
training:
  task:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # Task is returning tokenized questions.
      questions: tokenized_questions
# Validation section: settings for the task used during validation.
validation:
  task:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # Task is returning tokenized questions.
      questions: tokenized_questions
# Pipeline: one entry per component. Every component declares a `priority`,
# a `type`, its own parameters, and the `streams`/`globals` names it is bound to.
pipeline:

  # Publishes the sizes that the components below reference by name in their
  # `globals` sections.
  # NOTE(review): keys and values are presumably paired by position - confirm.
  # NOTE(review): 800 looks like 200 x 4 question-attention heads and 4096 like
  # 2048 x 2 image-attention heads - confirm before changing either side.
  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    keys:
      - question_encoder_output_size
      - question_attention_activation_size
      - image_attention_activation_size
      - pooling_activation_size
    values: [200, 800, 4096, 512]

  ################# PIPE 1: question #################

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions

  # Model 2: RNN
  # NOTE(review): the component (and the config file name) says "lstm" but
  # cell_type is GRU - confirm which one is intended.
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: GRU
    prediction_mode: Dense
    use_logsoftmax: False
    output_last_state: False
    initial_state: Trainable
    dropout_rate: 0.1
    hidden_size: 128
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      # NOTE(review): embeddings_size is not among the keys published by
      # global_publisher; presumably exported by question_embeddings - confirm.
      input_size: embeddings_size
      prediction_size: question_encoder_output_size

  # Self Attention for question.
  question_attention:
    priority: 1.4
    type: SelfAttention
    latent_size: 128
    num_attention_heads: 4
    streams:
      question_encodings: question_activations
      outputs: question_attention_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: question_attention_activation_size

  ################# PIPE 2: image #################

  # Image encoder.
  image_encoder:
    priority: 2.1
    type: GenericImageEncoder
    model_type: resnet50
    return_feature_maps: True
    streams:
      inputs: images
      outputs: feature_maps

  # Attention over the image feature maps, driven by the attended question.
  image_attention:
    priority: 2.2
    type: QuestionDrivenAttention
    dropout_rate: 0.3
    latent_size: 1024
    output_mode: 'Image'
    num_attention_heads: 2
    streams:
      image_encodings: feature_maps
      question_encodings: question_attention_activations
      outputs: image_attention_activations
    globals:
      question_encoding_size: question_attention_activation_size
      output_size: image_attention_activation_size

  ################# PIPE 3: image-question fusion #################

  # MFB
  question_image_fusion:
    priority: 3.1
    type: FactorizedBilinearPooling
    dropout_rate: 0.3
    latent_size: 512
    pool_factor: 2
    streams:
      image_encodings: image_attention_activations
      question_encodings: question_attention_activations
      outputs: pooling_activations
    globals:
      image_encoding_size: image_attention_activation_size
      question_encoding_size: question_attention_activation_size
      output_size: pooling_activation_size

  ################# PIPE 4: classifier #################

  # Feed-forward classifier on top of the fused activations.
  classifier:
    priority: 4.1
    type: FeedForwardNetwork
    hidden_sizes: [100]
    dropout_rate: 0.2
    streams:
      inputs: pooling_activations
    globals:
      input_size: pooling_activation_size
      # NOTE(review): vocabulary_size_c2 is not published in this file;
      # presumably provided via default_configs - confirm.
      prediction_size: vocabulary_size_c2

#: pipeline