# configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml
# Base configuration file to load; it defines the tasks for training,
# validation and testing.
default_configs: vqa_med_2019/default_vqa_med_2019.yml
hyperparameters:
  # Values taken from the experiment spreadsheet. Each one carries an anchor
  # so that other sections of this file can reference it through an alias.

  # Question preprocessing steps. Accepted formats: a,b,c or [a,b,c].
  # Options: none | lowercase | remove_punctuation | tokenize |
  #          random_remove_stop_words | random_shuffle_words | all
  question_preprocessing: &question_preprocessing 'lowercase, remove_punctuation, tokenize'

  # Image preprocessing steps. Accepted formats: a,b,c or [a,b,c].
  # Options: none | random_affine | random_horizontal_flip | normalize | all
  image_preprocessing: &image_preprocessing 'normalize'

  # ---- Image encoder ----
  # Options: vgg16 | densenet121 | resnet152 | resnet50
  image_encoder_model: &image_encoder_model 'vgg16'
  # image_encoder_output_size_val: &image_encoder_output_size_val 2048
  image_attention_multihead_size_val: &image_attention_multihead_size_val 2
  # image_attention_output_size_val: &image_attention_output_size_val 4096

  # ---- Question encoder ----
  # Embedding options: '' | glove.6B.50d.txt | glove.6B.100d.txt |
  #   glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt |
  #   glove.840B.300d.txt | glove.twitter.27B.txt |
  #   mimic.fastText.no_clean.300d.pickled
  question_encoder_embeddings: &question_encoder_embeddings 'glove.6B.100d.txt'
  question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100
  question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128
  question_encoder_output_size_val: &question_encoder_output_size_val 200
  question_attention_multihead_size_val: &question_attention_multihead_size_val 4
  question_attention_output_size_val: &question_attention_output_size_val 800

  # ---- Fusion I: image + question ----
  # Options: LowRankBilinearPooling | FactorizedBilinearPooling
  # (consumed by the `question_image_fusion` component).
  question_image_fusion_type_val: &question_image_fusion_type 'FactorizedBilinearPooling'
  question_image_fusion_size_val: &question_image_fusion_size_val 512

  # ---- Image size encoder ----
  image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10

  # ---- Fusion II: (image + question) + image size ----
  # Must equal question_image_fusion_size_val + image_size_encoder_output_size_val
  # (512 + 10 = 522).
  question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522

  # ---- Final classifier: FFN ----
  answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val
    - 100

  # Batch size shared by the training and validation tasks.
  batch_size: &batch_size 64
# Training parameters:
training:
  task:
    batch_size: *batch_size
    categories: all  # C1,C2,C3 # TODO: all
    # The same weights file is exported here and read by the sampler below.
    export_sample_weights: &sample_weights_file '~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv'
    # Apply all preprocessing/data augmentations.
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing
    streams:
      questions: tokenized_questions

  sampler:
    weights: *sample_weights_file

  # Optimizer parameters:
  optimizer:
    type: Adam
    lr: 0.0001

  # Terminal conditions:
  terminal_conditions:
    loss_stop_threshold: 1.0e-3
    episode_limit: 5000  # 10000
    epoch_limit: -1
# Validation parameters:
validation:
  task:
    batch_size: *batch_size
    categories: all  # C1,C2,C3 # TODO: all
    # Apply all preprocessing/data augmentations
    # (same aliases as the training task).
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing
    streams:
      questions: tokenized_questions
pipeline:
  # Components are listed in ascending `priority`; each one wires its
  # inputs/outputs through `streams` and its sizes through `globals`.

  ################# PIPE 0: SHARED #################

  # Add global variables (sizes shared by the components below).
  # `keys` and `values` are presumably paired by position - keep them aligned.
  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    keys:
      - question_encoder_output_size
      - question_attention_output_size
      - image_size_encoder_input_size
      - image_size_encoder_output_size
      - fused_activation_size
    values:
      - *question_encoder_output_size_val
      - *question_attention_output_size_val
      - 2
      - *image_size_encoder_output_size_val
      - *question_image_fusion_size_val

  # Statistics.
  batch_size:
    priority: 0.1
    type: BatchSizeStatistics

  # Answer encoding: maps `answers` to `answers_ids`.
  pipe1_all_answer_indexer:
    priority: 0.2
    type: LabelIndexer
    data_folder: ~/data/vqa-med
    word_mappings_file: answers.all.word.mappings.csv  # TODO: all
    # Export mappings and size to globals.
    export_word_mappings_to_globals: true
    streams:
      inputs: answers
      outputs: answers_ids
    globals:
      vocabulary_size: vocabulary_size_c1234_binary_yn  # TODO: rename
      word_mappings: word_mappings_c1234_binary_yn

  ################# PIPE 1: SHARED QUESTION ENCODER #################

  # Model 1: question embeddings.
  pipe1_question_embeddings:
    priority: 1.1
    type: SentenceEmbeddings
    embeddings_size: *question_encoder_embeddings_size_val
    pretrained_embeddings_file: *question_encoder_embeddings
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions
    globals:
      embeddings_size: pipe1_embeddings_size

  # Model 2: question RNN (GRU cell, despite the `lstm` in the names).
  pipe1_lstm:
    priority: 1.2
    type: RecurrentNeuralNetwork
    cell_type: GRU
    hidden_size: *question_encoder_lstm_size_val
    prediction_mode: Dense
    initial_state: Trainable
    use_logsoftmax: false
    output_last_state: false
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: pipe1_embeddings_size
      prediction_size: question_encoder_output_size

  # Model 3: self attention for question.
  question_attention:
    priority: 1.3
    type: SelfAttention
    latent_size: *question_encoder_lstm_size_val
    # Head count comes from the shared hyperparameter (value 4).
    num_attention_heads: *question_attention_multihead_size_val
    streams:
      question_encodings: question_activations
      outputs: question_attention_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: question_attention_output_size

  ################# PIPE 2: SHARED IMAGE ENCODER #################

  # Image encoder; feature maps are requested instead of a flat encoding.
  image_encoder:
    priority: 2.1
    type: GenericImageEncoder
    model_type: *image_encoder_model
    return_feature_maps: true
    streams:
      inputs: images
      outputs: feature_maps

  # Attention over the image feature maps, driven by the attended question.
  image_attention:
    priority: 2.2
    type: QuestionDrivenAttention
    dropout_rate: 0.3
    latent_size: 256
    output_mode: 'Image'
    # Head count comes from the shared hyperparameter (value 2).
    num_attention_heads: *image_attention_multihead_size_val
    streams:
      image_encodings: feature_maps
      question_encodings: question_attention_activations
      outputs: image_attention_activations
    globals:
      question_encoding_size: question_attention_output_size
      output_size: image_attention_output_size

  ################# PIPE 3: image-question fusion #################

  # MFB.
  question_image_fusion:
    priority: 3.1
    type: *question_image_fusion_type
    dropout_rate: 0.5
    latent_size: 512
    pool_factor: 2
    streams:
      image_encodings: image_attention_activations
      question_encodings: question_attention_activations
      outputs: fused_activations
    globals:
      image_encoding_size: image_attention_output_size
      question_encoding_size: question_attention_output_size
      # output_size: image_attention_output_size  # fused_activation_size

  ################# PIPE 4: SHARED IMAGE SIZE ENCODER #################

  # Model - image size classifier.
  image_size_encoder:
    priority: 4.1
    type: FeedForwardNetwork
    # Key spelling corrected from `use_losfotmax` to match the
    # `use_logsoftmax` option used by pipe1_lstm.
    # NOTE(review): the misspelled key presumably had no effect, so this
    # changes the encoder's output; re-check against checkpoints trained with
    # the old config.
    use_logsoftmax: false
    streams:
      inputs: image_sizes
      predictions: image_size_activations
    globals:
      input_size: image_size_encoder_input_size
      prediction_size: image_size_encoder_output_size

  ################# PIPE 5: image-question-image size fusion #################

  # 5th subpipeline: concatenation of fused activations and image-size
  # activations along dim 1 (512 + 10 = 522).
  concat:
    priority: 5.1
    type: ConcatenateTensor
    input_streams: [fused_activations, image_size_activations]
    # ConcatenateTensor
    dim: 1  # default
    input_dims: [[-1, *question_image_fusion_size_val], [-1, *image_size_encoder_output_size_val]]
    output_dims: [-1, *question_image_size_fusion_size_val]
    streams:
      outputs: concatenated_activations
    globals:
      output_size: concatenated_activations_size

  ################# PIPE 6: answer classification #################
  # Component and stream names carry a `c123` prefix, but the vocabulary and
  # word mappings used here are the `c1234_binary_yn` ones exported by
  # pipe1_all_answer_indexer.

  # Model 4: FFN answering.
  pipe6_c123_answer_classifier:
    priority: 6.3
    type: FeedForwardNetwork
    # NOTE(review): verify the option name (`hidden` vs `hidden_sizes`)
    # against the FeedForwardNetwork component.
    hidden: *answer_classifier_hidden_sizes_val
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
      predictions: pipe6_c123_predictions
    globals:
      input_size: concatenated_activations_size
      prediction_size: vocabulary_size_c1234_binary_yn

  pipe6_c123_nllloss:
    priority: 6.4
    type: NLLLoss
    targets_dim: 1
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
      loss: pipe6_c123_loss

  pipe6_c123_precision_recall:
    priority: 6.5
    type: PrecisionRecallStatistics
    use_word_mappings: true
    show_class_scores: true
    # show_confusion_matrix: True
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
    globals:
      # Must name the mappings exported by pipe1_all_answer_indexer
      # (was `word_mappings_c123_binary_yn`, which nothing in this file exports).
      word_mappings: word_mappings_c1234_binary_yn
    statistics:
      precision: pipe6_c123_precision
      recall: pipe6_c123_recall
      f1score: pipe6_c123_f1score

  # Predictions decoder.
  pipe5_c123_prediction_decoder:
    priority: 6.6
    type: WordDecoder
    # Use the same word mappings as label indexer.
    import_word_mappings_from_globals: true
    streams:
      inputs: pipe6_c123_predictions
      outputs: predicted_answers
    globals:
      # Same key as exported by pipe1_all_answer_indexer
      # (was `word_mappings_c123_binary_yn`).
      word_mappings: word_mappings_c1234_binary_yn

  ################# PIPE 7: MERGE ANSWERS #################

  # Viewers.
  viewer:
    priority: 7.3
    type: StreamViewer
    # NOTE(review): no component in this file outputs
    # `pipe0_predicted_question_categories_names`; confirm it is provided
    # elsewhere or drop it from the list.
    input_streams: >-
      tokenized_questions,
      category_names, pipe0_predicted_question_categories_names,
      answers, predicted_answers