configs/vqa_med_2019/evaluation/tom/glove_lstm_resnet152_mcb_is_cat_ffn_c123_loss.yml
# Load config defining tasks for training, validation and testing.
default_configs:
vqa_med_2019/default_vqa_med_2019.yml #,vqa_med_2019/frozen_pipelines/frozen_question_categorization_glove_rnn_ffn.yml
hyperparameters:
# Selected hyperparameters from the experiment spreadsheet are gathered here.
question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize
# Accepted formats: a,b,c or [a,b,c]
# none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all
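# Example (commented out, illustrative only): a chain that also augments the
# question text during training, built from the options listed above and
# assuming the options compose in the order given:
#question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize, random_remove_stop_words, random_shuffle_words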
image_preprocessing: &image_preprocessing normalize
# Accepted formats: a,b,c or [a,b,c]
# none | random_affine | random_horizontal_flip | normalize | all
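# Example (commented out, illustrative only): enable the listed augmentations
# on top of normalization:
#image_preprocessing: &image_preprocessing random_affine, random_horizontal_flip, normalize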
# Image encoder.
image_encoder_model: &image_encoder_model resnet152
# Options: vgg16 | densenet121 | resnet152 | resnet50
image_encoder_output_size_val: &image_encoder_output_size_val 100
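# Example (commented out, illustrative only): any backbone from the options
# above can be swapped in via this anchor; the output size above stays as-is,
# assuming the encoder projects to image_encoder_output_size_val:
#image_encoder_model: &image_encoder_model vgg16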
# Question encoder.
question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt
# Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled
question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50
question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50
question_encoder_output_size_val: &question_encoder_output_size_val 100
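# Example (commented out, illustrative only): switching to the 100d GloVe
# vectors from the options above; note the embeddings size must match the
# dimensionality of the chosen file:
#question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt
#question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100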
# Fusion I: image + question
question_image_fusion_type_val: &question_image_fusion_type CompactBilinearPooling
# Options: LowRankBilinearPooling | CompactBilinearPooling | QuestionDrivenAttention
question_image_fusion_size_val: &question_image_fusion_size_val 100
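# Example (commented out, illustrative only): any fusion from the options
# above can be selected through this anchor, e.g.:
#question_image_fusion_type_val: &question_image_fusion_type LowRankBilinearPooling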
# Image size encoder.
image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10
# Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val)
question_image_size_fusion_size_val: &question_image_size_fusion_size_val 110
# Final classifier: FFN.
answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100]
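# Example (commented out, illustrative only): a deeper answer classifier;
# the layer sizes here are arbitrary placeholders:
#answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [500, 100]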
batch_size: &batch_size 150
preload_images: &preload_images False
num_workers: &num_workers 3
# Training parameters:
training:
task:
batch_size: *batch_size
categories: C1,C2,C3
export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
# Apply all preprocessing/data augmentations.
question_preprocessing: *question_preprocessing
image_preprocessing: *image_preprocessing
# Preload images.
preload_images: *preload_images
streams:
questions: tokenized_questions
sampler:
weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
# Use multiple workers for loading images.
dataloader:
num_workers: *num_workers
# Optimizer parameters:
optimizer:
type: Adam
lr: 0.0001
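# Assuming these entries are forwarded to torch.optim.Adam, its remaining
# arguments could be set here as well, e.g. (commented out, illustrative value):
#weight_decay: 0.0001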
# Terminal conditions:
terminal_conditions:
loss_stop_threshold: 1.0e-3
episode_limit: 10000
epoch_limit: -1
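# To bound training by epochs instead of episodes (assuming -1 disables a
# limit, as for epoch_limit above), one could instead set, e.g.:
#episode_limit: -1
#epoch_limit: 20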
# Validation parameters:
validation:
task:
batch_size: *batch_size
categories: C1,C2,C3
# Apply all preprocessing/data augmentations.
question_preprocessing: *question_preprocessing
image_preprocessing: *image_preprocessing
# Do not preload images, as we will need them only once, at the end.
preload_images: false
streams:
questions: tokenized_questions
dataloader:
num_workers: 1
pipeline:
################# PIPE 0: SHARED #################
# Add global variables.
global_publisher:
priority: 0
type: GlobalVariablePublisher
# Add input_size to globals.
keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size, image_encoder_output_size, fused_activation_size]
values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val, *image_encoder_output_size_val, *question_image_fusion_size_val]
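# Note: the two lists above are paired element-wise, i.e. each key receives
# the value at the same position; image_size_encoder_input_size is 2,
# presumably the (width, height) pair of each image.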
# Statistics.
batch_size:
priority: 0.1
type: BatchSizeStatistics
# Answer encoding.
pipe1_all_answer_indexer:
priority: 0.2
type: LabelIndexer
data_folder: ~/data/vqa-med
word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv
# Export mappings and size to globals.
export_word_mappings_to_globals: True
streams:
inputs: answers
outputs: answers_ids
globals:
vocabulary_size: vocabulary_size_c123_binary_yn
word_mappings: word_mappings_c123_binary_yn
################# PIPE 1: SHARED QUESTION ENCODER #################
# Model 1: question embeddings
pipe1_question_embeddings:
priority: 1.1
type: SentenceEmbeddings
embeddings_size: *question_encoder_embeddings_size_val
pretrained_embeddings_file: *question_encoder_embeddings
data_folder: ~/data/vqa-med
word_mappings_file: questions.all.word.mappings.csv
streams:
inputs: tokenized_questions
outputs: embedded_questions
globals:
embeddings_size: pipe1_embeddings_size
# Model 2: question RNN
pipe1_lstm:
priority: 1.2
type: RecurrentNeuralNetwork
cell_type: LSTM
hidden_size: *question_encoder_lstm_size_val
prediction_mode: Last
initial_state: Trainable
use_logsoftmax: False
streams:
inputs: embedded_questions
predictions: question_activations
globals:
input_size: pipe1_embeddings_size
prediction_size: question_encoder_output_size
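# Note: prediction_mode Last returns only the prediction for the final time
# step, i.e. a single vector per question. A different cell (e.g. GRU) could
# be swapped in via cell_type, assuming this component supports it.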
################# PIPE 2: SHARED IMAGE ENCODER #################
# Image encoder.
image_encoder:
priority: 2.1
type: GenericImageEncoder
model: *image_encoder_model
streams:
inputs: images
outputs: image_activations
globals:
output_size: image_encoder_output_size
################# PIPE 3: SHARED IMAGE SIZE ENCODER #################
# Model - image size encoder (FFN).
image_size_encoder:
priority: 3.1
type: FeedForwardNetwork
use_logsoftmax: False
streams:
inputs: image_sizes
predictions: image_size_activations
globals:
input_size: image_size_encoder_input_size
prediction_size: image_size_encoder_output_size
################# PIPE 4: image-question fusion #################
# Fusion of question and image encodings (Compact Bilinear Pooling by default) + FFN.
question_image_fusion:
priority: 4.1
type: *question_image_fusion_type
dropout_rate: 0.5
streams:
image_encodings: image_activations
question_encodings: question_activations
outputs: fused_activations
globals:
image_encoding_size: image_encoder_output_size
question_encoding_size: question_encoder_output_size
output_size: fused_activation_size
question_image_ffn:
priority: 4.2
type: FeedForwardNetwork
hidden_sizes: [*question_image_fusion_size_val]
dropout_rate: 0.5
use_logsoftmax: False
streams:
inputs: fused_activations
predictions: question_image_activations
globals:
input_size: fused_activation_size
prediction_size: fused_activation_size
################# PIPE 5: image-question-image size fusion #################
# 5th subpipeline: concatenation
concat:
priority: 5.1
type: ConcatenateTensor
input_streams: [question_image_activations,image_size_activations]
# Concatenate along the feature dimension.
dim: 1 # default
input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]]
output_dims: [-1,*question_image_size_fusion_size_val]
streams:
outputs: concatenated_activations
globals:
output_size: concatenated_activations_size
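# Shape check: [-1, 100] (question-image activations) concatenated with
# [-1, 10] (image size activations) along dim 1 gives [-1, 110], matching
# question_image_size_fusion_size_val.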
################# PIPE 6: C1 + C2 + C3 questions #################
# Model 4: FFN C123 answering
pipe6_c123_answer_classifier:
priority: 6.3
type: FeedForwardNetwork
hidden_sizes: *answer_classifier_hidden_sizes_val
dropout_rate: 0.5
streams:
inputs: concatenated_activations
predictions: pipe6_c123_predictions
globals:
input_size: concatenated_activations_size
prediction_size: vocabulary_size_c123_binary_yn
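# Note: unlike the encoder FFNs above, this classifier leaves use_logsoftmax
# at its default (presumably True), so the NLLLoss below receives the
# log-probabilities it expects.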
pipe6_c123_nllloss:
priority: 6.4
type: NLLLoss
targets_dim: 1
streams:
predictions: pipe6_c123_predictions
targets: answers_ids
loss: pipe6_c123_loss
pipe6_c123_precision_recall:
priority: 6.5
type: PrecisionRecallStatistics
use_word_mappings: True
show_class_scores: True
#show_confusion_matrix: True
streams:
predictions: pipe6_c123_predictions
targets: answers_ids
globals:
word_mappings: word_mappings_c123_binary_yn
statistics:
precision: pipe6_c123_precision
recall: pipe6_c123_recall
f1score: pipe6_c123_f1score
# C123 Predictions decoder.
pipe6_c123_prediction_decoder:
priority: 6.6
type: WordDecoder
# Use the same word mappings as label indexer.
import_word_mappings_from_globals: True
streams:
inputs: pipe6_c123_predictions
outputs: predicted_answers
globals:
word_mappings: word_mappings_c123_binary_yn
################# PIPE 9: VIEWERS #################
# Viewers.
viewer:
priority: 9.3
type: StreamViewer
# Note: viewing predicted_category_names would additionally require the
# frozen question categorization pipeline, which is commented out in
# default_configs above.
input_streams:
tokenized_questions,
category_names,
answers, predicted_answers
#: pipeline