# IBM/pytorchpipe
# configs/vqa_med_2019/evaluation/tom/glove_lstm_vgg16_att_is_cat_ffn_c123_loss.yml

# Load config defining tasks for training, validation and testing.
default_configs:
  vqa_med_2019/default_vqa_med_2019.yml #,vqa_med_2019/frozen_pipelines/frozen_question_categorization_glove_rnn_ffn.yml

hyperparameters:
  # Selected hyperparameters, taken from the experiment spreadsheet.

  question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize
  # Accepted formats: a,b,c or [a,b,c]
  # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all

  image_preprocessing: &image_preprocessing normalize
  # Accepted formats: a,b,c or [a,b,c]
  # none | random_affine | random_horizontal_flip | normalize | all

  # Image encoder.
  image_encoder_model: &image_encoder_model vgg16
  # Options: vgg16 | densenet121 | resnet152 | resnet50
  #image_encoder_output_size_val: &image_encoder_output_size_val 100
  # INFO: this variable is not used, as this pipeline works on image feature maps.
  
  # Question encoder.
  question_encoder_embeddings: &question_encoder_embeddings glove.6B.50d.txt
  # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled
  question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 50
  question_encoder_lstm_size_val: &question_encoder_lstm_size_val 50
  question_encoder_output_size_val: &question_encoder_output_size_val 100
  
  # Fusion I: image + question
  question_image_fusion_type_val: &question_image_fusion_type QuestionDrivenAttention
  # Options: LowRankBilinearPooling | QuestionDrivenAttention
  #question_image_fusion_size_val: &question_image_fusion_size_val 1124
  # INFO: this variable is set by the QuestionDrivenAttention component.

  # Image size encoder.
  image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10

  # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val)
  question_image_size_fusion_size_val: &question_image_size_fusion_size_val 1134
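  # I.e. 1124 + 10 = 1134: the attention output size noted above plus the image size encoder output size.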

  # Final classifier: FFN.
  answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [500]

  batch_size: &batch_size 300
  preload_images: &preload_images False
  num_workers: &num_workers 4

# Training parameters:
training:
  task:
    batch_size: *batch_size
    categories: C1,C2,C3
    export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
    # Apply all preprocessing/data augmentations.
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing 
    # Preload images.
    preload_images: *preload_images
    streams: 
      questions: tokenized_questions
  sampler:
    weights: ~/data/vqa-med/answers.c1_c2_c3_binary_yn.weights.csv
  # Use four workers for loading images.
  dataloader:
    num_workers: *num_workers

  # Optimizer parameters:
  optimizer:
    type: Adam
    lr: 0.0001

  # Terminal conditions:
  terminal_conditions:
    loss_stop_threshold: 1.0e-3
    episode_limit: 10000
    epoch_limit: -1

# Validation parameters:
validation:
  task:
    batch_size: *batch_size
    categories: C1,C2,C3
    # Apply all preprocessing/data augmentations.
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing 
    # Do not preload images, as they will be needed only once, at the end.
    preload_images: false
    streams: 
      questions: tokenized_questions
  dataloader:
    num_workers: 1


pipeline:
  
  ################# PIPE 0: SHARED #################

  # Add global variables.
  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size, image_size_encoder_input_size, image_size_encoder_output_size] #, image_encoder_output_size] #, fused_activation_size]
    values: [*question_encoder_output_size_val, 2, *image_size_encoder_output_size_val] #, *image_encoder_output_size_val] #, *question_image_fusion_size_val]
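    # Note: the value 2 published as image_size_encoder_input_size corresponds to the two components of the image_sizes stream (presumably width and height).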

  # Statistics.
  batch_size:
    priority: 0.1
    type: BatchSizeStatistics

  # Answer encoding.
  pipe1_all_answer_indexer:
    priority: 0.2
    type: LabelIndexer
    data_folder: ~/data/vqa-med
    word_mappings_file: answers.c1_c2_c3_binary_yn.word.mappings.csv
    # Export mappings and size to globals.
    export_word_mappings_to_globals: True
    streams:
      inputs: answers
      outputs: answers_ids
    globals:
      vocabulary_size: vocabulary_size_c123_binary_yn
      word_mappings: word_mappings_c123_binary_yn


  ################# PIPE 1: SHARED QUESTION ENCODER #################

  # Model 1: question embeddings
  pipe1_question_embeddings:
    priority: 1.1
    type: SentenceEmbeddings
    embeddings_size: *question_encoder_embeddings_size_val
    pretrained_embeddings_file: *question_encoder_embeddings
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions
    globals:
      embeddings_size: pipe1_embeddings_size     
  
  # Model 2: question RNN
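  # (prediction_mode 'Last' returns only the final LSTM output, i.e. one fixed-size encoding per question.)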
  pipe1_lstm:
    priority: 1.2
    type: RecurrentNeuralNetwork
    cell_type: LSTM
    hidden_size: *question_encoder_lstm_size_val
    prediction_mode: Last
    initial_state: Trainable
    use_logsoftmax: False
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: pipe1_embeddings_size
      prediction_size: question_encoder_output_size

  ################# PIPE 2: SHARED IMAGE ENCODER #################

  # Image encoder.
  image_encoder:
    priority: 2.1
    type: GenericImageEncoder
    model: *image_encoder_model
    return_feature_maps: True
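    # Returning feature maps (rather than a flattened vector) so that the attention component below can attend over spatial locations.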
    streams:
      inputs: images
      outputs: feature_maps

  ################# PIPE 3: SHARED IMAGE SIZE ENCODER #################

  # Model - image size encoder (FFN).
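  # Encodes the two-element image size into a 10-d vector that is concatenated with the fused activations in pipe 5.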
  image_size_encoder:
    priority: 3.1
    type: FeedForwardNetwork 
    use_logsoftmax: False
    streams:
      inputs: image_sizes
      predictions: image_size_activations
    globals:
      input_size: image_size_encoder_input_size
      prediction_size: image_size_encoder_output_size

  ################# PIPE 4: image-question fusion  #################
  # Attention + FF.
  question_image_fusion:
    priority: 4.1
    type: *question_image_fusion_type
    dropout_rate: 0.5
    # Attention params.
    latent_size: 100
    num_attention_heads: 2
    streams:
      image_encodings: feature_maps
      question_encodings: question_activations
      outputs: fused_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: fused_activation_size
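    # Note: fused_activation_size is published by the attention component itself; with 2 heads over 512-channel VGG-16 feature maps plus the 100-d question encoding this presumably amounts to 2*512 + 100 = 1124, matching the size hard-coded in the concat component below.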


  question_image_ffn:
    priority: 4.2
    type: FeedForwardNetwork 
    #hidden_sizes: [*question_image_fusion_size_val]
    dropout_rate: 0.5
    use_logsoftmax: False
    streams:
      inputs: fused_activations
      predictions: question_image_activations
    globals:
      input_size: fused_activation_size
      prediction_size: fused_activation_size
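    # Note: input and output sizes are both fused_activation_size, so this FFN is a size-preserving non-linear projection of the attention output.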

  ################# PIPE 5: image-question-image size fusion #################

  # 5th subpipeline: concatenation 
  concat:
    priority: 5.1
    type: ConcatenateTensor
    input_streams: [question_image_activations,image_size_activations]
    # ConcatenateTensor parameters.
    dim: 1 # default
    input_dims: [[-1,1124],[-1,*image_size_encoder_output_size_val]]
    output_dims: [-1,*question_image_size_fusion_size_val]
    streams:
      outputs: concatenated_activations
    globals:
      output_size: concatenated_activations_size

  ################# PIPE 6: C1 + C2 + C3 questions #################

  # Model 4: FFN C123 answering
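  # Note: unlike the earlier FFNs, use_logsoftmax is not disabled here, so (assuming the component default of True) the NLLLoss below receives log-probabilities.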
  pipe6_c123_answer_classifier:
    priority: 6.3
    type: FeedForwardNetwork
    hidden_sizes: *answer_classifier_hidden_sizes_val
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
      predictions: pipe6_c123_predictions
    globals:
      input_size: concatenated_activations_size
      prediction_size: vocabulary_size_c123_binary_yn

  pipe6_c123_nllloss:
    priority: 6.4
    type: NLLLoss
    targets_dim: 1
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
      loss: pipe6_c123_loss

  pipe6_c123_precision_recall:
    priority: 6.5
    type: PrecisionRecallStatistics
    use_word_mappings: True
    show_class_scores: True
    #show_confusion_matrix: True
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
    globals:
      word_mappings: word_mappings_c123_binary_yn
    statistics:
      precision: pipe6_c123_precision
      recall: pipe6_c123_recall
      f1score: pipe6_c123_f1score

  # C123 Predictions decoder.
  pipe6_c123_prediction_decoder:
    priority: 6.6
    type: WordDecoder
    # Use the same word mappings as label indexer.
    import_word_mappings_from_globals: True
    streams:
      inputs: pipe6_c123_predictions
      outputs: predicted_answers
    globals:
      word_mappings: word_mappings_c123_binary_yn

  ################# PIPE 9: MERGE ANSWERS #################

  # Viewers.
  viewer:
    priority: 9.3
    type: StreamViewer
    input_streams:
      tokenized_questions,
      category_names, predicted_category_names,
      answers, predicted_answers


#: pipeline