IBM/pytorchpipe

configs/vqa_med_2019/evaluation/deepta/glove_gru_vgg16_coattn_mfb_is_cat_ffn_c1234_loss.yml

# Load config defining tasks for training, validation and testing.
default_configs: vqa_med_2019/default_vqa_med_2019.yml

hyperparameters:
  # Selected hyperparameters (taken from the experiment spreadsheet).

  question_preprocessing: &question_preprocessing lowercase, remove_punctuation, tokenize
  # Accepted formats: a,b,c or [a,b,c]
  # none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all

  image_preprocessing: &image_preprocessing normalize
  # Accepted formats: a,b,c or [a,b,c]
  # none | random_affine | random_horizontal_flip | normalize | all

  # Image encoder.
  image_encoder_model: &image_encoder_model vgg16
  # Options: vgg16 | densenet121 | resnet152 | resnet50
  # image_encoder_output_size_val: &image_encoder_output_size_val 2048
  image_attention_multihead_size_val: &image_attention_multihead_size_val 2
  # image_attention_output_size_val: &image_attention_output_size_val 4096

  # Question encoder.
  question_encoder_embeddings: &question_encoder_embeddings glove.6B.100d.txt
  # Options: '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt | glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled
  question_encoder_embeddings_size_val: &question_encoder_embeddings_size_val 100
  question_encoder_lstm_size_val: &question_encoder_lstm_size_val 128
  question_encoder_output_size_val: &question_encoder_output_size_val 200
  question_attention_multihead_size_val: &question_attention_multihead_size_val 4
  question_attention_output_size_val: &question_attention_output_size_val 800
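  # Note: the GRU encoder below projects its 128-dim hidden states to 200-dim per-token activations;
  # self-attention with 4 heads over those (assuming the per-head contexts are concatenated) yields
  # 4 x 200 = 800, which is why question_attention_output_size_val is set to 800.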

  # Fusion I: image + question
  question_image_fusion_type_val: &question_image_fusion_type FactorizedBilinearPooling
  # Options: LowRankBilinearPooling | FactorizedBilinearPooling (component: question_image_fusion)
  question_image_fusion_size_val: &question_image_fusion_size_val 512

  # Image size encoder.
  image_size_encoder_output_size_val: &image_size_encoder_output_size_val 10

  # Fusion II: (image + question) + image size (must be = question_image_fusion_size_val + image_size_encoder_output_size_val)
  question_image_size_fusion_size_val: &question_image_size_fusion_size_val 522
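  # With the values above: 512 + 10 = 522.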

  # Final classifier: FFN.
  answer_classifier_hidden_sizes_val: &answer_classifier_hidden_sizes_val [100]

  batch_size: &batch_size 64

# Training parameters:
training:
  task:
    batch_size: *batch_size
    categories: all
    export_sample_weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv
    # Apply all preprocessing/data augmentations.
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing
    streams:
      questions: tokenized_questions
  sampler:
    weights: ~/data/vqa-med/answers.c1_c2_c3_c4_binary_yn.weights.csv

  # Optimizer parameters:
  optimizer:
    type: Adam
    lr: 0.0001

  # Terminal conditions:
  terminal_conditions:
    loss_stop_threshold: 1.0e-3
    episode_limit: 5000 #10000
    epoch_limit: -1

# Validation parameters:
validation:
  task:
    batch_size: *batch_size
    categories: all
    # Apply all preprocessing/data augmentations.
    question_preprocessing: *question_preprocessing
    image_preprocessing: *image_preprocessing
    streams:
      questions: tokenized_questions


pipeline:

  ################# PIPE 0: SHARED #################

  # Add global variables.
  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size, question_attention_output_size, image_size_encoder_input_size, image_size_encoder_output_size, fused_activation_size]
    values: [*question_encoder_output_size_val, *question_attention_output_size_val, 2, *image_size_encoder_output_size_val, *question_image_fusion_size_val]
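    # With the anchors above this publishes: question_encoder_output_size=200, question_attention_output_size=800,
    # image_size_encoder_input_size=2 (presumably raw image width and height), image_size_encoder_output_size=10,
    # fused_activation_size=512.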

  # Statistics.
  batch_size:
    priority: 0.1
    type: BatchSizeStatistics

  # Answer encoding.
  pipe1_all_answer_indexer:
    priority: 0.2
    type: LabelIndexer
    data_folder: ~/data/vqa-med
    word_mappings_file: answers.all.word.mappings.csv
    # Export mappings and size to globals.
    export_word_mappings_to_globals: True
    streams:
      inputs: answers
      outputs: answers_ids
    globals:
      vocabulary_size: vocabulary_size_c1234_binary_yn #  TODO: rename
      word_mappings: word_mappings_c1234_binary_yn
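    # The indexer turns answer strings into ids using the CSV above; the exported word mappings and vocabulary
    # size are reused below by the answer classifier (prediction_size), the precision/recall statistics and the
    # prediction decoder.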

  ################# PIPE 1: SHARED QUESTION ENCODER #################

  # Model 1: question embeddings
  pipe1_question_embeddings:
    priority: 1.1
    type: SentenceEmbeddings
    embeddings_size: *question_encoder_embeddings_size_val
    pretrained_embeddings_file: *question_encoder_embeddings
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions
    globals:
      embeddings_size: pipe1_embeddings_size

  # Model 2: question RNN
  pipe1_lstm:
    priority: 1.2
    type: RecurrentNeuralNetwork
    cell_type: GRU
    hidden_size: *question_encoder_lstm_size_val
    prediction_mode: Dense
    initial_state: Trainable
    use_logsoftmax: False
    output_last_state: False
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: pipe1_embeddings_size
      prediction_size: question_encoder_output_size
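    # With prediction_mode: Dense and output_last_state: False the GRU is assumed to emit a 200-dim activation
    # for every question token (no log-softmax), which is what the self-attention component below expects.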


  # Model 3: self attention for question.
  question_attention:
    priority: 1.3
    type: SelfAttention
    latent_size: *question_encoder_lstm_size_val
    num_attention_heads: *question_attention_multihead_size_val
    streams:
      question_encodings: question_activations
      outputs: question_attention_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: question_attention_output_size

  ################# PIPE 2: SHARED IMAGE ENCODER #################

  # Image encoder.
  image_encoder:
    priority: 2.1
    type: GenericImageEncoder
    model_type: *image_encoder_model
    return_feature_maps: True
    streams:
      inputs: images
      outputs: feature_maps
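    # return_feature_maps: True keeps the spatial convolutional feature maps instead of a pooled vector,
    # so the question-driven attention below can weight individual image regions.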


  image_attention:
    priority: 2.2
    type: QuestionDrivenAttention
    dropout_rate: 0.3
    latent_size: 256
    output_mode: 'Image'
    num_attention_heads: *image_attention_multihead_size_val
    streams:
      image_encodings: feature_maps
      question_encodings: question_attention_activations
      outputs: image_attention_activations
    globals:
      question_encoding_size: question_attention_output_size
      output_size: image_attention_output_size
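    # The attention scores each feature-map location against the 800-dim question representation; with 2 heads
    # the attended image contexts are presumably concatenated and their size is published as image_attention_output_size.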

  ################# PIPE 3: image-question fusion  #################
  # MFB.
  question_image_fusion:
    priority: 3.1
    type: *question_image_fusion_type
    dropout_rate: 0.5
    latent_size: *question_image_fusion_size_val
    pool_factor: 2
    streams:
      image_encodings: image_attention_activations
      question_encodings: question_attention_activations
      outputs: fused_activations
    globals:
      image_encoding_size: image_attention_output_size
      question_encoding_size: question_attention_output_size
      # output_size: image_attention_output_size #fused_activation_size
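    # MFB sketch (assumed behaviour): both inputs are projected to latent_size * pool_factor = 1024 dims,
    # multiplied element-wise and sum-pooled with window pool_factor = 2, giving a 512-dim fused vector
    # that matches the fused_activation_size published in PIPE 0.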

  ################# PIPE 4: SHARED IMAGE SIZE ENCODER #################

  # Model - image size classifier.
  image_size_encoder:
    priority: 4.1
    type: FeedForwardNetwork
    use_logsoftmax: False
    streams:
      inputs: image_sizes
      predictions: image_size_activations
    globals:
      input_size: image_size_encoder_input_size
      prediction_size: image_size_encoder_output_size
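    # Maps the 2-dim image-size stream to a 10-dim embedding (image_size_encoder_output_size) so it can be
    # concatenated with the 512-dim fused activations in the next pipe.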

  ################# PIPE 5: image-question-image size fusion #################

  # 5th subpipeline: concatenation
  concat:
    priority: 5.1
    type: ConcatenateTensor
    input_streams: [fused_activations,image_size_activations]
    # ConcatenateTensor
    dim: 1 # default
    input_dims: [[-1,*question_image_fusion_size_val],[-1,*image_size_encoder_output_size_val]]
    output_dims: [-1,*question_image_size_fusion_size_val]
    streams:
      outputs: concatenated_activations
    globals:
      output_size: concatenated_activations_size
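    # Concatenation along dim 1 yields 512 + 10 = 522-dim activations; concatenated_activations_size is exported
    # to globals and consumed as input_size by the answer classifier in PIPE 6.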

  ################# PIPE 6: C1 + C2 + C3 questions #################

  # Model 4: FFN C123 answering
  pipe6_c123_answer_classifier:
    priority: 6.3
    type: FeedForwardNetwork
    hidden: *answer_classifier_hidden_sizes_val
    dropout_rate: 0.5
    streams:
      inputs: concatenated_activations
      predictions: pipe6_c123_predictions
    globals:
      input_size: concatenated_activations_size
      prediction_size: vocabulary_size_c1234_binary_yn
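    # FFN: 522-dim input -> single 100-unit hidden layer (dropout 0.5) -> scores over the full answer vocabulary;
    # with use_logsoftmax left at its default, the outputs are assumed to be log-probabilities, as NLLLoss below expects.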

  pipe6_c123_nllloss:
    priority: 6.4
    type: NLLLoss
    targets_dim: 1
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
      loss: pipe6_c123_loss
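    # Targets are the answer ids produced by pipe1_all_answer_indexer in PIPE 0.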

  pipe6_c123_precision_recall:
    priority: 6.5
    type: PrecisionRecallStatistics
    use_word_mappings: True
    show_class_scores: True
    #show_confusion_matrix: True
    streams:
      predictions: pipe6_c123_predictions
      targets: answers_ids
    globals:
      word_mappings: word_mappings_c1234_binary_yn
    statistics:
      precision: pipe6_c123_precision
      recall: pipe6_c123_recall
      f1score: pipe6_c123_f1score

  # C123 Predictions decoder.
  pipe5_c123_prediction_decoder:
    priority: 6.6
    type: WordDecoder
    # Use the same word mappings as label indexer.
    import_word_mappings_from_globals: True
    streams:
      inputs: pipe6_c123_predictions
      outputs: predicted_answers
    globals:
      word_mappings: word_mappings_c1234_binary_yn

  ################# PIPE 7: MERGE ANSWERS #################

  # Viewers.
  viewer:
    priority: 7.3
    type: StreamViewer
    input_streams:
      tokenized_questions,
      category_names, pipe0_predicted_question_categories_names,
      answers, predicted_answers