IBM/pytorchpipe
configs/vqa_med_2019/c2_classification/c2_class_lstm_resnet50_coattn_mfb_cat_is.yml
# Load config defining tasks for training, validation and testing.
default_configs: vqa_med_2019/c2_classification/default_c2_classification.yml

training:
  task:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # The task returns tokenized questions.
      questions: tokenized_questions

validation:
  task:
    batch_size: 64
    # Apply all preprocessing/data augmentations.
    question_preprocessing: lowercase,remove_punctuation,tokenize
    streams:
      # The task returns tokenized questions.
      questions: tokenized_questions
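# Note: both splits return questions on the tokenized_questions stream, which
# the question_embeddings component in the pipeline below consumes.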


pipeline:

  global_publisher:
    priority: 0
    type: GlobalVariablePublisher
    # Add input_size to globals.
    keys: [question_encoder_output_size, question_attention_activation_size, image_attention_activation_size, pooling_activation_size]
    values: [200, 800, 4096, 512]
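    # Note: these globals are consumed by the components below: 200 is the
    # question encoder output size, 800 the question self-attention output,
    # 4096 the image attention output, and 512 the fused (MFB) output size.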

  ################# PIPE 1: question #################

  # Model 1: Embeddings
  question_embeddings:
    priority: 1.2
    type: SentenceEmbeddings
    embeddings_size: 100
    pretrained_embeddings_file: glove.6B.100d.txt
    data_folder: ~/data/vqa-med
    word_mappings_file: questions.all.word.mappings.csv
    streams:
      inputs: tokenized_questions
      outputs: embedded_questions
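    # Note: each question token is mapped to a 100-d GloVe vector (glove.6B.100d),
    # so embedded_questions should hold tensors of shape [batch, seq_len, 100].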

  # Model 2: RNN
  question_lstm:
    priority: 1.3
    type: RecurrentNeuralNetwork
    cell_type: GRU
    prediction_mode: Dense
    use_logsoftmax: False
    output_last_state: False
    initial_state: Trainable
    dropout_rate: 0.1
    hidden_size: 128
    streams:
      inputs: embedded_questions
      predictions: question_activations
    globals:
      input_size: embeddings_size
      prediction_size: question_encoder_output_size
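    # Note: with prediction_mode Dense and use_logsoftmax False, the GRU is
    # expected to return a raw 200-d activation per token
    # (question_encoder_output_size) rather than a single sentence vector.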

  # Self Attention for question.
  question_attention:
    priority: 1.4
    type: SelfAttention
    latent_size: 128
    num_attention_heads: 4
    streams:
      question_encodings: question_activations
      outputs: question_attention_activations
    globals:
      question_encoding_size: question_encoder_output_size
      output_size: question_attention_activation_size
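    # Note: the 800-d output (question_attention_activation_size) presumably
    # comes from concatenating the 4 attention heads over the 200-d token
    # activations (4 x 200 = 800).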

  ################# PIPE 2: image #################
  # Image encoder.
  image_encoder:
    priority: 2.1
    type: GenericImageEncoder
    model_type: resnet50
    return_feature_maps: True
    streams:
      inputs: images
      outputs: feature_maps
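    # Note: with return_feature_maps True the encoder outputs the final
    # ResNet-50 convolutional maps (2048 channels; e.g. 7x7 spatial for
    # 224x224 inputs) rather than a single pooled image vector.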


  image_attention:
    priority: 2.2
    type: QuestionDrivenAttention
    dropout_rate: 0.3
    latent_size: 1024
    output_mode: 'Image'
    num_attention_heads: 2
    streams:
      image_encodings: feature_maps
      question_encodings: question_attention_activations
      outputs: image_attention_activations
    globals:
      question_encoding_size: question_attention_activation_size
      output_size: image_attention_activation_size
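    # Note: attention over feature-map locations is driven by the attended
    # question encoding; the 4096-d output (image_attention_activation_size)
    # presumably concatenates the 2 heads' 2048-d attended image vectors.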

  ################# PIPE 3: image-question fusion  #################
  # MFB
  question_image_fusion:
    priority: 3.1
    type: FactorizedBilinearPooling
    dropout_rate: 0.3
    latent_size: 512
    pool_factor: 2
    streams:
      image_encodings: image_attention_activations
      question_encodings: question_attention_activations
      outputs: pooling_activations
    globals:
      image_encoding_size: image_attention_activation_size
      question_encoding_size: question_attention_activation_size
      output_size: pooling_activation_size
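    # Note: in the standard MFB scheme both inputs are presumably projected to
    # latent_size * pool_factor = 1024, multiplied element-wise, and sum-pooled
    # with window 2, giving the 512-d pooling_activations.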

  classifier:
    priority: 4.1
    type: FeedForwardNetwork
    hidden_sizes: [100]
    dropout_rate: 0.2
    streams:
      inputs: pooling_activations
    globals:
      input_size: pooling_activation_size
      prediction_size: vocabulary_size_c2
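    # Note: a small MLP (one 100-unit hidden layer, dropout 0.2) maps the 512-d
    # fused representation to logits over the C2 answer vocabulary
    # (vocabulary_size_c2, presumably defined by the default C2 config above).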


  #: pipeline