# Repository: IBM/pytorchpipe (GitHub)
# File: configs/default/components/models/language/sentence_embeddings.yml
# This file defines the default values for the Sentence Embeddings.

####################################################################
# 1. CONFIGURATION PARAMETERS that will be LOADED by the component.
####################################################################

# Folder where the task will store data (LOADED)
data_folder: '~/data/'

# Source files that will be used to create the vocabulary (LOADED)
# Empty by default.
source_vocabulary_files: ''

# Additional tokens that will be added to the vocabulary (LOADED)
# This list can be extended, but <PAD> and <EOS> are special tokens.
# <PAD> is ALWAYS used for padding shorter sequences.
additional_tokens: '<PAD>'

# Enable the <EOS> (end of sequence) token (LOADED)
eos_token: False

# Flag informing whether the index of the <PAD> token will be exported
# to globals, under the `pad_index` keymapping defined below (LOADED)
export_pad_index_to_globals: False

# File containing word mappings (LOADED)
word_mappings_file: 'word_mappings.csv'

# If set, component will always (re)generate the vocabulary (LOADED)
regenerate: False 

# Flag informing whether word mappings will be imported from globals (LOADED)
import_word_mappings_from_globals: False

# Flag informing whether word mappings will be exported to globals (LOADED)
export_word_mappings_to_globals: False

# Fixed padding length (LOADED)
# -1  -> For each batch, automatically pad to the length of the longest
#        sequence in the batch (variable from batch to batch)
# > 0 -> Pad each sequence to the chosen length (fixed for all batches)
fixed_padding: -1

# File containing pretrained embeddings (LOADED)
# Empty means that no embeddings will be loaded.
# Options:
# '' | glove.6B.50d.txt | glove.6B.100d.txt | glove.6B.200d.txt | glove.6B.300d.txt |
# glove.42B.300d.txt | glove.840B.300d.txt | glove.twitter.27B.txt | mimic.fastText.no_clean.300d.pickled
pretrained_embeddings_file: ''

streams: 
  ####################################################################
  # 2. Keymappings associated with INPUT and OUTPUT streams.
  ####################################################################
  # NOTE(review): each entry presumably maps the name used inside the
  # component (key) to the name of the stream in the pipeline (value);
  # confirm against the component implementation.

  # Stream containing a batch of inputs (INPUT)
  inputs: inputs

  # Stream containing the predictions produced by the component (OUTPUT)
  predictions: predictions

globals:
  ####################################################################
  # 3. Keymappings of variables that will be RETRIEVED from GLOBALS.
  ####################################################################
  # No retrieve-only variables are listed here; entries marked
  # RETRIEVED/SET appear under section 4.

  ####################################################################
  # 4. Keymappings associated with GLOBAL variables that will be SET.
  ####################################################################

  # The loaded/exported word mappings (RETRIEVED/SET)
  # This depends on the `import_word_mappings_from_globals` and
  # `export_word_mappings_to_globals` configuration flags above.
  word_mappings: word_mappings

  # Size of the vocabulary (RETRIEVED/SET)
  # This depends on the import/export configuration flags above.
  vocabulary_size: vocabulary_size

  # Size of the embeddings (SET)
  # It is exported to globals, so other components can use it during
  # their initialization.
  embeddings_size: embeddings_size

  # Index of the <PAD> token (SET)
  # Will be set only if `export_pad_index_to_globals == True`
  pad_index: pad_index

  ####################################################################
  # 5. Keymappings associated with statistics that will be ADDED.
  ####################################################################