kiwi/assets/config/train.yaml
run:
    ###################################################
    # Generic configuration options related to
    # the handling of experiments
    experiment_name: BERT WMT20 EN-DE
    seed: 42
    use_mlflow: false

trainer:
    ###################################################
    # Generic options related to the training process
    # that apply to all models
    deterministic: true
    gpus: -1
    epochs: 10

    main_metric:
        - WMT19_MCC
        - PEARSON

    gradient_max_norm: 1.
    gradient_accumulation_steps: 1

    # Control the model precision, see
    # https://pytorch-lightning.readthedocs.io/en/stable/amp.html
    # for more info on the configuration options.
    # Fast mixed precision by default:
    amp_level: O2
    precision: 16
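    # For full 32-bit precision instead, set `precision: 32`
    # (amp_level should then have no effect).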
    log_interval: 100
    checkpoint:
        validation_steps: 0.2
        early_stop_patience: 10
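        # With a fractional value, validation presumably follows
        # PyTorch Lightning's val_check_interval semantics: 0.2 runs
        # validation 5 times per epoch. Training then stops early after
        # 10 validations without improvement in the main metric.
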
defaults:
    ###################################################
    # Example of configuration-file composition:
    # the data config below is sourced from
    # /config/data/wmt20.qe.en_de.yaml
    - data: wmt20.qe.en_de
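    # A sketch of the shape such a data config typically has
    # (illustrative keys and paths only, not the real contents):
    #
    #   data:
    #       train:
    #           input:
    #               source: path/to/train.src
    #               target: path/to/train.mt
    #           output:
    #               target_tags: path/to/train.tags
    #               sentence_scores: path/to/train.hter
    #       valid:
    #           ...
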
system:
    ###################################################
    # System configs are responsible for all the
    # system-specific configuration, from model settings
    # to optimizers and data-processing options.
    # All configs must have either `class_name` or `load`
    class_name: Bert

    batch_size: 8
    num_data_workers: 4

    model:
        ################################################
        # Modeling options. These can change a lot about
        # the architecture of the system, with many
        # configuration options adding (or removing) layers.
        encoder:
            model_name: bert-base-multilingual-cased
            use_mlp: false
            freeze: false
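            # Assumption, not part of the original recipe: model_name
            # should accept other BERT-family checkpoints from the
            # Hugging Face hub (e.g. bert-base-cased for a purely
            # monolingual setup).
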
        decoder:
            hidden_size: 768
            bottleneck_size: 768
            dropout: 0.1

        outputs:
            ####################################################
            # Output options configure the downstream tasks the
            # model will be trained on by adding specific layers
            # responsible for transforming decoder features into
            # predictions.
            word_level:
                target: true
                gaps: true
                source: true
                class_weights:
                    target_tags:
                        BAD: 3.0
                    gap_tags:
                        BAD: 5.0
                    source_tags:
                        BAD: 3.0
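                # Word-level QE tags are heavily skewed towards OK, so
                # the rarer BAD class is up-weighted in the tag loss;
                # gap tags are the rarest, hence the larger weight.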
            sentence_level:
                hter: true
                use_distribution: false
                binary: false
            n_layers_output: 2
            sentence_loss_weight: 1

        tlm_outputs:
            fine_tune: false

    optimizer:
        class_name: adamw
        learning_rate: 1e-5
        warmup_steps: 0.1
        training_steps: 12000
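        # Assumption: a warmup_steps value below 1 is read as a
        # fraction of training_steps, i.e. 0.1 * 12000 = 1200
        # warm-up steps here.
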
    data_processing:
        share_input_fields_encoders: true
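
# With OpenKiwi installed, a training run is typically launched from
# this file on the command line; the exact invocation varies between
# versions (check `kiwi --help`), e.g.:
#
#   kiwi train train.yaml
#
# Recent versions also expose a Python entry point (assumed name;
# verify against your installed version):
#
#   from kiwi.lib.train import train_from_file
#   train_from_file('train.yaml')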