# kiwi/systems/encoders/xlm.py
#  OpenKiwi: Open-Source Machine Translation Quality Estimation
#  Copyright (C) 2020 Unbabel <openkiwi@unbabel.com>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
import logging
from collections import Counter, OrderedDict
from pathlib import Path
from typing import Dict, Union

import torch
from more_itertools import one
from pydantic import validator
from torch import Tensor, nn
from transformers import (
    XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
    AutoTokenizer,
    XLMConfig,
    XLMModel,
)
from transformers.tokenization_xlm import lowercase_and_remove_accent

from kiwi import constants as const
from kiwi.data.encoders.field_encoders import TextEncoder
from kiwi.data.vocabulary import Vocabulary
from kiwi.systems._meta_module import MetaModule
from kiwi.utils.data_structures import DefaultFrozenDict
from kiwi.utils.io import BaseConfig
from kiwi.utils.tensors import pieces_to_tokens, retrieve_tokens_mask, select_positions

logger = logging.getLogger(__name__)


class XLMTextEncoder(TextEncoder):
    def __init__(self, tokenizer_name):
        tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_name))
        self.xlm_tokenizer = tokenizer

        def subtokenize(token):
            """Split a token into pieces.

            This is essentially the same as ``xlm_tokenizer._tokenize()`` but without
            tokenizing a sentence (according to given language), because QE data is
            already tokenized.
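
            Example (illustrative; the exact pieces depend on the pretrained BPE
            merges)::

                subtokenize('tokenization')  # e.g. ['token', 'ization</w>']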
            """
            cleaned_token = lowercase_and_remove_accent([token])
            cleaned_token = one(cleaned_token)  # Must have a single element

            if cleaned_token:
                split_token = self.xlm_tokenizer.bpe(token).split(' ')
            else:
                logger.warning(
                    f'XLM tokenization for token "{token}" returned "{cleaned_token}"; '
                    f'replacing with "."'
                )
                split_token = ['.']

            return split_token

        super().__init__(
            subtokenize=subtokenize,
            pad_token=tokenizer.pad_token,
            unk_token=tokenizer.unk_token,
            bos_token=tokenizer.cls_token,  # we don't use xlm.bos_token on purpose
            eos_token=tokenizer.sep_token,
            specials_first=True,
            # extra options from fields?
            include_lengths=True,
            include_bounds=True,
        )

        self.vocab = Vocabulary(
            counter=Counter(),
            unk_token=self.unk_token,
            pad_token=self.pad_token,
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            specials_first=self.specials_first,
        )
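        # Reuse the pretrained tokenizer's BPE vocabulary directly: stoi falls back
        # to the unk id for pieces missing from the encoder, and itos mirrors the
        # tokenizer's decoder.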
        self.vocab.stoi = DefaultFrozenDict(
            tokenizer.encoder, tokenizer.encoder[self.unk_token]
        )
        self.vocab.itos = tokenizer.decoder

    def fit_vocab(
        self,
        samples,
        vocab_size=None,
        vocab_min_freq=0,
        embeddings_name=None,
        keep_rare_words_with_embeddings=False,
        add_embeddings_vocab=False,
    ):
        logger.info(
            'Vocabulary already defined for XLM field; not fitting it to data now'
        )
        if vocab_size:
            self.vocab.max_size(vocab_size)


@MetaModule.register_subclass
class XLMEncoder(MetaModule):
    """XLM model using Hugging Face's transformers library.

    The following command was used to fine-tune XLM on the in-domain data (retrieved
    from .pth file)::

        python train.py --exp_name tlm_clm --dump_path './dumped/' \
            --data_path '/mnt/shared/datasets/kiwi/parallel/en_de_indomain' \
            --lgs 'ar-bg-de-el-en-es-fr-hi-ru-sw-th-tr-ur-vi-zh' \
            --clm_steps 'en-de,de-en' --mlm_steps 'en-de,de-en' \
            --reload_model 'models/mlm_tlm_xnli15_1024.pth' --encoder_only True \
            --emb_dim 1024 --n_layers 12 --n_heads 8 --dropout '0.1' \
            --attention_dropout '0.1' --gelu_activation true --batch_size 32 \
            --bptt 256 --optimizer \
            'adam_inverse_sqrt,beta1=0.9,beta2=0.98,lr=0.0001,weight_decay=0' \
            --epoch_size 200000 --validation_metrics _valid_mlm_ppl --max_vocab 95000 \
            --tokens_per_batch 1200 --exp_id "5114"

    Old version was converted using hf-transformers util method::

        convert_xlm_checkpoint_to_pytorch(
            self.config.model_name / 'indomain.pth',
            self.config.model_name / 'finetuned_wmt_en-de'
        )

    Old settings in QE (not actually used for the best run and submission):

    .. code-block:: yaml

        fb-causal-lambda: 0.0
        fb-keep-prob: 0.1
        fb-mask-prob: 0.8
        fb-model: data/trained_models/fb_pretrain/xnli/indomain.pth
        fb-pred-prob: 0.15
        fb-rand-prob: 0.1
        fb-src-lang: en
        fb-tgt-lang: de
        fb-tlm-lambda: 0.0
        fb-vocab: data/trained_models/fb_pretrain/xnli/vocab_xnli_15.txt

    """

    class Config(BaseConfig):
        model_name: Union[str, Path] = 'xlm-mlm-tlm-xnli15-1024'
        """Pre-trained XLM model to use."""

        source_language: str = 'en'
        target_language: str = 'de'

        use_mismatch_features: bool = False
        """Use Alibaba's mismatch features."""

        use_predictor_features: bool = False
        """Use features originally proposed in the Predictor model."""

        interleave_input: bool = False
        """Concatenate SOURCE and TARGET without internal padding
        (111222000 instead of 111002220)"""

        freeze: bool = False
        """Freeze XLM during training."""

        use_mlp: bool = True
        """Apply a linear layer on top of XLM."""

        hidden_size: int = 100
        """Size of the linear layer on top of XLM."""

        @validator('model_name', pre=True)
        def fix_relative_path(cls, v):
            if v not in XLM_PRETRAINED_MODEL_ARCHIVE_LIST:
                v = Path(v)
                if not v.is_absolute():
                    v = Path.cwd().joinpath(v)
            return v

        @validator('use_mismatch_features', 'use_predictor_features', pre=True)
        def no_implementation(cls, v):
            if v:
                raise NotImplementedError('Not yet implemented')
            return False

    def __init__(
        self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True
    ):
        super().__init__(config=config)

        if pre_load_model:
            self.xlm = XLMModel.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
        else:
            xlm_config = XLMConfig.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
            self.xlm = XLMModel(xlm_config)

        self.source_lang_id = self.xlm.config.lang2id.get(self.config.source_language)
        self.target_lang_id = self.xlm.config.lang2id.get(self.config.target_language)

        if None in (self.source_lang_id, self.target_lang_id):
            raise ValueError(
                f'Invalid language for the XLM model.'
                f' Valid languages are: {list(self.xlm.config.lang2id.keys())}'
            )

        self.mlp = None
        if self.config.use_mlp:
            self.mlp = nn.Sequential(
                nn.Linear(self.xlm.config.hidden_size, self.config.hidden_size),
                nn.Tanh(),
            )
            output_size = self.config.hidden_size
        else:
            output_size = self.xlm.config.hidden_size

        self._sizes = {
            const.TARGET: output_size,
            const.TARGET_LOGITS: output_size,
            const.TARGET_SENTENCE: 2 * output_size,
            const.SOURCE: output_size,
            const.SOURCE_LOGITS: output_size,
        }

        self.vocabs = {
            const.TARGET: vocabs[const.TARGET],
            const.SOURCE: vocabs[const.SOURCE],
        }

        self.output_embeddings = self.xlm.embeddings

        if self.config.freeze:
            for param in self.xlm.parameters():
                param.requires_grad = False

    def load_state_dict(
        self,
        state_dict: Dict[str, Tensor],
        strict: bool = True,
    ):
        try:
            keys = super().load_state_dict(state_dict, strict)
        except RuntimeError as e:
            if "position_ids" in str(e):
                # FIXME: hack to get around Transformers 3.1 breaking changes
                # https://github.com/huggingface/transformers/issues/6882
                self.xlm.embeddings._non_persistent_buffers_set.add('position_ids')
                keys = super().load_state_dict(state_dict, strict)
                self.xlm.embeddings._non_persistent_buffers_set.discard('position_ids')
            else:
                raise e
        return keys

    @classmethod
    def input_data_encoders(cls, config: Config):
        return {
            const.SOURCE: XLMTextEncoder(tokenizer_name=config.model_name),
            const.TARGET: XLMTextEncoder(tokenizer_name=config.model_name),
        }

    def size(self, field=None):
        if field:
            return self._sizes[field]
        return self._sizes

    def forward(
        self,
        batch_inputs,
        *args,
        include_target_logits=False,
        include_source_logits=False,
    ):
        # XLM takes its input either as a concatenation of the source and target
        # sequences or as an interleaving of the two
        if self.config.interleave_input:
            merge_input_fn = self.interleave_input
        else:
            merge_input_fn = self.concat_input

        input_ids, _, attention_mask, position_ids, lang_ids = merge_input_fn(
            batch_a=batch_inputs[const.SOURCE],
            batch_b=batch_inputs[const.TARGET],
            pad_id=self.vocabs[const.TARGET].pad_id,
            lang_a=self.source_lang_id,
            lang_b=self.target_lang_id,
        )

        # encoded_layers also includes the embedding layer
        # encoded_layers[-1] is the last layer
        last_layer, encoded_layers = self.xlm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=lang_ids,
            position_ids=position_ids,
        )

        # TODO: select one of these strategies via cli
        # TODO: get a BETTER strategy
        # features = sum(encoded_layers[-5:-1])
        # features = encoded_layers[-2]
        features = last_layer

        if self.config.use_mlp:
            features = self.mlp(features)

        # Build the feature dictionary to be returned to the system
        output_features = self.split_outputs(
            features,
            batch_inputs,
            interleaved=self.config.interleave_input,
            label_a=const.SOURCE,
            label_b=const.TARGET,
        )

        # Convert pieces to tokens
        output_features[const.TARGET] = pieces_to_tokens(
            output_features[const.TARGET], batch_inputs[const.TARGET]
        )
        output_features[const.SOURCE] = pieces_to_tokens(
            output_features[const.SOURCE], batch_inputs[const.SOURCE]
        )
        source_len = batch_inputs[const.SOURCE].bounds_lengths
        target_len = batch_inputs[const.TARGET].bounds_lengths

        # NOTE: assuming here that features is already split into target and source
        source_features = output_features[const.SOURCE]
        target_features = output_features[const.TARGET]

        # Sentence-level features
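        # Each side's sentence vector is the sum of the features at position 0 (the
        # opening boundary token) and at the last real token; the target and source
        # vectors are then concatenated, matching the 2 * output_size reported in
        # self._sizes for TARGET_SENTENCE.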
        sentence_target_features = target_features[:, 0].unsqueeze(
            1
        ) + select_positions(target_features, (target_len - 1).unsqueeze(1))
        sentence_source_features = source_features[:, 0].unsqueeze(
            1
        ) + select_positions(source_features, (source_len - 1).unsqueeze(1))
        sentence_features = torch.cat(
            (sentence_target_features, sentence_source_features), dim=-1
        )

        output_features[const.TARGET_SENTENCE] = sentence_features
        output_features[const.TARGET] = target_features
        output_features[const.SOURCE] = source_features

        # Logits for multi-task fine-tuning
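        # The einsum 'vh,bsh->bsv' projects the (batch, seq, hidden) features onto
        # the XLM input embedding matrix (vocab, hidden), i.e. weight-tied output
        # logits; this assumes the feature size matches the embedding size
        # (use_mlp=False, or hidden_size equal to the XLM embedding dimension).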
        if include_target_logits:
            output_features[const.TARGET_LOGITS] = torch.einsum(
                'vh,bsh->bsv',
                self.output_embeddings.weight,
                output_features[const.TARGET],
            )
        if include_source_logits:
            output_features[const.SOURCE_LOGITS] = torch.einsum(
                'vh,bsh->bsv',
                self.output_embeddings.weight,
                output_features[const.SOURCE],
            )

        # Additional features
        if self.config.use_mismatch_features:
            raise NotImplementedError

        return output_features

    @staticmethod
    def concat_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None):
        """Concatenate tensors of two batches into one tensor.

        Return:
            the concatenated ids, a type mask (``batch_a`` as zeroes and ``batch_b``
                as ones), the concatenated attention_mask, restarted position_ids,
                and, if both ``lang_a`` and ``lang_b`` are given, the language ids.
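
        Example (shapes only, illustrative)::

            ids_a: (bs, len_a), ids_b: (bs, len_b)
            -> input_ids:      (bs, len_a + len_b)
               token_type_ids: zeros over the ``batch_a`` block, ones over ``batch_b``
               position_ids:   restart from 0 at the first ``batch_b`` position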
        """
        ids_a = batch_a.tensor
        ids_b = batch_b.tensor
        attention_mask_a = retrieve_tokens_mask(batch_a)
        attention_mask_b = retrieve_tokens_mask(batch_b)
        types_a = torch.zeros_like(ids_a)
        types_b = torch.ones_like(ids_b)
        position_ids_a = torch.arange(
            ids_a.size(1), dtype=torch.long, device=ids_a.device
        )
        position_ids_a = position_ids_a.unsqueeze(0).expand(ids_a.size())
        position_ids_b = torch.arange(
            ids_b.size(1), dtype=torch.long, device=ids_b.device
        )
        position_ids_b = position_ids_b.unsqueeze(0).expand(ids_b.size())

        input_ids = torch.cat((ids_a, ids_b), dim=1)
        token_type_ids = torch.cat((types_a, types_b), dim=1)
        attention_mask = torch.cat((attention_mask_a, attention_mask_b), dim=1)
        position_ids = torch.cat((position_ids_a, position_ids_b), dim=1)

        if lang_a is not None and lang_b is not None:
            lang_id_a = torch.ones_like(ids_a) * lang_a
            lang_id_b = torch.ones_like(ids_b) * lang_b
            lang_ids = torch.cat((lang_id_a, lang_id_b), dim=1)
            # lang_ids *= attention_mask.unsqueeze(-1).to(lang_ids.dtype)
            lang_ids *= attention_mask.to(lang_ids.dtype)

            return input_ids, token_type_ids, attention_mask, position_ids, lang_ids

        return input_ids, token_type_ids, attention_mask, position_ids

    @staticmethod
    def interleave_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None):
        """Interleave the source + target embeddings into one tensor.

        This means building the input as ``[batch, target [SEP] source]``.

        Return:
            the interleaved token ids, a type mask of target (as zeroes) and source
                (as ones), and the concatenation of the attention masks.
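
        Example (per example ``i``, illustrative)::

            input_ids[i]      = [target ids (len_b), source ids (len_a), padding]
            token_type_ids[i] = [0] * len_b + [1] * len_a + [0] * n_pad
            attention_mask[i] = [1] * (len_b + len_a) + [0] * n_pad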
        """
        ids_a = batch_a.tensor
        ids_b = batch_b.tensor

        batch_size = ids_a.size(0)

        lengths_a = batch_a.lengths
        lengths_b = batch_b.lengths

        # max_pair_length = ids_a.size(1) + ids_b.size(1)
        max_pair_length = int((lengths_a + lengths_b).max())  # longest pair in batch

        input_ids = torch.full(
            (batch_size, max_pair_length),
            pad_id,
            dtype=ids_a.dtype,
            device=ids_a.device,
        )
        token_type_ids = torch.zeros_like(input_ids)
        attention_mask = torch.zeros_like(input_ids)

        for i in range(batch_size):
            # <s> and </s> are included in the mask (=1)
            len_a = lengths_a[i].item()
            len_b = lengths_b[i].item()

            input_ids[i, :len_b] = ids_b[i, :len_b]
            token_type_ids[i, :len_b] = 0
            attention_mask[i, :len_b] = 1

            input_ids[i, len_b : len_b + len_a] = ids_a[i, :len_a]
            token_type_ids[i, len_b : len_b + len_a] = 1
            attention_mask[i, len_b : len_b + len_a] = 1

        # TODO: why is the attention mask 1 for all positions?
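        # FIXME: ``forward`` unpacks five values from ``merge_input_fn`` (including
        # position_ids and lang_ids), but this method returns only three, so the
        # interleaved path cannot currently run end to end.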
        return input_ids, token_type_ids, attention_mask

    @staticmethod
    def split_outputs(
        features: torch.Tensor,
        batch_inputs,
        interleaved: bool = False,
        label_a: str = const.SOURCE,
        label_b: str = const.TARGET,
    ):
        """Split contexts to get tag_side outputs.

        Arguments:
            features (tensor): XLM output: <s> source </s> </s> target </s>
                Shape of (bs, 1 + source_len + 2 + target_len + 1, hidden_size)
            batch_inputs: the batch fields, used to retrieve lengths (and bounds).
            interleaved (bool): whether the concat strategy was 'interleaved'.
            label_a: dictionary key for sequence A in ``features``.
            label_b: dictionary key for sequence B in ``features``.

        Return:
            dict of tensors, one per tag side.
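
        Example (non-interleaved case, shapes only)::

            features: (bs, max_len_a + max_len_b, hidden)
            -> {label_a: features[:, :max_len_a],
                label_b: features[:, max_len_a:]}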
        """
        outputs = OrderedDict()

        if interleaved:
            raise NotImplementedError('interleaving not supported.')
            # TODO: fix code below to use the lengths information and not bounds
            # if interleaved, shift each source sample by its correspondent length
            lengths_a = batch_inputs[const.TARGET].lengths
            shift = lengths_a.unsqueeze(-1)

            range_vector = torch.arange(
                features.size(0), device=features.device
            ).unsqueeze(1)

            target_bounds = batch_inputs[const.TARGET].bounds
            features_a = features[range_vector, target_bounds]
            # Shift bounds by target length and preserve padding
            source_bounds = batch_inputs[const.SOURCE].bounds
            m = (source_bounds != -1).long()  # for masking out padding (which is -1)
            shifted_bounds = (source_bounds + shift) * m + source_bounds * (1 - m)
            features_b = features[range_vector, shifted_bounds]
        else:
            # otherwise, shift all by max_length
            lengths_a = batch_inputs[label_a].lengths
            # if we'd like to maintain the word pieces we merely select all
            features_a = features[:, : lengths_a.max()]
            features_b = features[:, lengths_a.max() :]

        outputs[label_a] = features_a
        outputs[label_b] = features_b

        return outputs
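

if __name__ == '__main__':
    # Minimal usage sketch (assumption: the 'xlm-mlm-tlm-xnli15-1024' tokenizer and
    # config can be resolved by transformers; with pre_load_model=False the XLM
    # weights are randomly initialized instead of downloaded).
    sketch_config = XLMEncoder.Config(model_name='xlm-mlm-tlm-xnli15-1024')
    data_encoders = XLMEncoder.input_data_encoders(sketch_config)
    sketch_vocabs = {name: encoder.vocab for name, encoder in data_encoders.items()}
    xlm_encoder = XLMEncoder(
        vocabs=sketch_vocabs, config=sketch_config, pre_load_model=False
    )
    print(xlm_encoder.size())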