if hidden_size % self._num_heads != 0:
        # Reject an input size that cannot be split evenly across the
        # configured number of attention heads; a non-zero remainder means
        # the heads would not all cover the same width.
        # NOTE(review): presumably the per-head size is derived as
        # hidden_size // num_heads further on -- confirm in the enclosing
        # method.
        raise ValueError(
            "The input size (%d) is not a multiple of the number of "
            "attention heads (%d)" % (hidden_size, self._num_heads)
        )