# tensorflow/models: official/nlp/data/create_xlnet_pretraining_data.py
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Create LM TF examples for XLNet."""

import dataclasses
import json
import math
import os

import random
from typing import Iterable, Mapping, List, Optional, Tuple
import unicodedata

# Import libraries

from absl import app
from absl import flags
from absl import logging

import numpy as np
import tensorflow as tf, tf_keras

from official.nlp.tools import tokenization

special_symbols = {
    "<unk>": 0,
    "<s>": 1,
    "</s>": 2,
    "<cls>": 3,
    "<sep>": 4,
    "<pad>": 5,
    "<mask>": 6,
    "<eod>": 7,
    "<eop>": 8,
}
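
# Special-symbol IDs assumed by this script; the mapping mirrors the original
# XLNet SentencePiece preprocessing convention. <eod> marks document
# boundaries (emitted for blank lines below).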

FLAGS = flags.FLAGS

flags.DEFINE_integer("seq_length", 512,
                     help="Sequence length.")
flags.DEFINE_integer("reuse_length", 256,
                     help="Number of token that can be reused as memory. "
                     "Could be half of `seq_len`.")
flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "save_dir", None,
    "Directory for saving processed data.")
flags.DEFINE_string("sp_model_file", "",
                    "The path to the model used by sentence piece tokenizer.")
flags.DEFINE_bool("use_eod_token", True,
                  "Whether or not to include EOD tokens.")
flags.DEFINE_bool("bi_data", True, "Whether or not to use bi-directional data.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
flags.DEFINE_integer("per_host_batch_size", 32, "Batch size per host.")
flags.DEFINE_integer("num_cores_per_host", 16,
                     "The number of (TPU) cores per host.")
flags.DEFINE_string("prefix", "", "Filename prefix.")
flags.DEFINE_string("suffix", "", "Filename suffix.")

flags.DEFINE_integer("task_id", None,
                     "The id of the current task.")
flags.DEFINE_integer("num_tasks", None,
                     "The total number of tasks.")
flags.DEFINE_integer("num_passes", 1, "The number of times to run the script.")


@dataclasses.dataclass
class TrainingInstance:
  """Representation of a single XLNet Pretraining instance."""
  data: Iterable[int]
  segment_ids: Iterable[int]
  boundary_indices: Iterable[int]
  label: int

  def to_feature(self) -> Mapping[str, tf.train.Feature]:
    feat = lambda x: tf.train.Feature(int64_list=tf.train.Int64List(value=x))
    return dict(
        input_word_ids=feat(self.data),
        input_type_ids=feat(self.segment_ids),
        boundary_indices=feat(self.boundary_indices),
        label=feat([self.label]))

  def to_example(self) -> tf.train.Example:
    return tf.train.Example(
        features=tf.train.Features(feature=self.to_feature()))

  def __str__(self):
    def seq_to_str(seq):
      return " ".join([str(x) for x in seq])

    s = ""
    s += "tokens: %s\n" % seq_to_str(self.data)
    s += "segment_ids: %s\n" % seq_to_str(self.segment_ids)
    s += "boundary_indices: %s\n" % seq_to_str(self.boundary_indices)
    s += "label: %s\n" % self.label
    s += "\n"
    return s

  def __repr__(self):
    return self.__str__()


def _preprocess_line(line: str, do_lower_case: bool = False) -> str:
  """Preprocesses an individual raw text line.

  This function will:
    - Remove extraneous spaces.
    - Replace `` with " and '' with ".
    - Strip accents (combining characters).
    - Apply lower casing if requested.

  Args:
    line: The input line to preprocess.
    do_lower_case: Whether or not to lower case the text.

  Returns:
    The preprocessed line.
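
  Example (illustrative):
    _preprocess_line("``Héllo   world''", do_lower_case=True) returns
    '"hello world"'.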

  """
  line = " ".join(line.split())
  line = line.replace("``", "\"").replace("''", "\"")

  # Strip accents by removing combining characters.
  line = unicodedata.normalize("NFKD", line)
  line = "".join([c for c in line if not unicodedata.combining(c)])

  if do_lower_case:
    line = line.lower()
  return line


def preprocess_and_tokenize_input_files(
    input_files: Iterable[str],
    tokenizer: tokenization.FullSentencePieceTokenizer,
    use_eod: bool = True,
    do_lower_case: bool = False,
    log_example_freq: int = 100000) -> List[Tuple[np.array, np.array]]:
  """Preprocesses and encodes raw text from input files.

  This function preprocesses raw text and encodes it into token IDs using a
  `SentencePieceModel` tokenizer. It also provides the sentence indicator for
  each token.

  Args:
    input_files: The list of input file names.
    tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
    use_eod: Whether or not to use an EOD indicator. If `False`, then EOD is
      not included.
    do_lower_case: Whether or not to apply lower casing during raw text
      preprocessing.
    log_example_freq: The number of lines to process between info log
      messages.

  Returns:
    The preprocessed list. Each entry in the list is a tuple consisting of
    the token IDs and the sentence IDs.

  """
  all_data = []
  eod_symbol = special_symbols["<eod>"]

  total_number_of_lines = 0

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    line_count = 0
    logging.info("Preprocessing %s", input_file)

    all_tokens = []
    all_sentence_ids = []

    sentence_id = True
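    # `sentence_id` alternates between True and False from one line to the
    # next; downstream code detects sentence boundaries by looking for changes
    # in this value.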

    with tf.io.gfile.GFile(input_file, "rb") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break

        line_count += 1
        if line_count % log_example_freq == 0:
          logging.info("Loading line %d", line_count)

        line = line.strip()

        if not line:
          if use_eod:
            token_ids = [eod_symbol]
            sentence_id = not sentence_id
          else:
            continue
        else:
          preprocessed_line = _preprocess_line(
              line=line, do_lower_case=do_lower_case)
          token_ids = tokenization.encode_ids(
              sp_model=tokenizer.sp_model, text=preprocessed_line)

        all_tokens.extend(token_ids)
        all_sentence_ids.extend([sentence_id] * len(token_ids))
        sentence_id = not sentence_id
      logging.info("Finished processing %s. Number of lines: %d",
                   input_file, line_count)
      if line_count == 0:
        continue
      total_number_of_lines += line_count
      all_tokens = np.array(all_tokens, dtype=np.int64)
      all_sentence_ids = np.array(all_sentence_ids, dtype=bool)
      all_data.append((all_tokens, all_sentence_ids))

  logging.info("Completed text preprocessing. Total number of lines: %d",
               total_number_of_lines)
  return all_data


def _reshape_to_batch_dimensions(
    tokens: np.array,
    sentence_ids: np.array,
    per_host_batch_size: int) -> Tuple[np.array, np.array]:
  """Truncates and reshapes input data with a batch major dimension.

  Args:
    tokens: The input token ids. This should have the same shape as
      `sentence_ids`.
    sentence_ids: The input sentence ids. This should have the same shape as
      `tokens`.
    per_host_batch_size: The target per-host batch size.

  Returns:
    The tuple of reshaped tokens and sentence_ids.
  """
  num_steps = len(tokens) // per_host_batch_size
  truncated_data_length = num_steps * per_host_batch_size
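  # Illustrative numbers: 1,000,003 tokens with per_host_batch_size=32 gives
  # num_steps=31250 and drops the 3 trailing tokens.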

  logging.info("per_host_batch_size: %d", per_host_batch_size)
  logging.info("num_steps: %d", num_steps)
  def truncate_and_reshape(a):
    return a[:truncated_data_length].reshape((per_host_batch_size, num_steps))

  return (truncate_and_reshape(tokens), truncate_and_reshape(sentence_ids))


def _create_a_and_b_segments(
    tokens: np.array,
    sentence_ids: np.array,
    begin_index: int,
    total_length: int,
    no_cut_probability: float = 0.5):
  """Splits segments A and B from a single instance of tokens and sentence ids.

  Args:
    tokens: The 1D input token ids. This represents an individual entry within a
      batch.
    sentence_ids: The 1D input sentence ids. This represents an individual entry
      within a batch. This should be the same length as `tokens`.
    begin_index: The reference beginning index to split data.
    total_length: The target combined length of segments A and B.
    no_cut_probability: The probability of not cutting a segment despite
      a cut possibly existing.

  Returns:
    A tuple consisting of A data, B data, and label.

  """
  data_length = tokens.shape[0]
  if begin_index + total_length >= data_length:
    logging.info("[_create_segments]: begin_index %d + total_length %d >= "
                 "data_length %d", begin_index, total_length, data_length)
    return None

  end_index = begin_index + 1
  cut_indices = []

  # Identify all indices where sentence IDs change from one to the next.
  while end_index < data_length:
    if sentence_ids[end_index] != sentence_ids[end_index - 1]:
      if end_index - begin_index >= total_length:
        break
      cut_indices.append(end_index)
    end_index += 1

  a_begin = begin_index

  if not cut_indices or random.random() < no_cut_probability:
    # Label 0: segment B is sampled from a random position rather than the
    # text immediately following segment A.
    label = 0
    if not cut_indices:
      a_end = end_index
    else:
      a_end = random.choice(cut_indices)
    b_length = max(1, total_length - (a_end - a_begin))
    b_begin = random.randint(0, data_length - 1 - b_length)
    b_end = b_begin + b_length

    while b_begin > 0 and sentence_ids[b_begin - 1] == sentence_ids[b_begin]:
      b_begin -= 1
    while (b_end < data_length - 1 and
           sentence_ids[b_end - 1] == sentence_ids[b_end]):
      b_end += 1
  else:
    # Label 1: segment B is the text immediately following segment A, starting
    # at a sentence boundary.
    label = 1
    a_end = random.choice(cut_indices)
    b_begin = a_end
    b_end = end_index

  while a_end - a_begin + b_end - b_begin > total_length:
    if a_end - a_begin > b_end - b_begin:
      # Delete only the right side for the LM objective.
      a_end -= 1
    else:
      b_end -= 1
  if a_end >= data_length or b_end >= data_length:
    logging.info("[_create_segments]: a_end %d or b_end %d >= data_length %d",
                 a_end, b_end, data_length)
    return None

  a_data = tokens[a_begin: a_end]
  b_data = tokens[b_begin: b_end]
  return a_data, b_data, label


def _is_functional_piece(piece: str) -> bool:
  return piece != "<unk>" and piece.startswith("<") and piece.endswith(">")


def _is_start_piece(piece: str) -> bool:
  special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~'))
  if (piece.startswith("▁") or piece in special_pieces):
    return True
  else:
    return False


def _get_boundary_indices(
    data: np.array,
    tokenizer: tokenization.FullSentencePieceTokenizer) -> np.array:
  """Gets the boundary indices of whole words."""
  seq_length = len(data)
  boundary_indices = []
  for index, piece in enumerate(tokenizer.convert_ids_to_tokens(data.tolist())):
    if _is_start_piece(piece) and not _is_functional_piece(piece):
      boundary_indices.append(index)
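  # Append the sequence length as a final boundary so the last whole word also
  # has a closing index.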
  boundary_indices.append(seq_length)
  return boundary_indices


def _convert_tokens_to_instances(
    tokens: np.array,
    sentence_ids: np.array,
    per_host_batch_size: int,
    seq_length: int,
    reuse_length: int,
    bi_data: bool,
    tokenizer: tokenization.FullSentencePieceTokenizer,
    num_cores_per_host: int = 0,
    logging_frequency: int = 500) -> List[TrainingInstance]:
  """Converts tokens and sentence IDs into individual training instances.

  The format of data in the XLNet pretraining task is very similar to the
  BERT pretraining task. Two segments A and B are randomly sampled, and the
  concatenation of A and B into a single sequence is used to perform
  language modeling.

  To create an XLNet Pretraining instance from a single long sequence, S:
  - Create a segment of length `reuse_length`. This first segment represents
    past tokens. During modeling, this segment is used to cache obtained
    content representations for the segment recurrence mechanism.
  - Similar to BERT, create a segment of length `seq_length` - `reuse_length`
    composed of A and B segments.
    For XLNet, the order is "A", "SEP", "B", "SEP", "CLS".

  Args:
    tokens: All tokens concatenated into a single list.
    sentence_ids: All sentence IDs concatenated into a single list.
    per_host_batch_size: The target batch size per host.
    seq_length: The max sequence length.
    reuse_length: The number of tokens to use from the previous segment.
    bi_data: Whether or not to use bidirectional data.
    tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
    num_cores_per_host: The number of cores per host. This is required if
      `bi_data` = `True`.
    logging_frequency: The frequency at which to log status updates.

  Returns:
    A list of `TrainingInstance` objects.
  """
  instances = []

  per_core_batch_size = (per_host_batch_size // num_cores_per_host
                         if bi_data else None)

  if bi_data:
    logging.info("Bi-directional data enabled.")
    assert per_host_batch_size % (2 * num_cores_per_host) == 0
    forward_tokens, forward_sentence_ids = _reshape_to_batch_dimensions(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=per_host_batch_size // 2)
    forward_data_shape = (num_cores_per_host, 1, per_core_batch_size // 2, -1)

    forward_tokens = forward_tokens.reshape(forward_data_shape)
    forward_sentence_ids = forward_sentence_ids.reshape(forward_data_shape)

    backwards_tokens = forward_tokens[:, :, :, ::-1]
    backwards_sentence_ids = forward_sentence_ids[:, :, :, ::-1]
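    # The reversed copies give each core a backward-reading stream alongside
    # the forward one; this pairing is why per_host_batch_size must be
    # divisible by 2 * num_cores_per_host when bi_data is enabled.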

    tokens = np.concatenate([forward_tokens, backwards_tokens], 1).reshape(
        per_host_batch_size, -1)
    # Concatenate along the same axis as `tokens` above so each row of
    # `sentence_ids` stays aligned with the corresponding row of `tokens`.
    sentence_ids = np.concatenate(
        [forward_sentence_ids, backwards_sentence_ids], 1).reshape(
            per_host_batch_size, -1)
  else:
    logging.info("Bi-directional data disabled.")
    tokens, sentence_ids = _reshape_to_batch_dimensions(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=per_host_batch_size)

  logging.info("Tokens shape: %s", tokens.shape)

  data_length = tokens.shape[1]
  sep = np.array([special_symbols["<sep>"]], dtype=np.int64)
  cls = np.array([special_symbols["<cls>"]], dtype=np.int64)
  # 2 sep, 1 cls
  num_special_tokens = 3
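  # With the default flags (seq_length=512, reuse_length=256), each instance
  # holds 256 reused tokens, 253 fresh tokens split between A and B, two <sep>
  # tokens, and one <cls> token: 256 + 253 + 3 = 512.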

  data_index = 0
  batch_number = 0
  step_size = reuse_length if reuse_length else seq_length
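  # Step the window by `reuse_length` tokens so each batch's reused (memory)
  # segment picks up where the previous one ended; fall back to `seq_length`
  # when no reuse is requested.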
  num_batches = math.ceil(data_length / step_size)

  while data_index + seq_length <= data_length:
    if batch_number % logging_frequency == 0:
      logging.info("Processing batch %d of %d", batch_number, num_batches)

    for batch_index in range(per_host_batch_size):
      previous_segment_tokens = tokens[
          batch_index, data_index: data_index + reuse_length]

      results = _create_a_and_b_segments(
          tokens=tokens[batch_index],
          sentence_ids=sentence_ids[batch_index],
          begin_index=data_index + reuse_length,
          total_length=seq_length - reuse_length - num_special_tokens)

      if results is None:
        logging.info("Stopping at data index: %d", data_index)
        break
      a_data, b_data, label = results

      data = np.concatenate(
          [previous_segment_tokens, a_data, sep, b_data, sep, cls])
      a_length = a_data.shape[0]
      b_length = b_data.shape[0]
      segment_ids = ([0] * (reuse_length + a_length) + [0]
                     + [1] * b_length + [1] + [2])
      boundary_indices = _get_boundary_indices(tokenizer=tokenizer,
                                               data=data)
      assert len(data) == seq_length
      assert len(segment_ids) == seq_length
      assert len(boundary_indices) > 0  # pylint: disable=g-explicit-length-test

      instances.append(TrainingInstance(
          data=data,
          segment_ids=segment_ids,
          boundary_indices=boundary_indices,
          label=label))
    batch_number += 1
    data_index += step_size
  return instances


def write_instances_to_tfrecord(
    instances: Iterable[TrainingInstance],
    save_path: str):
  """Writes instances to TFRecord."""
  record_writer = tf.io.TFRecordWriter(save_path)
  logging.info("Start writing to %s.", save_path)

  for i, instance in enumerate(instances):
    if i < 5:
      logging.info("Instance %d: %s", i, str(instance))
    record_writer.write(instance.to_example().SerializeToString())

  record_writer.close()
  logging.info("Done writing %s.", save_path)


def shuffle_and_combine_preprocessed_data(
    all_data: List[Tuple[np.array, np.array]]) -> Tuple[np.array, np.array]:
  """Shuffles and combines preprocessed token/sentence IDs from documents."""
  document_permutation = np.random.permutation(len(all_data))

  previous_sentence_id = None

  all_tokens, all_sentence_ids = [], []
  for document_index in document_permutation:
    tokens, sentence_ids = all_data[document_index]
    # pylint: disable=g-explicit-length-test
    if len(tokens) == 0:
      continue
    if (previous_sentence_id is not None and
        sentence_ids[0] == previous_sentence_id):
      sentence_ids = np.logical_not(sentence_ids)
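      # The flip above keeps the True/False alternation consistent across the
      # document boundary instead of repeating the previous document's last ID.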

    all_tokens.append(tokens)
    all_sentence_ids.append(sentence_ids)

    previous_sentence_id = sentence_ids[-1]

  return np.concatenate(all_tokens), np.concatenate(all_sentence_ids)


def get_tfrecord_name(
    per_host_batch_size: int,
    num_cores_per_host: int,
    seq_length: int,
    bi_data: bool,
    reuse_length: int,
    do_lower_case: bool,
    use_eod_token: bool,
    prefix: str = "",
    suffix: str = "",
    pass_id: int = 0,
    num_passes: int = 1,
    task_id: Optional[int] = None,
    num_tasks: Optional[int] = None) -> str:
  """Formats the resulting TFRecord name based on provided inputs."""
  components = []
  if prefix:
    components.append(prefix)
  components.append("seqlen-{}".format(seq_length))
  if reuse_length == 0:
    components.append("memless")
  else:
    components.append("reuse-{}".format(reuse_length))
  components.append("bs-{}".format(per_host_batch_size))
  components.append("cores-{}".format(num_cores_per_host))

  if do_lower_case:
    components.append("uncased")
  else:
    components.append("cased")
  if use_eod_token:
    components.append("eod")
  if bi_data:
    components.append("bi")
  else:
    components.append("uni")

  if suffix:
    components.append(suffix)

  s = "_".join(components) + ".tfrecord"
  if num_passes == 1 and task_id is None:
    return s

  if task_id is None:
    num_tasks = 1
    task_id = 0

  current_shard = task_id * num_passes + pass_id
  total_shards = num_tasks * num_passes
  return s + "-{}-of-{}".format(current_shard, total_shards)


def create_tfrecords(
    tokenizer: tokenization.FullSentencePieceTokenizer,
    input_file_or_files: str,
    use_eod_token: bool,
    do_lower_case: bool,
    per_host_batch_size: int,
    seq_length: int,
    reuse_length: int,
    bi_data: bool,
    num_cores_per_host: int,
    save_dir: str,
    prefix: str = "",
    suffix: str = "",
    num_tasks: Optional[int] = None,
    task_id: Optional[int] = None,
    num_passes: int = 1):
  """Runs the end-to-end preprocessing pipeline."""

  logging.info("Input configuration:")
  logging.info("input file(s): %s", input_file_or_files)
  logging.info("use_eod_token: %s", use_eod_token)
  logging.info("do_lower_case: %s", do_lower_case)
  logging.info("per_host_batch_size: %d", per_host_batch_size)
  logging.info("seq_length: %d", seq_length)
  logging.info("reuse_length: %d", reuse_length)
  logging.info("bi_data: %s", bi_data)
  logging.info("num_cores_per_host: %d", num_cores_per_host)
  logging.info("save_dir: %s", save_dir)
  if task_id is not None and num_tasks is not None:
    logging.info("task_id: %d", task_id)
    logging.info("num_tasks: %d", num_tasks)

  input_files = []
  for input_pattern in input_file_or_files.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  logging.info("Shuffling the files with a fixed random seed.")
  np.random.shuffle(input_files)
  if num_tasks is not None:
    assert task_id is not None
    logging.info("Total number of input files: %d", len(input_files))
    logging.info("Splitting into %d shards of %d files each.",
                 num_tasks, len(input_files) // num_tasks)
    input_files = input_files[task_id::num_tasks]

  all_data = preprocess_and_tokenize_input_files(
      input_files=input_files,
      tokenizer=tokenizer,
      use_eod=use_eod_token,
      do_lower_case=do_lower_case)
  for pass_id in range(num_passes):
    logging.info("Beginning pass %d of %d", pass_id, num_passes)
    tokens, sentence_ids = shuffle_and_combine_preprocessed_data(all_data)

    assert len(tokens) == len(sentence_ids)

    filename = get_tfrecord_name(
        per_host_batch_size=per_host_batch_size,
        num_cores_per_host=num_cores_per_host,
        seq_length=seq_length,
        bi_data=bi_data,
        use_eod_token=use_eod_token,
        reuse_length=reuse_length,
        do_lower_case=do_lower_case,
        prefix=prefix,
        suffix=suffix,
        pass_id=pass_id,
        num_passes=num_passes,
        num_tasks=num_tasks,
        task_id=task_id)
    save_path = os.path.join(save_dir, filename)
    if tf.io.gfile.exists(save_path):
      # If the path already exists, then we were probably preempted but
      # previously wrote this file.
      logging.info("%s already exists, skipping this batch.", save_path)
    else:
      instances = _convert_tokens_to_instances(
          tokenizer=tokenizer,
          tokens=tokens,
          sentence_ids=sentence_ids,
          per_host_batch_size=per_host_batch_size,
          seq_length=seq_length,
          reuse_length=reuse_length,
          bi_data=bi_data,
          num_cores_per_host=num_cores_per_host)
      write_instances_to_tfrecord(instances=instances, save_path=save_path)

  if task_id is None or task_id == 0:
    corpus_info = {
        "vocab_size": 32000,
        "per_host_batch_size": per_host_batch_size,
        "num_cores_per_host": num_cores_per_host,
        "seq_length": seq_length,
        "reuse_length": reuse_length,
        "do_lower_case": do_lower_case,
        "bi_data": bi_data,
        "use_eod_token": use_eod_token,
    }
    corpus_fname = os.path.basename(filename) + ".json"
    corpus_destination = os.path.join(save_dir, corpus_fname)
    logging.info("Saving corpus info to %s", corpus_destination)

    with tf.io.gfile.GFile(corpus_destination, "w") as fp:
      json.dump(corpus_info, fp)


def main(_):
  tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
  create_tfrecords(
      tokenizer=tokenizer,
      input_file_or_files=FLAGS.input_file,
      use_eod_token=FLAGS.use_eod_token,
      do_lower_case=FLAGS.do_lower_case,
      per_host_batch_size=FLAGS.per_host_batch_size,
      seq_length=FLAGS.seq_length,
      reuse_length=FLAGS.reuse_length,
      bi_data=FLAGS.bi_data,
      num_cores_per_host=FLAGS.num_cores_per_host,
      save_dir=FLAGS.save_dir,
      prefix=FLAGS.prefix,
      suffix=FLAGS.suffix,
      num_tasks=FLAGS.num_tasks,
      task_id=FLAGS.task_id,
      num_passes=FLAGS.num_passes)


if __name__ == "__main__":
  np.random.seed(0)
  logging.set_verbosity(logging.INFO)
  app.run(main)