data_processing/convert_math.py from IlyaGusev/rulm

data_processing/convert_math.py
Summary

Maintainability

0 mins
Test Coverage

Issues
import sys
import zipfile
import os
import random
import json
from collections import Counter

from tqdm import tqdm
from bs4 import BeautifulSoup

from data_processing.util import gen_batch, PlainArchive


# Positional command-line arguments: the first is opened as the input zip
# archive, the second is handed to PlainArchive as the output location.
input_path, output_path = sys.argv[1], sys.argv[2]


def get_txt_filepaths_from_zip(archive):
    """Return the names of the training-split ``.txt`` members of *archive*.

    A member is selected when its name starts with ``whole_corpus/train`` and
    ends with ``.txt``. Every selected name is also printed to stdout.

    Args:
        archive: Object exposing ``namelist()`` (e.g. an open
            ``zipfile.ZipFile``).

    Returns:
        The selected member names, in the order ``namelist()`` reports them.
    """
    wanted_prefix = "whole_corpus/train"
    selected = [
        name
        for name in archive.namelist()
        if name.startswith(wanted_prefix) and name.endswith('.txt')
    ]
    # Echo each selected member name to stdout.
    for name in selected:
        print(name)
    return selected


def parse_single_txt(path_to_txt, zip_file):
    """Parse one question/answer text member of a zip archive.

    The member is read line by line, each line is decoded as UTF-8 and
    stripped of surrounding whitespace, and empty lines are discarded. The
    remaining lines are consumed in pairs: a question line followed by its
    answer line.

    Args:
        path_to_txt: Name of the archive member to read.
        zip_file: Open archive whose ``open(name)`` returns a binary file
            object (e.g. ``zipfile.ZipFile``).

    Returns:
        A list of strings of the form ``"<question> Ответ: <answer>"``, in
        file order.

    Raises:
        AssertionError: If the number of non-empty lines is odd, or if an
            answer is not strictly shorter than its question.
    """
    with zip_file.open(path_to_txt) as f:
        stripped = [raw.decode("utf-8").strip() for raw in f]
    lines = [line for line in stripped if line]

    # The checks below are raised explicitly instead of using `assert`
    # statements so that they still run under `python -O`; the exception
    # type and messages are kept the same as before.
    if len(lines) % 2 != 0:
        raise AssertionError("\n".join(lines))

    examples = []
    # Even positions hold questions, odd positions hold the matching answers.
    for question, answer in zip(lines[0::2], lines[1::2]):
        # Sanity check: an answer is expected to be shorter than its question.
        if len(answer) >= len(question):
            raise AssertionError(f"{answer} vs {question}")
        examples.append(question + " Ответ: " + answer)
    return examples


if __name__ == "__main__":
    # Script entry point: read every selected text member of the input zip,
    # shuffle the parsed examples, and add them to the output archive in
    # newline-joined batches.
    all_examples = []
    # The context manager guarantees the input archive is closed once all
    # members have been parsed, even if parsing raises.
    with zipfile.ZipFile(input_path) as zip_file:
        txt_paths = get_txt_filepaths_from_zip(zip_file)
        archive = PlainArchive(output_path)
        for path in tqdm(txt_paths, total=len(txt_paths)):
            all_examples.extend(parse_single_txt(path, zip_file))

    # Shuffle across files so a batch is not tied to a single source file.
    random.shuffle(all_examples)

    # NOTE(review): gen_batch presumably yields consecutive chunks of at most
    # 1000 examples -- confirm against data_processing.util.
    for batch in gen_batch(all_examples, 1000):
        text = "\n".join(batch)
        if not text:
            continue
        archive.add_data(
            text=text,
            meta={
                "source": "math",
                "url": None
            }
        )