IlyaGusev/rupo

View on GitHub
rupo/main/morph.py

Summary

Maintainability
A
55 mins
Test Coverage
# -*- coding: utf-8 -*-
# Автор: Гусев Илья
# Описание: Разметка по грамматическим значениям

import os
from typing import List, TextIO

from sentence_splitter import SentenceSplitter
from rnnmorph.predictor import RNNMorphPredictor

from rupo.main.tokenizer import Tokenizer, Token


class Morph:
    """Morphological (grammatical-value) markup of plain-text files."""

    @staticmethod
    def get_morph_markup(input_filenames: List[str], output_filename: str):
        """
        Write grammatical-value markup for every input file into one output file.

        Any existing file at ``output_filename`` is removed first, then the
        markup of each input file is appended to it in the order given.

        :param input_filenames: input text files (read as UTF-8)
        :param output_filename: path of the file the markup is saved to
        """
        # Start from a clean slate so that appending below never mixes the
        # new markup with the results of a previous run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        sentence_splitter = SentenceSplitter(language='ru')
        morph_predictor = RNNMorphPredictor()

        for filename in input_filenames:
            # Append mode is essential here: the output is reopened for every
            # input file, and a truncating mode ("w"/"w+") would discard the
            # markup of all previously processed files.
            with open(filename, "r", encoding="utf-8") as r, open(output_filename, "a", encoding="utf-8") as w:
                for line in r:
                    Morph.__process_line(line, w, sentence_splitter, morph_predictor)

    @staticmethod
    def __process_line(line: str, output_file: TextIO, sentence_splitter: SentenceSplitter,
                       morph_predictor: RNNMorphPredictor):
        """
        Mark up a single line of text and write the result to ``output_file``.

        The line is split into sentences; each sentence is tokenized, and
        tokens with empty text or of the SPACE type are dropped. For every
        non-empty sentence one tab-separated row (word, normal form, POS, tag)
        is written per predicted form, skipping forms whose POS is "PUNCT",
        followed by an empty line that terminates the sentence.

        :param line: line of raw text
        :param output_file: opened text file the markup is written to
        :param sentence_splitter: splitter used to break the line into sentences
        :param morph_predictor: predictor that yields the forms for a word list
        """
        sentences = sentence_splitter.split(line)
        for sentence in sentences:
            words = [token.text for token in Tokenizer.tokenize(sentence)
                     if token.text != '' and token.token_type != Token.TokenType.SPACE]
            if not words:
                continue
            forms = morph_predictor.predict_sentence_tags(words)
            for form in forms:
                if form.pos == "PUNCT":
                    continue
                output_file.write("%s\t%s\t%s\t%s\n" % (form.word, form.normal_form, form.pos, form.tag))
            # Blank line separates sentences in the output.
            output_file.write("\n")