wordless/wl_nlp/wl_syl_tokenization.py

# ----------------------------------------------------------------------
# Wordless: NLP - Syllable tokenization
# Copyright (C) 2018-2024  Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

import re

import pythainlp

from wordless.wl_nlp import wl_nlp_utils, wl_texts, wl_word_tokenization

def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False):
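    # Return the input as is if the tokens already carry syllable properties,
    # unless tokenization is forced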
    if (
        not isinstance(inputs, str)
        and inputs
        and list(inputs)[0].syls is not None
        and not force
    ):
        return inputs
    else:
        if inputs and lang in main.settings_global['syl_tokenizers']:
            syls_tokens = []

            # Resolve the default tokenizer from the per-language settings
            if syl_tokenizer == 'default':
                syl_tokenizer = main.settings_custom['syl_tokenization']['syl_tokenizer_settings'][lang]

            wl_nlp_utils.init_syl_tokenizers(
                main,
                lang = lang,
                syl_tokenizer = syl_tokenizer
            )

            # A raw string is word-tokenized first; token inputs are split into
            # bare texts and their existing properties
            if isinstance(inputs, str):
                tokens = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang)
                texts = wl_texts.to_token_texts(tokens)
            else:
                texts, token_properties = wl_texts.split_texts_properties(inputs)

            # Tokenize the texts in sections to keep memory usage bounded on large inputs
            section_size = main.settings_custom['files']['misc_settings']['read_files_in_chunks']
            texts_sections = wl_nlp_utils.to_sections_unequal(texts, section_size = section_size * 50)

            for texts_section in texts_sections:
                syls_tokens.extend(wl_syl_tokenize_tokens(main, texts_section, lang, syl_tokenizer))

            # Remove empty syllables and whitespace around syllables
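            # e.g. (' hy ', '', 'phen') would presumably become ('hy', 'phen')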
            syls_tokens = [
                tuple(wl_texts.clean_texts(syls))
                for syls in syls_tokens
                if any(syls)
            ]

            if isinstance(inputs, str):
                wl_texts.set_token_properties(tokens, 'syls', syls_tokens)

                return tokens
            else:
                # Re-attach the original token properties before setting syllables
                tokens = wl_texts.combine_texts_properties(texts, token_properties)
                wl_texts.set_token_properties(tokens, 'syls', syls_tokens)

                wl_texts.update_token_properties(inputs, tokens)

                return inputs
        # Do not set syllable properties if syllable tokenization is not supported
        else:
            if isinstance(inputs, str):
                tokens = wl_word_tokenization.wl_word_tokenize_flat(main, inputs, lang = lang)

                return tokens
            else:
                return inputs

def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer):
    syls_tokens = []

    for token in tokens:
        # NLTK
        if syl_tokenizer == 'nltk_legality':
            nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']

            syls_tokens.append(nltk_syl_tokenizer_legality.tokenize(token))
        elif syl_tokenizer == 'nltk_sonority_sequencing':
            nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']

            syls_tokens.append(nltk_syl_tokenizer_sonority_sequencing.tokenize(token))
        # Pyphen
        elif syl_tokenizer.startswith('pyphen_'):
            pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
            # Split on the soft hyphens inserted by the hyphenation dictionary
            syls = re.split(r'-+', pyphen_syl_tokenizer.inserted(token))

            # Fall back to the whole token if no syllables are found
            if any(syls):
                syls_tokens.append(syls)
            else:
                syls_tokens.append([token])
        # PyThaiNLP
        elif syl_tokenizer == 'pythainlp_han_solo':
            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo'))
        elif syl_tokenizer == 'pythainlp_syl_dict':
            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'dict'))

    return syls_tokens
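
# The block below is a minimal standalone sketch (not part of Wordless) of the
# backend calls dispatched in wl_syl_tokenize_tokens, assuming pyphen and nltk
# are installed alongside pythainlp and that the nltk 'words' corpus has been
# downloaded; the sample words and annotated outputs are illustrative only.
if __name__ == '__main__':
    import pyphen
    from nltk.corpus import words
    from nltk.tokenize import LegalitySyllableTokenizer, SyllableTokenizer

    # Pyphen: insert soft hyphens via a hyphenation dictionary, then split on
    # them, mirroring the Pyphen branch above
    pyphen_dic = pyphen.Pyphen(lang = 'en_US')
    print(re.split(r'-+', pyphen_dic.inserted('hyphenation')))  # e.g. ['hy', 'phen', 'ation']

    # NLTK: sonority sequencing needs no wordlist; the legality principle is
    # initialized with a reference vocabulary
    print(SyllableTokenizer().tokenize('justification'))
    print(LegalitySyllableTokenizer(words.words()).tokenize('justification'))

    # PyThaiNLP: dictionary-based syllable tokenization for Thai
    print(pythainlp.tokenize.syllable_tokenize('ภาษาไทย', engine = 'dict'))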