BLKSerene/Wordless
wordless/wl_nlp/wl_token_processing.py

# ----------------------------------------------------------------------
# Wordless: NLP - Token processing
# Copyright (C) 2018-2024  Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

import copy

from wordless.wl_checks import wl_checks_tokens
from wordless.wl_nlp import (
    wl_dependency_parsing,
    wl_lemmatization,
    wl_pos_tagging,
    wl_stop_word_lists,
    wl_syl_tokenization,
    wl_texts,
    wl_word_detokenization
)
from wordless.wl_utils import wl_misc

# Assign part-of-speech tags
def text_pos_tag(main, text, settings):
    if settings['assign_pos_tags'] and not text.tagged:
        wl_pos_tagging.wl_pos_tag(
            main,
            inputs = text.get_tokens_flat(),
            lang = text.lang
        )

# Apply lemmatization / Match inflected forms
def text_lemmatize(main, text, token_settings, search_settings = None):
    search_settings = search_settings or {
        'match_inflected_forms': False,
        'context_settings': {'incl': {'incl': False}, 'excl': {'excl': False}}
    }

    if (
        token_settings.get('apply_lemmatization', False)
        or search_settings['match_inflected_forms']
        or (
            search_settings['context_settings']['incl']['incl']
            and search_settings['context_settings']['incl']['match_inflected_forms']
        )
        or (
            search_settings['context_settings']['excl']['excl']
            and search_settings['context_settings']['excl']['match_inflected_forms']
        )
    ):
        wl_lemmatization.wl_lemmatize(
            main,
            inputs = text.get_tokens_flat(),
            lang = text.lang
        )
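
# Illustrative sketch (not part of the original module): a search_settings dict
# of the shape read by text_lemmatize above. With these values, lemmatization
# is triggered because inflected forms are matched within the inclusion
# context, even though 'apply_lemmatization' and the top-level
# 'match_inflected_forms' are both off. Only the keys accessed above are shown;
# real search settings contain more options.
_EXAMPLE_SEARCH_SETTINGS = {
    'match_inflected_forms': False,
    'context_settings': {
        'incl': {'incl': True, 'match_inflected_forms': True},
        'excl': {'excl': False}
    }
}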

# Syllable tokenization
def text_syl_tokenize(main, text):
    wl_syl_tokenization.wl_syl_tokenize(
        main,
        inputs = text.get_tokens_flat(),
        lang = text.lang,
    )

# Ignore tags
def text_ignore_tags(text, settings):
    if settings['ignore_tags']:
        text.set_token_properties('tag', None)

# Use tags only
def text_use_tags_only(text, settings):
    if settings['use_tags']:
        text.set_token_texts(text.get_token_properties('tag', flat = True))

        # Remove all tags and punctuation marks
        text.set_token_properties('tag', None)
        text.set_token_properties('punc_mark', None)

# Filter stop words after token texts have been converted to lowercase, replaced by lemmas, or replaced by tags
def text_filter_stop_words(main, text, settings):
    if settings['filter_stop_words']:
        stop_words = wl_stop_word_lists.wl_get_stop_word_list(main, lang = text.lang)

        if main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for sentence_seg in sentence:
                        for i, token in enumerate(sentence_seg):
                            # Convert to strings to ignore tags and punctuation marks, if any, when checking for stop words
                            if str(token) in stop_words:
                                sentence_seg[i] = wl_texts.Wl_Token('')
        else:
            stop_words = {token.lower() for token in stop_words}

            for para in text.tokens_multilevel:
                for sentence in para:
                    for sentence_seg in sentence:
                        for i, token in enumerate(sentence_seg):
                            # Convert to strings to ignore tags and punctuation marks, if any, when checking for stop words
                            if token.lower() in stop_words:
                                sentence_seg[i] = wl_texts.Wl_Token('')
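
# Illustrative sketch (not part of the original module): the case-insensitive
# branch above lowercases both the stop word list and each token before
# comparing, and blanks out matches in place. Plain strings stand in for
# Wl_Token objects here.
def _example_filter_stop_words_case_insensitive():
    stop_words = {token.lower() for token in {'The', 'And'}}
    sentence_seg = ['The', 'cat', 'and', 'the', 'dog']

    for i, token in enumerate(sentence_seg):
        if token.lower() in stop_words:
            sentence_seg[i] = ''

    assert sentence_seg == ['', 'cat', '', '', 'dog']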

def remove_empty_tokens(tokens_multilevel):
    tokens_multilevel = [
        [
            [
                [token for token in sentence_seg if token]
                for sentence_seg in sentence
            ]
            for sentence in para
        ]
        for para in tokens_multilevel
    ]

    return tokens_multilevel

def remove_empty_paras(tokens_multilevel):
    tokens_multilevel = [para for para in tokens_multilevel if para]

    return tokens_multilevel
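
# Illustrative sketch (not part of the original module): tokens are nested as
# paragraphs > sentences > sentence segments > tokens, and the two helpers
# above drop empty tokens and empty paragraphs respectively. Plain strings
# stand in for Wl_Token objects, which are assumed to be falsy when empty.
def _example_remove_empty():
    tokens_multilevel = [
        [[['Hello', '', 'world']]], # One paragraph with one sentence and one sentence segment
        []                          # An empty paragraph
    ]

    assert remove_empty_tokens(tokens_multilevel) == [[[['Hello', 'world']]], []]
    assert remove_empty_paras(tokens_multilevel) == [[[['Hello', '', 'world']]]]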

def wl_process_tokens(main, text, token_settings, search_settings = None):
    settings = copy.deepcopy(token_settings)

    if not settings['words']:
        settings['all_lowercase'] = False
        settings['all_uppercase'] = False
        settings['title_case'] = False

    if settings['use_tags']:
        settings['apply_lemmatization'] = False

    text_pos_tag(main, text, token_settings)
    text_lemmatize(main, text, token_settings, search_settings)

    text_modified = copy.deepcopy(text)

    # Remove empty paragraphs
    text_modified.tokens_multilevel = [para for para in text_modified.tokens_multilevel if para]

    # Remove tags temporarily if the text is untagged and the user has not chosen to assign POS tags on the fly
    if not settings['assign_pos_tags'] and not text.tagged:
        text_modified.set_token_properties('tag', None)

    # Punctuation marks
    if not settings['punc_marks']:
        for para in text_modified.tokens_multilevel:
            for sentence in para:
                for i, sentence_seg in enumerate(sentence):
                    sentence[i] = [
                        token
                        for token in sentence_seg
                        if not wl_checks_tokens.is_punc(token)
                    ]

    # Treat as all lowercase
    if settings['treat_as_all_lowercase']:
        text_modified.set_token_texts([token.lower() for token in text_modified.get_tokens_flat()])

        # Also convert tags, lemmas, and syllables, if any, to lowercase
        for para in text_modified.tokens_multilevel:
            for sentence in para:
                for sentence_seg in sentence:
                    for token in sentence_seg:
                        if token.tag is not None:
                            token.tag = token.tag.lower()

                        if token.lemma is not None:
                            token.lemma = token.lemma.lower()

                        if token.syls is not None:
                            token.syls = [syl.lower() for syl in token.syls]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['all_lowercase']:
            for para in text_modified.tokens_multilevel:
                for sentence in para:
                    for sentence_seg in sentence:
                        for i, token in enumerate(sentence_seg):
                            if token.islower():
                                sentence_seg[i] = wl_texts.Wl_Token('')
        # Uppercase
        if not settings['all_uppercase']:
            for para in text_modified.tokens_multilevel:
                for sentence in para:
                    for sentence_seg in sentence:
                        for i, token in enumerate(sentence_seg):
                            if token.isupper():
                                sentence_seg[i] = wl_texts.Wl_Token('')
        # Title Case
        if not settings['title_case']:
            for para in text_modified.tokens_multilevel:
                for sentence in para:
                    for sentence_seg in sentence:
                        for i, token in enumerate(sentence_seg):
                            if token.istitle():
                                sentence_seg[i] = wl_texts.Wl_Token('')
    else:
        for para in text_modified.tokens_multilevel:
            for sentence in para:
                for sentence_seg in sentence:
                    for i, token in enumerate(sentence_seg):
                        if wl_checks_tokens.is_word_alphabetic(token):
                            sentence_seg[i] = wl_texts.Wl_Token('')

    # Numerals
    if not settings['nums']:
        for para in text_modified.tokens_multilevel:
            for sentence in para:
                for sentence_seg in sentence:
                    for i, token in enumerate(sentence_seg):
                        if wl_checks_tokens.is_num(token):
                            sentence_seg[i] = wl_texts.Wl_Token('')

    # Replace token texts with lemmas
    if settings['apply_lemmatization']:
        text_modified.set_token_texts(text_modified.get_token_properties('lemma', flat = True))

        # Re-apply syllable tokenization to lemmas if syllables have been assigned
        if text_modified.has_token_properties('syls'):
            tokens = wl_syl_tokenization.wl_syl_tokenize(
                main,
                inputs = wl_texts.to_tokens(text_modified.get_token_properties('lemma', flat = True), lang = text_modified.lang),
                lang = text_modified.lang,
                force = True
            )

            text_modified.set_token_properties('syls', wl_texts.get_token_properties(tokens, 'syls'))

    text_modified.update_num_tokens()

    return text_modified
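
# Illustrative sketch (not part of the original module): the word and numeral
# filters above blank out unwanted tokens in place (replacing them with empty
# tokens) rather than deleting them, so the nested structure keeps its shape;
# empty tokens can later be dropped with remove_empty_tokens. Plain strings
# stand in for Wl_Token objects here.
def _example_filter_title_case():
    sentence_seg = ['Alice', 'met', 'Bob', 'yesterday']

    for i, token in enumerate(sentence_seg):
        if token.istitle():
            sentence_seg[i] = ''

    assert sentence_seg == ['', 'met', '', 'yesterday']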

def wl_process_tokens_ngram_generator(main, text, token_settings, search_settings = None):
    text_modified = wl_process_tokens(main, text, token_settings, search_settings)

    text_ignore_tags(text_modified, token_settings)
    text_use_tags_only(text_modified, token_settings)
    text_filter_stop_words(main, text_modified, token_settings)

    text_modified.update_num_tokens()

    return text_modified

def wl_process_tokens_profiler(main, text, token_settings, profiler_tab):
    # Punctuation marks must be preserved for some readability measures (e.g. Wheeler & Smith's Readability Formula)
    text.tokens_multilevel_with_puncs = copy.deepcopy(text.tokens_multilevel)

    text_syl_tokenize(main, text)

    if profiler_tab in ['readability', 'all']:
        if text.lang in main.settings_global['pos_taggers']:
            wl_pos_tagging.wl_pos_tag_universal(main, text.get_tokens_flat(), lang = text.lang, tagged = text.tagged)

        # Polish variant of Gunning Fog Index
        if text.lang == 'pol':
            wl_lemmatization.wl_lemmatize(main, text.get_tokens_flat(), lang = text.lang)

    if profiler_tab in ['lexical_density_diversity', 'all']:
        # Lexical density
        if text.lang in main.settings_global['pos_taggers']:
            wl_pos_tagging.wl_pos_tag_universal(main, text.get_tokens_flat(), lang = text.lang, tagged = text.tagged)

    text_modified = wl_process_tokens_ngram_generator(main, text, token_settings)
    text_modified.tokens_multilevel = remove_empty_tokens(text_modified.tokens_multilevel)
    text_modified.update_num_tokens()

    return text_modified

def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings):
    # Syllabification
    if generation_settings['syllabification']:
        text_syl_tokenize(main, text)

    text_modified = wl_process_tokens_ngram_generator(main, text, token_settings)
    text_modified.update_num_tokens()

    return text_modified

def wl_process_tokens_colligation_extractor(main, text, token_settings, search_settings):
    # Do not modify the custom settings directly, as adding new options would clear the user's custom settings
    settings = copy.deepcopy(token_settings)
    # Always assign part-of-speech tags
    settings['assign_pos_tags'] = True

    text_modified = wl_process_tokens(main, text, settings, search_settings)

    text_modified.tags = wl_texts.to_tokens(
        text_modified.get_token_properties('tag', flat = True),
        lang = text.lang
    )

    text_ignore_tags(text_modified, token_settings)
    text_use_tags_only(text_modified, token_settings)
    text_filter_stop_words(main, text_modified, token_settings)

    text_modified.update_num_tokens()

    return text_modified

def wl_process_tokens_concordancer(main, text, token_settings, search_settings, preserve_blank_lines = False):
    text_pos_tag(main, text, token_settings)
    text_lemmatize(main, text, token_settings, search_settings)

    text_modified = copy.deepcopy(text)

    # Remove tags temporarily if the text is untagged and the user has not chosen to assign POS tags on the fly
    if not token_settings['assign_pos_tags'] and not text.tagged:
        text_modified.set_token_properties('tag', '')

    # Punctuation marks
    if not token_settings['punc_marks']:
        tokens_flat_punc_marks = []

        for i, token in enumerate(text_modified.get_tokens_flat()):
            if wl_checks_tokens.is_punc(token):
                # If the first token is a punctuation mark, there is no preceding token to attach it to, so store it on an empty token
                if i == 0:
                    tokens_flat_punc_marks.append(wl_texts.Wl_Token('', lang = token.lang, punc_mark = token))
                else:
                    token_text = wl_word_detokenization.wl_word_detokenize(
                        main,
                        tokens = [
                            str(tokens_flat_punc_marks[-1]) + (tokens_flat_punc_marks[-1].punc_mark or ''),
                            token
                        ],
                        lang = text_modified.lang
                    )

                    tokens_flat_punc_marks[-1].punc_mark = token_text.replace(str(tokens_flat_punc_marks[-1]), '')
            else:
                tokens_flat_punc_marks.append(token)

        # Remove punctuation marks from the multilevel structure so that its length matches the flattened token list
        for i, para in enumerate(text_modified.tokens_multilevel):
            for j, sentence in enumerate(para):
                for k, sentence_seg in enumerate(sentence):
                    if i == 0 and j == 0 and k == 0:
                        tokens = []

                        for l, token in enumerate(sentence_seg):
                            # If the very first token is a punctuation mark, replace it with an empty token instead of removing it
                            if l == 0 and wl_checks_tokens.is_punc(token):
                                tokens.append(wl_texts.set_token_text(token, ''))
                            elif not wl_checks_tokens.is_punc(token):
                                tokens.append(token)

                        sentence[k] = tokens
                    else:
                        sentence[k] = [token for token in sentence_seg if not wl_checks_tokens.is_punc(token)]

        # Also remove punctuation marks in heads
        for para in text_modified.tokens_multilevel:
            for sentence in para:
                for sentence_seg in sentence:
                    for token in sentence_seg:
                        if wl_checks_tokens.is_punc(token.head):
                            token.head = None

        text_modified.set_tokens(tokens_flat_punc_marks)

    text_ignore_tags(text_modified, token_settings)
    text_use_tags_only(text_modified, token_settings)

    if not preserve_blank_lines:
        text_modified.tokens_multilevel = remove_empty_paras(text_modified.tokens_multilevel)

    text_modified.update_num_tokens()

    return text_modified
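
# Illustrative sketch (not part of the original module): the concordancer logic
# above folds each punctuation mark into the punc_mark property of the
# preceding token instead of keeping it as a separate token. This plain-string
# version simply concatenates, whereas the code above detokenizes the pair so
# that any language-specific spacing is preserved.
def _example_fold_punc_marks():
    def is_punc(token):
        # Simplified stand-in for wl_checks_tokens.is_punc
        return token in {',', '.', '!', '?'}

    tokens = ['Hello', ',', 'world', '!']
    tokens_folded = []

    for token in tokens:
        if is_punc(token) and tokens_folded:
            tokens_folded[-1] += token
        else:
            tokens_folded.append(token)

    assert tokens_folded == ['Hello,', 'world!']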

def wl_process_tokens_dependency_parser(main, text, token_settings, search_settings):
    # Do not modify original sentence tokenization during dependency parsing
    for para in text.tokens_multilevel:
        for sentence in para:
            wl_dependency_parsing.wl_dependency_parse(
                main,
                inputs = list(wl_misc.flatten_list(sentence)),
                lang = text.lang,
            )

    return wl_process_tokens_concordancer(main, text, token_settings, search_settings)
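
# Illustrative sketch (not part of the original module): each sentence is a
# list of sentence segments, so its tokens are flattened before being passed
# to the dependency parser one sentence at a time, keeping the original
# sentence boundaries intact. itertools.chain is used here as a stand-in for
# wl_misc.flatten_list, which is assumed to behave similarly for one level of
# nesting.
def _example_flatten_sentence():
    import itertools

    sentence = [['This', 'is'], ['a', 'test', '.']]

    assert list(itertools.chain.from_iterable(sentence)) == ['This', 'is', 'a', 'test', '.']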