BLKSerene/Wordless

View on GitHub
tests/tests_nlp/tests_spacy/test_spacy.py

Summary

Maintainability
A
1 hr
Test Coverage
# ----------------------------------------------------------------------
# Wordless: Tests - NLP - spaCy
# Copyright (C) 2018-2024  Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

from tests import wl_test_init, wl_test_lang_examples
from tests.tests_nlp import test_dependency_parsing, test_lemmatization, test_pos_tagging
from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts, wl_word_tokenization
from wordless.wl_utils import wl_conversion

# Shared test "main" object handed as the first argument to the Wordless
# helpers called throughout this file.
# NOTE(review): switch_lang_utils = 'spacy' presumably makes spaCy the default
# language utility for every language - confirm in wl_test_init.Wl_Test_Main.
main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'spacy')

def wl_test_spacy(
    lang,
    results_sentence_tokenize_trf = None, results_sentence_tokenize_lg = None,
    results_word_tokenize = None,
    results_pos_tag = None, results_pos_tag_universal = None,
    results_lemmatize = None,
    results_dependency_parse = None
):
    """Run the spaCy checks for a single language.

    Sentence tokenization and word tokenization are always tested. POS
    tagging, lemmatization and dependency parsing are tested for every
    language except 'other'. Each ``results_*`` argument is the expected
    output forwarded to the helper of the corresponding stage.
    """
    # Check the spaCy model of the language before running any test
    lang_util = f'spacy_{wl_conversion.remove_lang_code_suffixes(main, lang)}'
    wl_nlp_utils.check_models(main, langs = [lang], lang_utils = [[lang_util]])

    # Example sentence of the language, looked up by upper-cased language code
    sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')

    wl_test_sentence_tokenize(lang, results_sentence_tokenize_trf, results_sentence_tokenize_lg)
    wl_test_word_tokenize(lang, sentence, results_word_tokenize)

    # Tokenized
    tokens_flat = wl_word_tokenization.wl_word_tokenize_flat(main, text = sentence, lang = lang)

    # The remaining stages are skipped for the 'other' language
    if lang == 'other':
        return

    wl_test_pos_tag(lang, sentence, tokens_flat, results_pos_tag, results_pos_tag_universal)
    wl_test_lemmatize(lang, sentence, tokens_flat, results_lemmatize)
    wl_test_dependency_parse(lang, sentence, tokens_flat, results_dependency_parse)

def wl_test_sentence_tokenize(lang, results_trf, results_lg):
    """Check spaCy sentence tokenization of the example text for a language.

    The first pass uses 'spacy_sentencizer' for the language 'other' and the
    language's 'spacy_dependency_parser_*' tokenizer otherwise; its output
    must equal ``results_trf``. When the language's entry in
    ``wl_nlp_utils.LANGS_SPACY`` does not end with '_trf', a second pass uses
    the 'spacy_sentence_recognizer_*' tokenizer and must equal ``results_lg``.
    """
    lang_code = wl_conversion.remove_lang_code_suffixes(main, lang)
    text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}'))

    tokenizer_trf = 'spacy_sentencizer' if lang == 'other' else f'spacy_dependency_parser_{lang_code}'
    sents_trf = wl_sentence_tokenization.wl_sentence_tokenize(
        main,
        text = text, lang = lang,
        sentence_tokenizer = tokenizer_trf
    )

    print(f'{lang} / {tokenizer_trf}:')
    print(f'{sents_trf}\n')

    # The count of sentences should be more than 1 (zho_cn is exempted)
    if lang != 'zho_cn':
        assert len(sents_trf) > 1

    assert sents_trf == results_trf

    # Nothing more to test when the language's model name ends with "_trf"
    if wl_nlp_utils.LANGS_SPACY[lang_code].endswith('_trf'):
        return

    tokenizer_lg = f'spacy_sentence_recognizer_{lang_code}'
    sents_lg = wl_sentence_tokenization.wl_sentence_tokenize(
        main,
        text = text, lang = lang,
        sentence_tokenizer = tokenizer_lg
    )

    print(f'{lang} / {tokenizer_lg}:')
    print(f'{sents_lg}\n')

    # The count of sentences should be more than 1
    assert len(sents_lg) > 1

    assert sents_lg == results_lg

def wl_test_word_tokenize(lang, test_sentence, results):
    """Check spaCy word tokenization of ``test_sentence`` for a language.

    The tokens, converted with ``wl_texts.to_display_texts``, must equal
    ``results``.
    """
    tokenizer_name = f'spacy_{wl_conversion.remove_lang_code_suffixes(main, lang)}'

    word_tokens = wl_word_tokenization.wl_word_tokenize_flat(
        main,
        text = test_sentence, lang = lang,
        word_tokenizer = tokenizer_name
    )

    print(f'{lang} / {tokenizer_name}:')
    print(f'{word_tokens}\n')

    num_tokens = len(word_tokens)

    # The count of tokens should be more than 1
    assert num_tokens > 1
    # The count of tokens should be more than the length of tokens split by space
    assert num_tokens > len(test_sentence.split())

    assert wl_texts.to_display_texts(word_tokens) == results

def wl_test_pos_tag(lang, test_sentence, tokens, results, results_universal):
    """Delegate to the shared POS tagging test using the language's spaCy tagger."""
    # The tagger name is "spacy_" followed by the language code without suffixes
    spacy_tagger = f'spacy_{wl_conversion.remove_lang_code_suffixes(main, lang)}'

    test_pos_tagging.wl_test_pos_tag_models(
        lang, spacy_tagger,
        test_sentence, tokens,
        results, results_universal
    )

def wl_test_lemmatize(lang, test_sentence, tokens, results):
    """Delegate to the shared lemmatization test using the language's spaCy lemmatizer."""
    # The lemmatizer name is "spacy_" followed by the language code without suffixes
    spacy_lemmatizer = f'spacy_{wl_conversion.remove_lang_code_suffixes(main, lang)}'

    test_lemmatization.wl_test_lemmatize_models(
        lang, spacy_lemmatizer,
        test_sentence, tokens,
        results
    )

def wl_test_dependency_parse(lang, test_sentence, tokens, results):
    """Delegate to the shared dependency parsing test using the language's spaCy parser."""
    # The parser name is "spacy_" followed by the language code without suffixes
    spacy_parser = f'spacy_{wl_conversion.remove_lang_code_suffixes(main, lang)}'

    test_dependency_parsing.wl_test_dependency_parse_models(
        lang, spacy_parser,
        test_sentence, tokens,
        results
    )