BLKSerene/Wordless
tests/tests_measures/test_measures_lexical_density_diversity.py
# ----------------------------------------------------------------------
# Wordless: Tests - Measures - Lexical density/diversity
# Copyright (C) 2018-2024  Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

import numpy
import scipy.stats

from tests import wl_test_init
from wordless.wl_measures import wl_measures_lexical_density_diversity

main = wl_test_init.Wl_Test_Main()
settings = main.settings_custom['measures']['lexical_density_diversity']

TOKENS_10 = ['This', 'is', 'a', 'sentence', '.'] * 2
TOKENS_100 = ['This', 'is', 'a', 'sentence', '.'] * 20
TOKENS_101 = ['This', 'is', 'a', 'sentence', '.'] * 20 + ['another']
TOKENS_1000 = ['This', 'is', 'a', 'sentence', '.'] * 200
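# Each of the fixtures above contains only the 5 types 'This', 'is', 'a', 'sentence' and '.';
# in TOKENS_100 every type occurs exactly 20 times, which the hand-computed expected values
# in the tests below rely on.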

# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 26). Mouton de Gruyter.
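# 225 tokens over 124 types: frequencies 11 (1 type), 9 (2), 7 (1), 6 (2), 5 (2), 4 (7), 3 (6), 2 (18), and 1 (85)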
TOKENS_225 = [1] * 11 + [2, 3] * 9 + [4] * 7 + [5, 6] * 6 + [7, 8] * 5 + list(range(9, 16)) * 4 + list(range(16, 22)) * 3 + list(range(22, 40)) * 2 + list(range(40, 125))

def get_test_text(tokens):
    return wl_test_init.Wl_Test_Text(main, [[[tokens]]])

text_tokens_10 = get_test_text(TOKENS_10)
text_tokens_100 = get_test_text(TOKENS_100)
text_tokens_101 = get_test_text(TOKENS_101)
text_tokens_1000 = get_test_text(TOKENS_1000)
text_tokens_225 = get_test_text(TOKENS_225)

def test_brunets_index():
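    # Brunet's index W = N ** (V ** -0.165); here N = 100 tokens and V = 5 types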
    w = wl_measures_lexical_density_diversity.brunets_index(main, text_tokens_100)

    assert w == numpy.power(100, numpy.power(5, -0.165))

def test_cttr():
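    # Carroll's corrected TTR: CTTR = V / sqrt(2 * N)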
    cttr = wl_measures_lexical_density_diversity.cttr(main, text_tokens_100)

    assert cttr == 5 / (2 * 100) ** 0.5

# Reference: Fisher, R. A., Corbet, A. S., & Williams, C. B. (1943). The relation between the number of species and the number of individuals in a random sample of an animal population. Journal of Animal Ecology, 12(1), 56. https://doi.org/10.2307/1411
def test_fishers_index_of_diversity():
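    # Fisher's alpha solves V = alpha * ln(1 + N / alpha) and thus depends only on the
    # numbers of types (240) and tokens (15609); the expected value is the alpha of
    # roughly 40.2 reported in the reference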
    tokens = [str(i) for i in range(240)] + ['0'] * (15609 - 240)
    alpha = wl_measures_lexical_density_diversity.fishers_index_of_diversity(main, get_test_text(tokens))

    assert round(alpha, 3) == 40.247

def test_herdans_vm():
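    # With 5 types each occurring 20 times, sum(f ** 2) / N ** 2 - 1 / V = 2000 / 10000 - 1 / 5 = 0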
    vm = wl_measures_lexical_density_diversity.herdans_vm(main, text_tokens_100)

    assert vm == (5 * 20 ** 2) / (100 ** 2) - 1 / 5

def test_hdd():
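    # HD-D sums each type's expected contribution to the TTR of a random 42-token sample:
    # P(type drawn at least once) * (1 / 42), identical for all 5 types here since each
    # occurs 20 times in the 100 tokens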
    hdd_100 = wl_measures_lexical_density_diversity.hdd(main, text_tokens_100)

    assert hdd_100 == (1 - scipy.stats.hypergeom.pmf(k = 0, M = 100, n = 20, N = 42)) * (1 / 42) * 5

def test_honores_stat():
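    # Honoré's statistic R = 100 * ln(N) / (1 - V1 / V), where V1 is the number of
    # hapax legomena; the fixture has none, so the denominator is 1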
    r = wl_measures_lexical_density_diversity.honores_stat(main, text_tokens_100)

    assert r == 100 * numpy.log(100) / (1 - 0 / 5)

def test_lexical_density():
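    # Lexical density = content tokens / all tokens; 20 of the 100 tokens in the fixture
    # count as content words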
    lexical_density = wl_measures_lexical_density_diversity.lexical_density(main, text_tokens_100)

    assert lexical_density == 20 / 100

def test_logttr():
    settings['logttr']['variant'] = 'Herdan'
    logttr_herdan = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100)
    settings['logttr']['variant'] = 'Somers'
    logttr_somers = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100)
    settings['logttr']['variant'] = 'Rubet'
    logttr_rubet = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100)
    settings['logttr']['variant'] = 'Maas'
    logttr_maas = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100)
    settings['logttr']['variant'] = 'Dugast'
    logttr_dugast = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100)

    num_types = 5
    num_tokens = 100

    assert logttr_herdan == numpy.log(num_types) / numpy.log(num_tokens)
    assert logttr_somers == numpy.log(numpy.log(num_types)) / numpy.log(numpy.log(num_tokens))
    assert logttr_rubet == numpy.log(num_types) / numpy.log(numpy.log(num_tokens))
    assert logttr_maas == (numpy.log(num_tokens) - numpy.log(num_types)) / (numpy.log(num_tokens) ** 2)
    assert logttr_dugast == (numpy.log(num_tokens) ** 2) / (numpy.log(num_tokens) - numpy.log(num_types))

def test_msttr():
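    # Mean segmental TTR over complete, non-overlapping segments: 101 tokens yield a
    # single complete 100-token segment (TTR = 5 / 100), and no complete segment at all
    # when the segment size is raised to 1000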
    msttr_100 = wl_measures_lexical_density_diversity.msttr(main, text_tokens_101)
    settings['msttr']['num_tokens_in_each_seg'] = 1000
    msttr_1000 = wl_measures_lexical_density_diversity.msttr(main, text_tokens_101)

    assert msttr_100 == 5 / 100
    assert msttr_1000 == 0

def test_mtld():
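    # The running TTR drops below the 0.72 threshold every 7 tokens (5 / 7), giving
    # 14 full factors over 98 tokens; the 2-token remainder has TTR 1, so it adds no
    # partial factor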
    mtld_100 = wl_measures_lexical_density_diversity.mtld(main, text_tokens_100)

    assert mtld_100 == 100 / (14 + 0 / 0.28)

def test_mattr():
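    # Moving-average TTR with a 500-token window: the 100-token text is shorter than
    # the window, so MATTR equals plain TTR, while every window in the 1000-token text
    # has TTR = 5 / 500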
    mattr_100 = wl_measures_lexical_density_diversity.mattr(main, text_tokens_100)
    mattr_1000 = wl_measures_lexical_density_diversity.mattr(main, text_tokens_1000)

    assert mattr_100 == wl_measures_lexical_density_diversity.ttr(main, text_tokens_100)
    assert mattr_1000 == 5 / 500

# Reference: Popescu, I.-I., Mačutek, J., & Altmann, G. (2008). Word frequency and arc length. Glottometrics, 17, 21, 33.
def test_popescu_macutek_altmanns_b1_b2_b3_b4_b5():
    b1, b2, b3, b4, b5 = wl_measures_lexical_density_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, text_tokens_225)

    assert round(b1, 3) == 0.969
    assert round(b2, 3) == 0.527
    assert round(b3, 3) == 0.961
    assert round(b4, 3) == 0.078
    assert round(b5, 3) == 0.664

# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 30). Mouton de Gruyter.
def test_popescus_r1():
    r1 = wl_measures_lexical_density_diversity.popescus_r1(main, text_tokens_225)

    assert round(r1, 4) == 0.8667

# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 39). Mouton de Gruyter.
def test_popescus_r2():
    r2 = wl_measures_lexical_density_diversity.popescus_r2(main, text_tokens_225)

    assert round(r2, 3) == 0.871

# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 51). Mouton de Gruyter.
def test_popescus_r3():
    r3 = wl_measures_lexical_density_diversity.popescus_r3(main, text_tokens_225)

    assert round(r3, 4) == 0.3778

# Reference: Popescu, I.-I. (2009). Word frequency studies (p. 59). Mouton de Gruyter.
def test_popescus_r4():
    r4 = wl_measures_lexical_density_diversity.popescus_r4(main, text_tokens_225)

    assert round(r4, 4) == 0.6344

# Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 170, 172). Mouton de Gruyter.
def test_repeat_rate():
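    # Repeat rate = sum of squared proportions: 777 / 225 ** 2 ≈ 0.0153 over the
    # rank-frequency distribution and 7648 / 124 ** 2 ≈ 0.4974 over the frequency spectrum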
    settings['repeat_rate']['use_data'] = 'Rank-frequency distribution'
    rr_distribution = wl_measures_lexical_density_diversity.repeat_rate(main, text_tokens_225)
    settings['repeat_rate']['use_data'] = 'Frequency spectrum'
    rr_spectrum = wl_measures_lexical_density_diversity.repeat_rate(main, text_tokens_225)

    assert round(rr_distribution, 4) == 0.0153
    assert round(rr_spectrum, 4) == 0.4974

def test_rttr():
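    # Guiraud's root TTR: RTTR = V / sqrt(N)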
    rttr = wl_measures_lexical_density_diversity.rttr(main, text_tokens_100)

    assert rttr == 5 / 100 ** 0.5

# Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 176, 178). Mouton de Gruyter.
def test_shannon_entropy():
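    # Shannon entropy (base 2) of the proportions in the rank-frequency distribution
    # or in the frequency spectrum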
    settings['shannon_entropy']['use_data'] = 'Rank-frequency distribution'
    h_distribution = wl_measures_lexical_density_diversity.shannon_entropy(main, text_tokens_225)
    settings['shannon_entropy']['use_data'] = 'Frequency spectrum'
    h_spectrum = wl_measures_lexical_density_diversity.shannon_entropy(main, text_tokens_225)

    assert round(h_distribution, 4) == 6.5270
    assert round(h_spectrum, 4) == 1.6234

def test_simpsons_l():
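    # Simpson's L = (sum(f ** 2) - N) / (N * (N - 1)), with sum(f ** 2) = 5 * 20 ** 2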
    l = wl_measures_lexical_density_diversity.simpsons_l(main, text_tokens_100)

    assert l == (5 * 20 ** 2 - 100) / (100 * (100 - 1))

def test_ttr():
    ttr = wl_measures_lexical_density_diversity.ttr(main, text_tokens_100)

    assert ttr == 5 / 100

def test_vocdd():
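    # vocd-D is estimated by fitting the TTR curve of repeated random samples, so only
    # positivity is checked rather than an exact value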
    vocdd_10 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_10)
    vocdd_100 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_100)
    vocdd_1000 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_1000)

    assert vocdd_10 > 0
    assert vocdd_100 > 0
    assert vocdd_1000 > 0

def test_yules_characteristic_k():
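    # Yule's characteristic K = 10000 * (sum(f ** 2) - N) / N ** 2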
    k = wl_measures_lexical_density_diversity.yules_characteristic_k(main, text_tokens_100)

    assert k == 10000 * ((5 * 20 ** 2 - 100) / (100 ** 2))

def test_yules_index_of_diversity():
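    # Yule's index of diversity I = N ** 2 / (sum(f ** 2) - N), i.e. 10000 / K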
    index_of_diversity = wl_measures_lexical_density_diversity.yules_index_of_diversity(main, text_tokens_100)

    assert index_of_diversity == (100 ** 2) / (5 * 20 ** 2 - 100)

if __name__ == '__main__':
    test_brunets_index()
    test_cttr()
    test_fishers_index_of_diversity()
    test_herdans_vm()
    test_hdd()
    test_honores_stat()
    test_lexical_density()
    test_logttr()
    test_msttr()
    test_mtld()
    test_mattr()
    test_popescu_macutek_altmanns_b1_b2_b3_b4_b5()
    test_popescus_r1()
    test_popescus_r2()
    test_popescus_r3()
    test_popescus_r4()
    test_repeat_rate()
    test_rttr()
    test_shannon_entropy()
    test_simpsons_l()
    test_ttr()
    test_vocdd()
    test_yules_characteristic_k()
    test_yules_index_of_diversity()