# tests/tests_measures/test_measures_effect_size.py
# ----------------------------------------------------------------------
# Wordless: Tests - Measures - Effect size
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------
import numpy
from tests import wl_test_init
from wordless.wl_measures import wl_measures_effect_size
# Shared object handed to every measure function in this module as its first argument.
# NOTE(review): presumably a lightweight stand-in for the application's main object — confirm in tests/wl_test_init.
main = wl_test_init.Wl_Test_Main()
def assert_zeros(func, result = 0):
    """Assert that a measure maps all-zero inputs to a constant array.

    Calls ``func`` with the module-level ``main`` followed by four separate
    length-10 integer arrays of zeros, then checks that the returned array
    equals ten copies of ``result``.

    Args:
        func: Callable under test, invoked as ``func(main, a, b, c, d)``.
        result: Value expected at every position of the output (default 0).

    Raises:
        AssertionError: If the output differs from the expected array.
    """
    size = 10
    # One independent array per positional argument, never a shared object
    zero_inputs = [numpy.array([0] * size) for _ in range(4)]

    actual = func(main, *zero_inputs)
    expected = numpy.array([result] * size)

    numpy.testing.assert_array_equal(actual, expected)
# Reference: Gabrielatos, C., & Marchi, A. (2012, September 13–14). Keyness: Appropriate metrics and practical issues [Conference session]. CADS International Conference 2012, University of Bologna, Italy. (pp. 21-22)
def test_pct_diff():
    """Check ``pct_diff`` on a worked example and on zero-frequency edge cases."""
    # Worked example, rounded to 2 decimals before comparison
    actual = wl_measures_effect_size.pct_diff(
        main,
        numpy.array([20] * 2),
        numpy.array([1] * 2),
        numpy.array([29954 - 20] * 2),
        numpy.array([23691 - 1] * 2)
    )
    expected = numpy.array([1481.83] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 2), expected)

    # Edge cases: expected results are -inf, +inf and 0 respectively
    actual_edge = wl_measures_effect_size.pct_diff(
        main,
        numpy.array([0, 1, 0]),
        numpy.array([1, 0, 0]),
        numpy.array([0, 0, 0]),
        numpy.array([1, 1, 0])
    )
    expected_edge = numpy.array([-numpy.inf, numpy.inf, 0])

    numpy.testing.assert_array_equal(actual_edge, expected_edge)
def test_im3():
    """Check that ``im3`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.im3

    assert_zeros(measure)
# Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), pp. 1–38. (p. 13)
def test_dice_sorensen_coeff():
    """Check ``dice_sorensen_coeff`` on a worked example and on all-zero inputs.

    The worked example expects 0.08 after rounding to 2 decimals, which
    matches 2 * 130 / (3121 + 143).
    """
    actual = wl_measures_effect_size.dice_sorensen_coeff(
        main,
        numpy.array([130] * 2),
        numpy.array([3121 - 130] * 2),
        numpy.array([143 - 130] * 2),
        # Placeholder for the fourth cell: a frequency cannot be negative, so
        # a non-negative value replaces the former -1.
        # NOTE(review): presumably this cell does not enter the measure — confirm.
        numpy.array([1] * 2)
    )
    expected = numpy.array([0.08] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 2), expected)

    assert_zeros(wl_measures_effect_size.dice_sorensen_coeff)
# Reference: Hofland, K., & Johanson, S. (1982). Word frequencies in British and American English. Norwegian Computing Centre for the Humanities. (p. 471)
def test_diff_coeff():
    """Check ``diff_coeff`` on a worked example and on all-zero inputs."""
    # Worked example, rounded to 2 decimals before comparison
    actual = wl_measures_effect_size.diff_coeff(
        main,
        numpy.array([18] * 2),
        numpy.array([35] * 2),
        numpy.array([1000000 - 18] * 2),
        numpy.array([1000000 - 35] * 2)
    )
    expected = numpy.array([-0.32] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 2), expected)

    assert_zeros(wl_measures_effect_size.diff_coeff)
def test_jaccard_index():
    """Check that ``jaccard_index`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.jaccard_index

    assert_zeros(measure)
# Reference: Kilgarriff, A. (2009). Simple maths for keywords. In M. Mahlberg, V. González-Díaz, & C. Smith (Eds.), Proceedings of the Corpus Linguistics Conference 2009 (p. 171). University of Liverpool.
def test_kilgarriffs_ratio():
    """Check ``kilgarriffs_ratio`` on a worked example and on all-zero inputs.

    The worked example is rounded to 4 decimals before comparison. For
    all-zero inputs the measure is expected to return 1 everywhere.
    """
    actual = wl_measures_effect_size.kilgarriffs_ratio(
        main,
        numpy.array([35] * 2),
        numpy.array([263] * 2),
        # The last two arguments are written as "total - frequency", the same
        # way the other tests in this module build their inputs, instead of
        # passing the raw totals.
        numpy.array([112289776 - 35] * 2),
        numpy.array([1559716979 - 263] * 2)
    )
    expected = numpy.array([1.1224] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 4), expected)

    assert_zeros(wl_measures_effect_size.kilgarriffs_ratio, result = 1)
def test_log_dice():
    """Check that ``log_dice`` yields 14 everywhere for all-zero inputs."""
    measure = wl_measures_effect_size.log_dice

    assert_zeros(measure, result = 14)
def test_lfmd():
    """Check that ``lfmd`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.lfmd

    assert_zeros(measure)
# Reference: Hardie, A. (2014, April 28). Log Ratio: An informal introduction. ESRC Centre for Corpus Approaches to Social Science (CASS). http://cass.lancs.ac.uk/log-ratio-an-informal-introduction/.
def test_log_ratio():
    """Check ``log_ratio`` on identical inputs and on zero-frequency edge cases."""
    # Identical values on both sides are expected to give exactly 0
    actual = wl_measures_effect_size.log_ratio(
        main,
        numpy.array([1] * 2),
        numpy.array([1] * 2),
        numpy.array([1000000 - 1] * 2),
        numpy.array([1000000 - 1] * 2)
    )
    expected = numpy.array([0] * 2)

    numpy.testing.assert_array_equal(actual, expected)

    # Edge cases: expected results are -inf, +inf and 0 respectively
    actual_edge = wl_measures_effect_size.log_ratio(
        main,
        numpy.array([0, 1, 0]),
        numpy.array([1, 0, 0]),
        numpy.array([0, 0, 0]),
        numpy.array([1, 1, 0])
    )
    expected_edge = numpy.array([-numpy.inf, numpy.inf, 0])

    numpy.testing.assert_array_equal(actual_edge, expected_edge)
def test_mi_log_f():
    """Check that ``mi_log_f`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.mi_log_f

    assert_zeros(measure)
# Reference: Pedersen, T. (1998). Dependent bigram identification. In Proceedings of the Fifteenth National Conference on Artificial Intelligence (p. 1197). AAAI Press.
def test_min_sensitivity():
    """Check ``min_sensitivity`` on a worked example and on all-zero inputs."""
    # Worked example, rounded to 3 decimals before comparison
    actual = wl_measures_effect_size.min_sensitivity(
        main,
        numpy.array([17] * 2),
        numpy.array([240] * 2),
        numpy.array([1001] * 2),
        numpy.array([1298742] * 2)
    )
    expected = numpy.array([0.017] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 3), expected)

    assert_zeros(wl_measures_effect_size.min_sensitivity)
def test_md():
    """Check that ``md`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.md

    assert_zeros(measure)
def test_me():
    """Check that ``me`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.me

    assert_zeros(measure)
# Reference: Dunning, T. E. (1998). Finding structure in text, genome and other symbolic sequences [Doctoral dissertation, University of Sheffield]. arXiv. arxiv.org/pdf/1207.1847.pdf (p. 51)
def test_mi():
    """Check ``mi`` on a two-case worked example and on all-zero inputs."""
    # Float inputs; the two positions differ only in the fourth argument
    actual = wl_measures_effect_size.mi(
        main,
        numpy.array([2] * 2, dtype = float),
        numpy.array([0] * 2, dtype = float),
        numpy.array([0] * 2, dtype = float),
        numpy.array([7, 997], dtype = float)
    )
    expected = numpy.array([0.764, 0.021])

    numpy.testing.assert_array_equal(numpy.round(actual, 3), expected)

    assert_zeros(wl_measures_effect_size.mi)
# Reference: Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. Corpus Linguistics and Linguistic Theory, 15(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030 (p. 154)
def test_odds_ratio():
    """Check ``odds_ratio`` on a worked example and on zero-frequency edge cases."""
    # Worked example with float inputs, rounded to 1 decimal before comparison
    actual = wl_measures_effect_size.odds_ratio(
        main,
        numpy.array([16217] * 2, dtype = float),
        numpy.array([735] * 2, dtype = float),
        numpy.array([2796938 - 16217] * 2, dtype = float),
        numpy.array([2087946 - 735] * 2, dtype = float)
    )
    expected = numpy.array([16.6] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 1), expected)

    # Edge cases: expected results are -inf, +inf and 0 respectively
    actual_edge = wl_measures_effect_size.odds_ratio(
        main,
        numpy.array([0, 1, 0]),
        numpy.array([1, 0, 0]),
        numpy.array([0, 0, 0]),
        numpy.array([1, 1, 0])
    )
    expected_edge = numpy.array([-numpy.inf, numpy.inf, 0])

    numpy.testing.assert_array_equal(actual_edge, expected_edge)
# Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. (p. 24)
def test_pmi():
    """Check ``pmi`` on a worked example and on all-zero inputs."""
    # Worked example, rounded to 1 decimal before comparison
    actual = wl_measures_effect_size.pmi(
        main,
        numpy.array([8] * 2),
        numpy.array([1105 - 8] * 2),
        numpy.array([44 - 8] * 2),
        numpy.array([15000000 - 1105 - 44 + 8] * 2)
    )
    expected = numpy.array([11.3] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 1), expected)

    assert_zeros(wl_measures_effect_size.pmi)
def test_poisson_collocation_measure():
    """Check that ``poisson_collocation_measure`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.poisson_collocation_measure

    assert_zeros(measure)
def test_im2():
    """Check that ``im2`` yields all zeros for all-zero inputs."""
    measure = wl_measures_effect_size.im2

    assert_zeros(measure)
# Reference: Church, K. W., & Gale, W. A. (1991, September 29–October 1). Concordances for parallel text [Paper presentation]. Using Corpora: Seventh Annual Conference of the UW Centre for the New OED and Text Research, St. Catherine's College, Oxford, United Kingdom.
def test_squared_phi_coeff():
    """Check ``squared_phi_coeff`` on a worked example and on all-zero inputs."""
    # Worked example with float inputs, rounded to 2 decimals before comparison
    actual = wl_measures_effect_size.squared_phi_coeff(
        main,
        numpy.array([31950] * 2, dtype = float),
        numpy.array([12004] * 2, dtype = float),
        numpy.array([4793] * 2, dtype = float),
        numpy.array([848330] * 2, dtype = float)
    )
    expected = numpy.array([0.62] * 2)

    numpy.testing.assert_array_equal(numpy.round(actual, 2), expected)

    assert_zeros(wl_measures_effect_size.squared_phi_coeff)
if __name__ == '__main__':
    # Run every test in this module, in definition order, when executed as a script
    for run_test in (
        test_pct_diff,
        test_im3,
        test_dice_sorensen_coeff,
        test_diff_coeff,
        test_jaccard_index,
        test_kilgarriffs_ratio,
        test_log_dice,
        test_lfmd,
        test_log_ratio,
        test_mi_log_f,
        test_min_sensitivity,
        test_md,
        test_me,
        test_mi,
        test_odds_ratio,
        test_pmi,
        test_poisson_collocation_measure,
        test_im2,
        test_squared_phi_coeff
    ):
        run_test()