BLKSerene/Wordless
wordless/wl_measures/wl_measures_statistical_significance.py
# ----------------------------------------------------------------------
# Wordless: Measures - Statistical significance
# Copyright (C) 2018-2024  Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------

# pylint: disable=unused-argument

import numpy
from PyQt5.QtCore import QCoreApplication
import scipy.stats

from wordless.wl_measures import wl_measure_utils

_tr = QCoreApplication.translate

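# The helpers below operate element-wise on NumPy arrays of 2x2 contingency
# tables, one table per word type, assuming the usual layout for association
# measures:
#
#     [[o11, o12],
#      [o21, o22]]
#
# o1x and o2x are the row totals, ox1 and ox2 the column totals, and oxx the
# grand total. What the rows and columns stand for (e.g. node word vs. other
# words, inside vs. outside the collocational window) depends on the caller.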
def get_freqs_marginal(o11s, o12s, o21s, o22s):
    o1xs = o11s + o12s
    o2xs = o21s + o22s
    ox1s = o11s + o21s
    ox2s = o12s + o22s

    return o1xs, o2xs, ox1s, ox2s

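# Each expected frequency is (row total * column total) / grand total. For
# example, for the single table [[10, 20], [30, 940]]: o1x = 30, ox1 = 40,
# oxx = 1000, so e11 = 30 * 40 / 1000 = 1.2.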
def get_freqs_expected(o11s, o12s, o21s, o22s):
    o1xs, o2xs, ox1s, ox2s = get_freqs_marginal(o11s, o12s, o21s, o22s)
    oxxs = o1xs + o2xs

    e11s = wl_measure_utils.numpy_divide(o1xs * ox1s, oxxs)
    e12s = wl_measure_utils.numpy_divide(o1xs * ox2s, oxxs)
    e21s = wl_measure_utils.numpy_divide(o2xs * ox1s, oxxs)
    e22s = wl_measure_utils.numpy_divide(o2xs * ox2s, oxxs)

    return e11s, e12s, e21s, e22s

# Do not over-correct when the difference between the observed and expected values is smaller than 0.5
# Reference: https://github.com/scipy/scipy/issues/13875
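# For example, with o11 = 10 and e11 = 1.2, |e11 - o11| = 8.8 > 0.5, so o11 is
# shifted half a count towards the expected value (10 - 0.5 = 9.5); had the
# difference been 0.5 or smaller, o11 would simply be replaced by e11, so the
# correction never overshoots the expected value.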
def yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s):
    e_o_diffs_11 = e11s - o11s
    e_o_diffs_12 = e12s - o12s
    e_o_diffs_21 = e21s - o21s
    e_o_diffs_22 = e22s - o22s

    o11s = numpy.where(numpy.abs(e_o_diffs_11) > 0.5, o11s + 0.5 * numpy.sign(e_o_diffs_11), e11s)
    o12s = numpy.where(numpy.abs(e_o_diffs_12) > 0.5, o12s + 0.5 * numpy.sign(e_o_diffs_12), e12s)
    o21s = numpy.where(numpy.abs(e_o_diffs_21) > 0.5, o21s + 0.5 * numpy.sign(e_o_diffs_21), e21s)
    o22s = numpy.where(numpy.abs(e_o_diffs_22) > 0.5, o22s + 0.5 * numpy.sign(e_o_diffs_22), e22s)

    return o11s, o12s, o21s, o22s

def get_alt(direction):
    if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'):
        alt = 'two-sided'
    elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'):
        alt = 'less'
    elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'):
        alt = 'greater'

    return alt

# Fisher's Exact Test
# References: Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference (pp. 188–200). The South-Central Regional SAS Users' Group.
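# A minimal usage sketch with hypothetical frequencies (main is assumed to be
# the Wordless object that carries the user's settings in settings_custom,
# here selecting a two-tailed test):
#
#     _, p_vals = fishers_exact_test(
#         main,
#         numpy.array([10]), numpy.array([20]),
#         numpy.array([30]), numpy.array([940])
#     )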
def fishers_exact_test(main, o11s, o12s, o21s, o22s):
    settings = main.settings_custom['measures']['statistical_significance']['fishers_exact_test']

    p_vals = numpy.array([
        scipy.stats.fisher_exact(
            [[o11, o12], [o21, o22]],
            alternative = get_alt(settings['direction'])
        )[1]
        for o11, o12, o21, o22 in zip(o11s, o12s, o21s, o22s)
    ])

    return [None] * len(p_vals), p_vals

# Log-likelihood Ratio Test
# References: Dunning, T. E. (1993). Accurate methods for the statistics of surprise and coincidence. Computational Linguistics, 19(1), 61–74.
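# The test statistic computed below is G2 = 2 * sum(o * ln(o / e)) over the
# four cells of each table; p-values come from the chi-squared distribution
# with one degree of freedom.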
def log_likelihood_ratio_test(main, o11s, o12s, o21s, o22s):
    settings = main.settings_custom['measures']['statistical_significance']['log_likelihood_ratio_test']

    e11s, e12s, e21s, e22s = get_freqs_expected(o11s, o12s, o21s, o22s)

    if settings['apply_correction']:
        o11s, o12s, o21s, o22s = yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s)

    gs_11 = o11s * wl_measure_utils.numpy_log(wl_measure_utils.numpy_divide(o11s, e11s))
    gs_12 = o12s * wl_measure_utils.numpy_log(wl_measure_utils.numpy_divide(o12s, e12s))
    gs_21 = o21s * wl_measure_utils.numpy_log(wl_measure_utils.numpy_divide(o21s, e21s))
    gs_22 = o22s * wl_measure_utils.numpy_log(wl_measure_utils.numpy_divide(o22s, e22s))

    gs = 2 * (gs_11 + gs_12 + gs_21 + gs_22)
    p_vals = numpy.array([
        scipy.stats.distributions.chi2.sf(g, 1)
        for g in gs
    ])

    return gs, p_vals

# Mann-Whitney U Test
# References: Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics, 6(1), 232–263. https://doi.org/10.1075/ijcl.6.1.05kil
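# freqs_x1s and freqs_x2s are assumed to hold, for each word type, its
# frequencies in the individual samples (e.g. equal-sized chunks) of the two
# corpora being compared; the test is run once per type.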
def mann_whitney_u_test(main, freqs_x1s, freqs_x2s):
    settings = main.settings_custom['measures']['statistical_significance']['mann_whitney_u_test']

    num_types = len(freqs_x1s)
    u1s = numpy.empty(shape = num_types, dtype = numpy.float64)
    p_vals = numpy.empty(shape = num_types, dtype = numpy.float64)

    for i, (freqs_x1, freqs_x2) in enumerate(zip(freqs_x1s, freqs_x2s)):
        u1, p_val = scipy.stats.mannwhitneyu(
            freqs_x1, freqs_x2,
            use_continuity = settings['apply_correction'],
            alternative = get_alt(settings['direction'])
        )

        u1s[i] = u1
        p_vals[i] = p_val

    return u1s, p_vals

# Pearson's Chi-squared Test
# References:
#     Hofland, K., & Johansson, S. (1982). Word frequencies in British and American English. Norwegian Computing Centre for the Humanities.
#     Oakes, M. P. (1998). Statistics for corpus linguistics. Edinburgh University Press.
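# The test statistic computed below is chi2 = sum((o - e) ** 2 / e) over the
# four cells of each table; p-values come from the chi-squared distribution
# with one degree of freedom.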
def pearsons_chi_squared_test(main, o11s, o12s, o21s, o22s):
    settings = main.settings_custom['measures']['statistical_significance']['pearsons_chi_squared_test']

    e11s, e12s, e21s, e22s = get_freqs_expected(o11s, o12s, o21s, o22s)

    if settings['apply_correction']:
        o11s, o12s, o21s, o22s = yatess_correction(o11s, o12s, o21s, o22s, e11s, e12s, e21s, e22s)

    chi2s_11 = wl_measure_utils.numpy_divide((o11s - e11s) ** 2, e11s)
    chi2s_12 = wl_measure_utils.numpy_divide((o12s - e12s) ** 2, e12s)
    chi2s_21 = wl_measure_utils.numpy_divide((o21s - e21s) ** 2, e21s)
    chi2s_22 = wl_measure_utils.numpy_divide((o22s - e22s) ** 2, e22s)

    chi2s = chi2s_11 + chi2s_12 + chi2s_21 + chi2s_22
    p_vals = numpy.array([
        scipy.stats.distributions.chi2.sf(chi2, 1)
        for chi2 in chi2s
    ])

    return chi2s, p_vals

# Student's t-test (1-sample)
# References: Church, K., Gale, W., Hanks, P., & Hindle, D. (1991). Using statistics in lexical analysis. In U. Zernik (Ed.), Lexical acquisition: Exploiting on-line resources to build a lexicon (pp. 115–164). Psychology Press.
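# The test statistic computed below is t = (o11 - e11) / sqrt(o11 * (1 - o11 / N))
# with N = o11 + o12 + o21 + o22; p-values come from Student's t distribution
# with N - 1 degrees of freedom (p is set to 1 when N <= 1).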
def students_t_test_1_sample(main, o11s, o12s, o21s, o22s):
    settings = main.settings_custom['measures']['statistical_significance']['students_t_test_1_sample']

    oxxs = o11s + o12s + o21s + o22s
    e11s, _, _, _ = get_freqs_expected(o11s, o12s, o21s, o22s)

    t_stats = wl_measure_utils.numpy_divide(o11s - e11s, numpy.sqrt(o11s * (1 - wl_measure_utils.numpy_divide(o11s, oxxs))))
    p_vals = numpy.empty_like(t_stats)

    if settings['direction'] == _tr('wl_measures_statistical_significance', 'Two-tailed'):
        for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)):
            p_vals[i] = scipy.stats.distributions.t.sf(numpy.abs(t_stat), oxx - 1) * 2 if oxx > 1 else 1
    elif settings['direction'] == _tr('wl_measures_statistical_significance', 'Left-tailed'):
        for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)):
            p_vals[i] = scipy.stats.distributions.t.cdf(t_stat, oxx - 1) if oxx > 1 else 1
    elif settings['direction'] == _tr('wl_measures_statistical_significance', 'Right-tailed'):
        for i, (oxx, t_stat) in enumerate(zip(oxxs, t_stats)):
            p_vals[i] = scipy.stats.distributions.t.sf(t_stat, oxx - 1) if oxx > 1 else 1

    return t_stats, p_vals

# Student's t-test (2-sample)
# References: Paquot, M., & Bestgen, Y. (2009). Distinctive words in academic writing: A comparison of three statistical tests for keyword extraction. Language and Computers, 68, 247–269.
def students_t_test_2_sample(main, freqs_x1s, freqs_x2s):
    settings = main.settings_custom['measures']['statistical_significance']['students_t_test_2_sample']

    num_types = len(freqs_x1s)
    t_stats = numpy.empty(shape = num_types, dtype = numpy.float64)
    p_vals = numpy.empty(shape = num_types, dtype = numpy.float64)

    for i, (freqs_x1, freqs_x2) in enumerate(zip(freqs_x1s, freqs_x2s)):
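        # Presumably guards against a NaN result from scipy.stats.ttest_ind
        # when a type never occurs in any sample of either corpus (zero
        # variance in both groups)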
        if any(freqs_x1) or any(freqs_x2):
            t_stat, p_val = scipy.stats.ttest_ind(
                freqs_x1, freqs_x2,
                equal_var = True,
                alternative = get_alt(settings['direction'])
            )
        else:
            t_stat = 0
            p_val = 1

        t_stats[i] = t_stat
        p_vals[i] = p_val

    return t_stats, p_vals

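# Maps z-scores to p-values under the standard normal distribution according
# to the chosen tail.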
def _z_score_p_val(z_scores, direction):
    p_vals = numpy.empty_like(z_scores)

    if direction == _tr('wl_measures_statistical_significance', 'Two-tailed'):
        for i, z_score in enumerate(z_scores):
            p_vals[i] = scipy.stats.distributions.norm.sf(numpy.abs(z_score)) * 2
    elif direction == _tr('wl_measures_statistical_significance', 'Left-tailed'):
        for i, z_score in enumerate(z_scores):
            p_vals[i] = scipy.stats.distributions.norm.cdf(z_score)
    elif direction == _tr('wl_measures_statistical_significance', 'Right-tailed'):
        for i, z_score in enumerate(z_scores):
            p_vals[i] = scipy.stats.distributions.norm.sf(z_score)

    return p_vals

# z-score
# References: Dennis, S. F. (1964). The construction of a thesaurus automatically from a sample of text. In M. E. Stevens, V. E. Giuliano, & L. B. Heilprin (Eds.), Proceedings of the symposium on statistical association methods for mechanized documentation (pp. 61–148). National Bureau of Standards.
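# The test statistic computed below is z = (o11 - e11) / sqrt(e11 * (1 - e11 / N))
# with N = o11 + o12 + o21 + o22; p-values come from the standard normal
# distribution.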
def z_score(main, o11s, o12s, o21s, o22s):
    settings = main.settings_custom['measures']['statistical_significance']['z_score']

    oxxs = o11s + o12s + o21s + o22s
    e11s, _, _, _ = get_freqs_expected(o11s, o12s, o21s, o22s)

    z_scores = wl_measure_utils.numpy_divide(o11s - e11s, numpy.sqrt(e11s * (1 - wl_measure_utils.numpy_divide(e11s, oxxs))))
    p_vals = _z_score_p_val(z_scores, settings['direction'])

    return z_scores, p_vals

# z-score (Berry-Rogghe)
# References: Berry-Rogghe, G. L. M. (1973). The computation of collocations and their relevance in lexical studies. In A. J. Aiken, R. W. Bailey, & N. Hamilton-Smith (Eds.), The computer and literary studies (pp. 103–112). Edinburgh University Press.
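# The test statistic computed below is z = (o11 - E) / sqrt(E * (1 - p)), where
# N = o1x + o2x is the grand total, p = ox1 / (N - o1x) and E = p * o1x * span;
# p-values come from the standard normal distribution.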
def z_score_berry_rogghe(main, o11s, o12s, o21s, o22s, span):
    settings = main.settings_custom['measures']['statistical_significance']['z_score_berry_rogghe']

    o1xs, o2xs, ox1s, _ = get_freqs_marginal(o11s, o12s, o21s, o22s)

    zs = o1xs + o2xs
    ps = wl_measure_utils.numpy_divide(ox1s, zs - o1xs, default = 1)
    es = ps * o1xs * span

    z_scores = wl_measure_utils.numpy_divide(o11s - es, numpy.sqrt(es * (1 - ps)))
    p_vals = _z_score_p_val(z_scores, settings['direction'])

    return z_scores, p_vals